polars_core/datatypes/
dtype.rs

1use std::collections::BTreeMap;
2
3use arrow::datatypes::{
4    DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Metadata,
5};
6#[cfg(feature = "dtype-array")]
7use polars_utils::format_tuple;
8use polars_utils::itertools::Itertools;
9#[cfg(any(feature = "serde-lazy", feature = "serde"))]
10use serde::{Deserialize, Serialize};
11pub use temporal::time_zone::TimeZone;
12
13use super::*;
14#[cfg(feature = "object")]
15use crate::chunked_array::object::registry::get_object_physical_type;
16use crate::utils::materialize_dyn_int;
17
// Metadata value stored under `PL_KEY`; `MetaDataExt::maintain_type` checks for it
// and `DataType::to_arrow_field` writes it for `BinaryOffset`.
static MAINTAIN_PL_TYPE: &str = "maintain_type";
// Metadata key under which the `MAINTAIN_PL_TYPE` marker is stored.
static PL_KEY: &str = "pl";
20
21pub trait MetaDataExt: IntoMetadata {
22    fn pl_enum_metadata(&self) -> Option<&str> {
23        let md = self.into_metadata_ref();
24        let values = md
25            .get(DTYPE_ENUM_VALUES_NEW)
26            .or_else(|| md.get(DTYPE_ENUM_VALUES_LEGACY));
27        Some(values?.as_str())
28    }
29
30    fn pl_categorical_metadata(&self) -> Option<&str> {
31        // We ignore DTYPE_CATEGORICAL_LEGACY here, as we already map all
32        // string-typed arrow dictionaries to the global Categories, and the
33        // legacy metadata format only specifies the now-removed physical
34        // ordering parameter.
35        Some(
36            self.into_metadata_ref()
37                .get(DTYPE_CATEGORICAL_NEW)?
38                .as_str(),
39        )
40    }
41
42    fn maintain_type(&self) -> bool {
43        let metadata = self.into_metadata_ref();
44        metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
45    }
46}
47
48impl MetaDataExt for Metadata {}
/// Gives borrowed access to a [`Metadata`] map; the supertrait of [`MetaDataExt`].
pub trait IntoMetadata {
    /// Borrow `self` as a [`Metadata`] reference.
    // The `into_` prefix on a `&self` method trips clippy's naming lint, hence the allow.
    #[allow(clippy::wrong_self_convention)]
    fn into_metadata_ref(&self) -> &Metadata;
}

impl IntoMetadata for Metadata {
    /// A [`Metadata`] map is its own metadata reference.
    fn into_metadata_ref(&self) -> &Metadata {
        self
    }
}
59
/// The flavour of a not-yet-resolved data type (the payload of `DataType::Unknown`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnknownKind {
    /// NOTE(review): presumably the result of a ufunc; it never materializes
    /// to a concrete type (see `UnknownKind::materialize`).
    Ufunc,
    // Hold the value to determine the concrete size.
    /// An integer whose concrete integer type is derived from the held value.
    Int(i128),
    /// A float; materializes to `Float64`.
    Float,
    // Can be Categorical or String
    /// A string-like value; materializes to `String`.
    Str,
    /// Nothing is known about the type. This is the default kind.
    #[default]
    Any,
}
76
77impl UnknownKind {
78    pub fn materialize(&self) -> Option<DataType> {
79        let dtype = match self {
80            UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
81            UnknownKind::Float => DataType::Float64,
82            UnknownKind::Str => DataType::String,
83            UnknownKind::Any | UnknownKind::Ufunc => return None,
84        };
85        Some(dtype)
86    }
87}
88
/// The logical data types of a column/series.
///
/// `PartialEq` and `Hash` are implemented by hand for this type rather than derived.
#[derive(Clone, Debug)]
pub enum DataType {
    Boolean,
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Float32,
    Float64,
    /// Fixed point decimal type with optional precision and non-negative scale.
    /// This is backed by a signed 128-bit integer which allows for up to 38 significant digits,
    /// meaning the maximum precision is 38.
    #[cfg(feature = "dtype-decimal")]
    Decimal(Option<usize>, Option<usize>), // precision/scale; scale being None means "infer"
    /// String data
    String,
    /// Binary data
    Binary,
    /// Binary data that maps to Arrow `LargeBinary` (see `try_to_arrow`).
    BinaryOffset,
    /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days (32 bits).
    Date,
    /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in the given timeunit (64 bits).
    Datetime(TimeUnit, Option<TimeZone>),
    /// 64-bit integer representing the difference between times in the given time unit
    Duration(TimeUnit),
    /// A 64-bit time representing the elapsed time since midnight in nanoseconds
    Time,
    /// A nested list with a fixed size in each row
    #[cfg(feature = "dtype-array")]
    Array(Box<DataType>, usize),
    /// A nested list with a variable size in each row
    List(Box<DataType>),
    /// A generic type that can be used in a `Series`
    /// &'static str can be used to determine/set inner type
    #[cfg(feature = "object")]
    Object(&'static str),
    /// The null type.
    Null,
    /// Categorical data: the shared `Categories` plus its `CategoricalMapping`.
    #[cfg(feature = "dtype-categorical")]
    Categorical(Arc<Categories>, Arc<CategoricalMapping>),
    /// Enum data: the shared `FrozenCategories` plus its `CategoricalMapping`.
    // The payload has the same shape as `Categorical`, so that matching
    // Enum/Categoricals can take the same guards.
    #[cfg(feature = "dtype-categorical")]
    Enum(Arc<FrozenCategories>, Arc<CategoricalMapping>),
    /// A struct made of named, typed fields.
    #[cfg(feature = "dtype-struct")]
    Struct(Vec<Field>),
    // some logical types we cannot know statically, e.g. Datetime
    /// A type that is not (yet) known; see [`UnknownKind`].
    Unknown(UnknownKind),
}
142
143impl Default for DataType {
144    fn default() -> Self {
145        DataType::Unknown(UnknownKind::Any)
146    }
147}
148
/// Types that can hand out a reference to a [`DataType`].
pub trait AsRefDataType {
    /// Borrow the [`DataType`] associated with `self`.
    fn as_ref_dtype(&self) -> &DataType;
}
152
153impl Hash for DataType {
154    fn hash<H: Hasher>(&self, state: &mut H) {
155        std::mem::discriminant(self).hash(state)
156    }
157}
158
159impl PartialEq for DataType {
160    fn eq(&self, other: &Self) -> bool {
161        use DataType::*;
162        {
163            match (self, other) {
164                #[cfg(feature = "dtype-categorical")]
165                (Categorical(cats_l, _), Categorical(cats_r, _)) => Arc::ptr_eq(cats_l, cats_r),
166                #[cfg(feature = "dtype-categorical")]
167                (Enum(fcats_l, _), Enum(fcats_r, _)) => Arc::ptr_eq(fcats_l, fcats_r),
168                (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
169                (List(left_inner), List(right_inner)) => left_inner == right_inner,
170                #[cfg(feature = "dtype-duration")]
171                (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
172                #[cfg(feature = "dtype-decimal")]
173                (Decimal(l_prec, l_scale), Decimal(r_prec, r_scale)) => {
174                    let is_prec_eq = l_prec.is_none() || r_prec.is_none() || l_prec == r_prec;
175                    let is_scale_eq = l_scale.is_none() || r_scale.is_none() || l_scale == r_scale;
176
177                    is_prec_eq && is_scale_eq
178                },
179                #[cfg(feature = "object")]
180                (Object(lhs), Object(rhs)) => lhs == rhs,
181                #[cfg(feature = "dtype-struct")]
182                (Struct(lhs), Struct(rhs)) => {
183                    std::ptr::eq(Vec::as_ptr(lhs), Vec::as_ptr(rhs)) || lhs == rhs
184                },
185                #[cfg(feature = "dtype-array")]
186                (Array(left_inner, left_width), Array(right_inner, right_width)) => {
187                    left_width == right_width && left_inner == right_inner
188                },
189                (Unknown(l), Unknown(r)) => match (l, r) {
190                    (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
191                    _ => l == r,
192                },
193                _ => std::mem::discriminant(self) == std::mem::discriminant(other),
194            }
195        }
196    }
197}
198
199impl Eq for DataType {}
200
201impl DataType {
202    pub const IDX_DTYPE: Self = {
203        #[cfg(not(feature = "bigidx"))]
204        {
205            DataType::UInt32
206        }
207        #[cfg(feature = "bigidx")]
208        {
209            DataType::UInt64
210        }
211    };
212
213    pub fn value_within_range(&self, other: AnyValue) -> bool {
214        use DataType::*;
215        match self {
216            UInt8 => other.extract::<u8>().is_some(),
217            #[cfg(feature = "dtype-u16")]
218            UInt16 => other.extract::<u16>().is_some(),
219            UInt32 => other.extract::<u32>().is_some(),
220            UInt64 => other.extract::<u64>().is_some(),
221            #[cfg(feature = "dtype-i8")]
222            Int8 => other.extract::<i8>().is_some(),
223            #[cfg(feature = "dtype-i16")]
224            Int16 => other.extract::<i16>().is_some(),
225            Int32 => other.extract::<i32>().is_some(),
226            Int64 => other.extract::<i64>().is_some(),
227            _ => false,
228        }
229    }
230
231    /// Check if the whole dtype is known.
232    pub fn is_known(&self) -> bool {
233        match self {
234            DataType::List(inner) => inner.is_known(),
235            #[cfg(feature = "dtype-array")]
236            DataType::Array(inner, _) => inner.is_known(),
237            #[cfg(feature = "dtype-struct")]
238            DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
239            DataType::Unknown(_) => false,
240            _ => true,
241        }
242    }
243
244    /// Materialize this datatype if it is unknown. All other datatypes
245    /// are left unchanged.
246    pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
247        match self {
248            DataType::Unknown(u) => match u.materialize() {
249                Some(known) => Ok(known),
250                None => {
251                    if allow_unknown {
252                        Ok(DataType::Unknown(u))
253                    } else {
254                        polars_bail!(SchemaMismatch: "failed to materialize unknown type")
255                    }
256                },
257            },
258            DataType::List(inner) => Ok(DataType::List(Box::new(
259                inner.materialize_unknown(allow_unknown)?,
260            ))),
261            #[cfg(feature = "dtype-array")]
262            DataType::Array(inner, size) => Ok(DataType::Array(
263                Box::new(inner.materialize_unknown(allow_unknown)?),
264                size,
265            )),
266            #[cfg(feature = "dtype-struct")]
267            DataType::Struct(fields) => Ok(DataType::Struct(
268                fields
269                    .into_iter()
270                    .map(|f| {
271                        PolarsResult::Ok(Field::new(
272                            f.name,
273                            f.dtype.materialize_unknown(allow_unknown)?,
274                        ))
275                    })
276                    .try_collect_vec()?,
277            )),
278            _ => Ok(self),
279        }
280    }
281
282    #[cfg(feature = "dtype-array")]
283    /// Get the full shape of a multidimensional array.
284    pub fn get_shape(&self) -> Option<Vec<usize>> {
285        fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
286            if let DataType::Array(inner, size) = dt {
287                shape.push(*size);
288                get_shape_impl(inner, shape);
289            }
290        }
291
292        if let DataType::Array(inner, size) = self {
293            let mut shape = vec![*size];
294            get_shape_impl(inner, &mut shape);
295            Some(shape)
296        } else {
297            None
298        }
299    }
300
301    /// Get the inner data type of a nested type.
302    pub fn inner_dtype(&self) -> Option<&DataType> {
303        match self {
304            DataType::List(inner) => Some(inner),
305            #[cfg(feature = "dtype-array")]
306            DataType::Array(inner, _) => Some(inner),
307            _ => None,
308        }
309    }
310
311    /// Get the absolute inner data type of a nested type.
312    pub fn leaf_dtype(&self) -> &DataType {
313        let mut prev = self;
314        while let Some(dtype) = prev.inner_dtype() {
315            prev = dtype
316        }
317        prev
318    }
319
320    #[cfg(feature = "dtype-array")]
321    /// Get the inner data type of a multidimensional array.
322    pub fn array_leaf_dtype(&self) -> Option<&DataType> {
323        let mut prev = self;
324        match prev {
325            DataType::Array(_, _) => {
326                while let DataType::Array(inner, _) = &prev {
327                    prev = inner;
328                }
329                Some(prev)
330            },
331            _ => None,
332        }
333    }
334
335    /// Cast the leaf types of Lists/Arrays and keep the nesting.
336    pub fn cast_leaf(&self, to: DataType) -> DataType {
337        use DataType::*;
338        match self {
339            List(inner) => List(Box::new(inner.cast_leaf(to))),
340            #[cfg(feature = "dtype-array")]
341            Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
342            _ => to,
343        }
344    }
345
346    /// Return whether the cast to `to` makes sense.
347    ///
348    /// If it `None`, we are not sure.
349    pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
350        if self == to {
351            return Some(true);
352        }
353        if self.is_primitive_numeric() && to.is_primitive_numeric() {
354            return Some(true);
355        }
356
357        if self.is_null() {
358            return Some(true);
359        }
360
361        use DataType as D;
362        Some(match (self, to) {
363            #[cfg(feature = "dtype-categorical")]
364            (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
365            | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false, // TODO @ cat-rework: why can we not cast to Binary?
366
367            #[cfg(feature = "object")]
368            (D::Object(_), D::Object(_)) => true,
369            #[cfg(feature = "object")]
370            (D::Object(_), _) | (_, D::Object(_)) => false,
371
372            (D::Boolean, dt) | (dt, D::Boolean) => match dt {
373                dt if dt.is_primitive_numeric() => true,
374                #[cfg(feature = "dtype-decimal")]
375                D::Decimal(_, _) => true,
376                D::String | D::Binary => true,
377                _ => false,
378            },
379
380            (D::List(from), D::List(to)) => from.can_cast_to(to)?,
381            #[cfg(feature = "dtype-array")]
382            (D::Array(from, l_width), D::Array(to, r_width)) => {
383                l_width == r_width && from.can_cast_to(to)?
384            },
385            #[cfg(feature = "dtype-struct")]
386            (D::Struct(l_fields), D::Struct(r_fields)) => {
387                if l_fields.is_empty() {
388                    return Some(true);
389                }
390
391                if l_fields.len() != r_fields.len() {
392                    return Some(false);
393                }
394
395                for (l, r) in l_fields.iter().zip(r_fields) {
396                    if !l.dtype().can_cast_to(r.dtype())? {
397                        return Some(false);
398                    }
399                }
400
401                true
402            },
403
404            // @NOTE: we are being conversative
405            _ => return None,
406        })
407    }
408
409    pub fn implode(self) -> DataType {
410        DataType::List(Box::new(self))
411    }
412
413    /// Convert to the physical data type
414    #[must_use]
415    pub fn to_physical(&self) -> DataType {
416        use DataType::*;
417        match self {
418            Date => Int32,
419            Datetime(_, _) => Int64,
420            Duration(_) => Int64,
421            Time => Int64,
422            #[cfg(feature = "dtype-decimal")]
423            Decimal(_, _) => Int128,
424            #[cfg(feature = "dtype-categorical")]
425            Categorical(cats, _) => cats.physical().dtype(),
426            #[cfg(feature = "dtype-categorical")]
427            Enum(fcats, _) => fcats.physical().dtype(),
428            #[cfg(feature = "dtype-array")]
429            Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
430            List(dt) => List(Box::new(dt.to_physical())),
431            #[cfg(feature = "dtype-struct")]
432            Struct(fields) => {
433                let new_fields = fields
434                    .iter()
435                    .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
436                    .collect();
437                Struct(new_fields)
438            },
439            _ => self.clone(),
440        }
441    }
442
443    pub fn is_supported_list_arithmetic_input(&self) -> bool {
444        self.is_primitive_numeric() || self.is_bool() || self.is_null()
445    }
446
447    /// Check if this [`DataType`] is a logical type
448    pub fn is_logical(&self) -> bool {
449        self != &self.to_physical()
450    }
451
452    /// Check if this [`DataType`] is a temporal type
453    pub fn is_temporal(&self) -> bool {
454        use DataType::*;
455        matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
456    }
457
458    /// Check if datatype is a primitive type. By that we mean that
459    /// it is not a nested or logical type.
460    pub fn is_primitive(&self) -> bool {
461        self.is_primitive_numeric()
462            | matches!(
463                self,
464                DataType::Boolean | DataType::String | DataType::Binary
465            )
466    }
467
468    /// Check if this [`DataType`] is a primitive numeric type (excludes Decimal).
469    pub fn is_primitive_numeric(&self) -> bool {
470        self.is_float() || self.is_integer()
471    }
472
473    /// Check if this [`DataType`] is a boolean.
474    pub fn is_bool(&self) -> bool {
475        matches!(self, DataType::Boolean)
476    }
477
478    /// Check if this [`DataType`] is a list.
479    pub fn is_list(&self) -> bool {
480        matches!(self, DataType::List(_))
481    }
482
483    /// Check if this [`DataType`] is an array.
484    pub fn is_array(&self) -> bool {
485        #[cfg(feature = "dtype-array")]
486        {
487            matches!(self, DataType::Array(_, _))
488        }
489        #[cfg(not(feature = "dtype-array"))]
490        {
491            false
492        }
493    }
494
495    pub fn is_nested(&self) -> bool {
496        self.is_list() || self.is_struct() || self.is_array()
497    }
498
499    /// Check if this [`DataType`] is a struct
500    pub fn is_struct(&self) -> bool {
501        #[cfg(feature = "dtype-struct")]
502        {
503            matches!(self, DataType::Struct(_))
504        }
505        #[cfg(not(feature = "dtype-struct"))]
506        {
507            false
508        }
509    }
510
511    pub fn is_binary(&self) -> bool {
512        matches!(self, DataType::Binary)
513    }
514
515    pub fn is_date(&self) -> bool {
516        matches!(self, DataType::Date)
517    }
518    pub fn is_datetime(&self) -> bool {
519        matches!(self, DataType::Datetime(..))
520    }
521
522    pub fn is_duration(&self) -> bool {
523        matches!(self, DataType::Duration(..))
524    }
525
526    pub fn is_object(&self) -> bool {
527        #[cfg(feature = "object")]
528        {
529            matches!(self, DataType::Object(_))
530        }
531        #[cfg(not(feature = "object"))]
532        {
533            false
534        }
535    }
536
537    pub fn is_null(&self) -> bool {
538        matches!(self, DataType::Null)
539    }
540
541    pub fn contains_views(&self) -> bool {
542        use DataType::*;
543        match self {
544            Binary | String => true,
545            List(inner) => inner.contains_views(),
546            #[cfg(feature = "dtype-array")]
547            Array(inner, _) => inner.contains_views(),
548            #[cfg(feature = "dtype-struct")]
549            Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
550            _ => false,
551        }
552    }
553
554    pub fn contains_categoricals(&self) -> bool {
555        use DataType::*;
556        match self {
557            #[cfg(feature = "dtype-categorical")]
558            Categorical(_, _) | Enum(_, _) => true,
559            List(inner) => inner.contains_categoricals(),
560            #[cfg(feature = "dtype-array")]
561            Array(inner, _) => inner.contains_categoricals(),
562            #[cfg(feature = "dtype-struct")]
563            Struct(fields) => fields
564                .iter()
565                .any(|field| field.dtype.contains_categoricals()),
566            _ => false,
567        }
568    }
569
570    pub fn contains_objects(&self) -> bool {
571        use DataType::*;
572        match self {
573            #[cfg(feature = "object")]
574            Object(_) => true,
575            List(inner) => inner.contains_objects(),
576            #[cfg(feature = "dtype-array")]
577            Array(inner, _) => inner.contains_objects(),
578            #[cfg(feature = "dtype-struct")]
579            Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
580            _ => false,
581        }
582    }
583
584    pub fn contains_list_recursive(&self) -> bool {
585        use DataType as D;
586        match self {
587            D::List(_) => true,
588            #[cfg(feature = "dtype-array")]
589            D::Array(inner, _) => inner.contains_list_recursive(),
590            #[cfg(feature = "dtype-struct")]
591            D::Struct(fields) => fields
592                .iter()
593                .any(|field| field.dtype.contains_list_recursive()),
594            _ => false,
595        }
596    }
597
598    pub fn contains_unknown(&self) -> bool {
599        use DataType as D;
600        match self {
601            D::Unknown(_) => true,
602            D::List(inner) => inner.contains_unknown(),
603            #[cfg(feature = "dtype-array")]
604            D::Array(inner, _) => inner.contains_unknown(),
605            #[cfg(feature = "dtype-struct")]
606            D::Struct(fields) => fields.iter().any(|field| field.dtype.contains_unknown()),
607            _ => false,
608        }
609    }
610
611    /// Check if type is sortable
612    pub fn is_ord(&self) -> bool {
613        let phys = self.to_physical();
614        phys.is_primitive_numeric()
615            || self.is_decimal()
616            || matches!(
617                phys,
618                DataType::Binary | DataType::String | DataType::Boolean
619            )
620    }
621
622    /// Check if this [`DataType`] is a Decimal type (of any scale/precision).
623    pub fn is_decimal(&self) -> bool {
624        match self {
625            #[cfg(feature = "dtype-decimal")]
626            DataType::Decimal(_, _) => true,
627            _ => false,
628        }
629    }
630
631    /// Check if this [`DataType`] is a basic floating point type (excludes Decimal).
632    /// Note, this also includes `Unknown(UnknownKind::Float)`.
633    pub fn is_float(&self) -> bool {
634        matches!(
635            self,
636            DataType::Float32 | DataType::Float64 | DataType::Unknown(UnknownKind::Float)
637        )
638    }
639
640    /// Check if this [`DataType`] is an integer. Note, this also includes `Unknown(UnknownKind::Int)`.
641    pub fn is_integer(&self) -> bool {
642        matches!(
643            self,
644            DataType::Int8
645                | DataType::Int16
646                | DataType::Int32
647                | DataType::Int64
648                | DataType::Int128
649                | DataType::UInt8
650                | DataType::UInt16
651                | DataType::UInt32
652                | DataType::UInt64
653                | DataType::Unknown(UnknownKind::Int(_))
654        )
655    }
656
657    pub fn is_signed_integer(&self) -> bool {
658        // allow because it cannot be replaced when object feature is activated
659        matches!(
660            self,
661            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
662        )
663    }
664
665    pub fn is_unsigned_integer(&self) -> bool {
666        matches!(
667            self,
668            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64,
669        )
670    }
671
672    pub fn is_string(&self) -> bool {
673        matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
674    }
675
676    pub fn is_categorical(&self) -> bool {
677        #[cfg(feature = "dtype-categorical")]
678        {
679            matches!(self, DataType::Categorical(_, _))
680        }
681        #[cfg(not(feature = "dtype-categorical"))]
682        {
683            false
684        }
685    }
686
687    pub fn is_enum(&self) -> bool {
688        #[cfg(feature = "dtype-categorical")]
689        {
690            matches!(self, DataType::Enum(_, _))
691        }
692        #[cfg(not(feature = "dtype-categorical"))]
693        {
694            false
695        }
696    }
697
698    /// Convert to an Arrow Field.
699    pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
700        let metadata = match self {
701            #[cfg(feature = "dtype-categorical")]
702            DataType::Enum(fcats, _map) => {
703                let cats = fcats.categories();
704                let strings_size: usize = cats
705                    .values_iter()
706                    .map(|s| (s.len() + 1).ilog10() as usize + 1 + s.len())
707                    .sum();
708                let mut encoded = String::with_capacity(strings_size);
709                for cat in cats.values_iter() {
710                    encoded.push_str(itoa::Buffer::new().format(cat.len()));
711                    encoded.push(';');
712                    encoded.push_str(cat);
713                }
714                Some(BTreeMap::from([(
715                    PlSmallStr::from_static(DTYPE_ENUM_VALUES_NEW),
716                    PlSmallStr::from_string(encoded),
717                )]))
718            },
719            #[cfg(feature = "dtype-categorical")]
720            DataType::Categorical(cats, _) => {
721                let mut encoded = String::new();
722                encoded.push_str(itoa::Buffer::new().format(cats.name().len()));
723                encoded.push(';');
724                encoded.push_str(cats.name());
725                encoded.push_str(itoa::Buffer::new().format(cats.namespace().len()));
726                encoded.push(';');
727                encoded.push_str(cats.namespace());
728                encoded.push_str(cats.physical().as_str());
729                encoded.push(';');
730
731                Some(BTreeMap::from([(
732                    PlSmallStr::from_static(DTYPE_CATEGORICAL_NEW),
733                    PlSmallStr::from_string(encoded),
734                )]))
735            },
736            DataType::BinaryOffset => Some(BTreeMap::from([(
737                PlSmallStr::from_static(PL_KEY),
738                PlSmallStr::from_static(MAINTAIN_PL_TYPE),
739            )])),
740            _ => None,
741        };
742
743        let field = ArrowField::new(name, self.to_arrow(compat_level), true);
744
745        if let Some(metadata) = metadata {
746            field.with_metadata(metadata)
747        } else {
748            field
749        }
750    }
751
752    /// Try to get the maximum value for this datatype.
753    pub fn max(&self) -> PolarsResult<Scalar> {
754        use DataType::*;
755        let v = match self {
756            Int8 => Scalar::from(i8::MAX),
757            Int16 => Scalar::from(i16::MAX),
758            Int32 => Scalar::from(i32::MAX),
759            Int64 => Scalar::from(i64::MAX),
760            Int128 => Scalar::from(i128::MAX),
761            UInt8 => Scalar::from(u8::MAX),
762            UInt16 => Scalar::from(u16::MAX),
763            UInt32 => Scalar::from(u32::MAX),
764            UInt64 => Scalar::from(u64::MAX),
765            Float32 => Scalar::from(f32::INFINITY),
766            Float64 => Scalar::from(f64::INFINITY),
767            #[cfg(feature = "dtype-time")]
768            Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
769            dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
770        };
771        Ok(v)
772    }
773
774    /// Try to get the minimum value for this datatype.
775    pub fn min(&self) -> PolarsResult<Scalar> {
776        use DataType::*;
777        let v = match self {
778            Int8 => Scalar::from(i8::MIN),
779            Int16 => Scalar::from(i16::MIN),
780            Int32 => Scalar::from(i32::MIN),
781            Int64 => Scalar::from(i64::MIN),
782            Int128 => Scalar::from(i128::MIN),
783            UInt8 => Scalar::from(u8::MIN),
784            UInt16 => Scalar::from(u16::MIN),
785            UInt32 => Scalar::from(u32::MIN),
786            UInt64 => Scalar::from(u64::MIN),
787            Float32 => Scalar::from(f32::NEG_INFINITY),
788            Float64 => Scalar::from(f64::NEG_INFINITY),
789            #[cfg(feature = "dtype-time")]
790            Time => Scalar::new(Time, AnyValue::Time(0)),
791            dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
792        };
793        Ok(v)
794    }
795
796    /// Convert to an Arrow data type.
797    #[inline]
798    pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
799        self.try_to_arrow(compat_level).unwrap()
800    }
801
802    #[inline]
803    pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
804        use DataType::*;
805        match self {
806            Boolean => Ok(ArrowDataType::Boolean),
807            UInt8 => Ok(ArrowDataType::UInt8),
808            UInt16 => Ok(ArrowDataType::UInt16),
809            UInt32 => Ok(ArrowDataType::UInt32),
810            UInt64 => Ok(ArrowDataType::UInt64),
811            Int8 => Ok(ArrowDataType::Int8),
812            Int16 => Ok(ArrowDataType::Int16),
813            Int32 => Ok(ArrowDataType::Int32),
814            Int64 => Ok(ArrowDataType::Int64),
815            Int128 => Ok(ArrowDataType::Int128),
816            Float32 => Ok(ArrowDataType::Float32),
817            Float64 => Ok(ArrowDataType::Float64),
818            #[cfg(feature = "dtype-decimal")]
819            Decimal(precision, scale) => {
820                let precision = (*precision).unwrap_or(38);
821                polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1");
822
823                Ok(ArrowDataType::Decimal(
824                    precision,
825                    scale.unwrap_or(0), // and what else can we do here?
826                ))
827            },
828            String => {
829                let dt = if compat_level.0 >= 1 {
830                    ArrowDataType::Utf8View
831                } else {
832                    ArrowDataType::LargeUtf8
833                };
834                Ok(dt)
835            },
836            Binary => {
837                let dt = if compat_level.0 >= 1 {
838                    ArrowDataType::BinaryView
839                } else {
840                    ArrowDataType::LargeBinary
841                };
842                Ok(dt)
843            },
844            Date => Ok(ArrowDataType::Date32),
845            Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(
846                unit.to_arrow(),
847                tz.as_deref().cloned(),
848            )),
849            Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
850            Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
851            #[cfg(feature = "dtype-array")]
852            Array(dt, size) => Ok(dt
853                .try_to_arrow(compat_level)?
854                .to_fixed_size_list(*size, true)),
855            List(dt) => Ok(ArrowDataType::LargeList(Box::new(
856                dt.to_arrow_field(LIST_VALUES_NAME, compat_level),
857            ))),
858            Null => Ok(ArrowDataType::Null),
859            #[cfg(feature = "object")]
860            Object(_) => Ok(get_object_physical_type()),
861            #[cfg(feature = "dtype-categorical")]
862            Categorical(_, _) | Enum(_, _) => {
863                let arrow_phys = match self.cat_physical().unwrap() {
864                    CategoricalPhysical::U8 => IntegerType::UInt8,
865                    CategoricalPhysical::U16 => IntegerType::UInt16,
866                    CategoricalPhysical::U32 => IntegerType::UInt32,
867                };
868
869                let values = if compat_level.0 >= 1 {
870                    ArrowDataType::Utf8View
871                } else {
872                    ArrowDataType::LargeUtf8
873                };
874
875                Ok(ArrowDataType::Dictionary(
876                    arrow_phys,
877                    Box::new(values),
878                    false,
879                ))
880            },
881            #[cfg(feature = "dtype-struct")]
882            Struct(fields) => {
883                let fields = fields
884                    .iter()
885                    .map(|fld| fld.to_arrow(compat_level))
886                    .collect();
887                Ok(ArrowDataType::Struct(fields))
888            },
889            BinaryOffset => Ok(ArrowDataType::LargeBinary),
890            Unknown(kind) => {
891                let dt = match kind {
892                    UnknownKind::Any | UnknownKind::Ufunc => ArrowDataType::Unknown,
893                    UnknownKind::Float => ArrowDataType::Float64,
894                    UnknownKind::Str => ArrowDataType::Utf8View,
895                    UnknownKind::Int(v) => {
896                        return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
897                    },
898                };
899                Ok(dt)
900            },
901        }
902    }
903
904    pub fn is_nested_null(&self) -> bool {
905        use DataType::*;
906        match self {
907            Null => true,
908            List(field) => field.is_nested_null(),
909            #[cfg(feature = "dtype-array")]
910            Array(field, _) => field.is_nested_null(),
911            #[cfg(feature = "dtype-struct")]
912            Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
913            _ => false,
914        }
915    }
916
917    /// Answers if this type matches the given type of a schema.
918    ///
919    /// Allows (nested) Null types in this type to match any type in the schema,
920    /// but not vice versa. In such a case Ok(true) is returned, because a cast
921    /// is necessary. If no cast is necessary Ok(false) is returned, and an
922    /// error is returned if the types are incompatible.
923    pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
924        match (self, schema_type) {
925            (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
926            #[cfg(feature = "dtype-array")]
927            (DataType::Array(l, sl), DataType::Array(r, sr)) => {
928                Ok(l.matches_schema_type(r)? && sl == sr)
929            },
930            #[cfg(feature = "dtype-struct")]
931            (DataType::Struct(l), DataType::Struct(r)) => {
932                if l.len() != r.len() {
933                    polars_bail!(SchemaMismatch: "structs have different number of fields: {} vs {}", l.len(), r.len());
934                }
935                let mut must_cast = false;
936                for (l, r) in l.iter().zip(r.iter()) {
937                    must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
938                }
939                Ok(must_cast)
940            },
941            (DataType::Null, DataType::Null) => Ok(false),
942            #[cfg(feature = "dtype-decimal")]
943            (DataType::Decimal(_, s1), DataType::Decimal(_, s2)) => Ok(s1 != s2),
944            // We don't allow the other way around, only if our current type is
945            // null and the schema isn't we allow it.
946            (DataType::Null, _) => Ok(true),
947            #[cfg(feature = "dtype-categorical")]
948            (DataType::Categorical(l, _), DataType::Categorical(r, _)) => {
949                ensure_same_categories(l, r)?;
950                Ok(false)
951            },
952            #[cfg(feature = "dtype-categorical")]
953            (DataType::Enum(l, _), DataType::Enum(r, _)) => {
954                ensure_same_frozen_categories(l, r)?;
955                Ok(false)
956            },
957
958            (l, r) if l == r => Ok(false),
959            (l, r) => {
960                polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
961            },
962        }
963    }
964
965    #[inline]
966    pub fn is_unknown(&self) -> bool {
967        matches!(self, DataType::Unknown(_))
968    }
969
970    pub fn nesting_level(&self) -> usize {
971        let mut level = 0;
972        let mut slf = self;
973        while let Some(inner_dtype) = slf.inner_dtype() {
974            level += 1;
975            slf = inner_dtype;
976        }
977        level
978    }
979
980    /// If this dtype is a Categorical or Enum, returns the physical backing type.
981    #[cfg(feature = "dtype-categorical")]
982    pub fn cat_physical(&self) -> PolarsResult<CategoricalPhysical> {
983        match self {
984            DataType::Categorical(cats, _) => Ok(cats.physical()),
985            DataType::Enum(fcats, _) => Ok(fcats.physical()),
986            _ => {
987                polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
988            },
989        }
990    }
991
992    /// If this dtype is a Categorical or Enum, returns the underlying mapping.
993    #[cfg(feature = "dtype-categorical")]
994    pub fn cat_mapping(&self) -> PolarsResult<&Arc<CategoricalMapping>> {
995        match self {
996            DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => Ok(mapping),
997            _ => {
998                polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
999            },
1000        }
1001    }
1002
1003    #[cfg(feature = "dtype-categorical")]
1004    pub fn from_categories(cats: Arc<Categories>) -> Self {
1005        let mapping = cats.mapping();
1006        Self::Categorical(cats, mapping)
1007    }
1008
1009    #[cfg(feature = "dtype-categorical")]
1010    pub fn from_frozen_categories(fcats: Arc<FrozenCategories>) -> Self {
1011        let mapping = fcats.mapping().clone();
1012        Self::Enum(fcats, mapping)
1013    }
1014
1015    pub fn is_numeric(&self) -> bool {
1016        self.is_integer() || self.is_float() || self.is_decimal()
1017    }
1018}
1019
1020impl Display for DataType {
1021    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1022        let s = match self {
1023            DataType::Null => "null",
1024            DataType::Boolean => "bool",
1025            DataType::UInt8 => "u8",
1026            DataType::UInt16 => "u16",
1027            DataType::UInt32 => "u32",
1028            DataType::UInt64 => "u64",
1029            DataType::Int8 => "i8",
1030            DataType::Int16 => "i16",
1031            DataType::Int32 => "i32",
1032            DataType::Int64 => "i64",
1033            DataType::Int128 => "i128",
1034            DataType::Float32 => "f32",
1035            DataType::Float64 => "f64",
1036            #[cfg(feature = "dtype-decimal")]
1037            DataType::Decimal(precision, scale) => {
1038                return match (precision, scale) {
1039                    (Some(precision), Some(scale)) => {
1040                        f.write_str(&format!("decimal[{precision},{scale}]"))
1041                    },
1042                    (None, Some(scale)) => f.write_str(&format!("decimal[*,{scale}]")),
1043                    _ => f.write_str("decimal[?]"), // shouldn't happen
1044                };
1045            },
1046            DataType::String => "str",
1047            DataType::Binary => "binary",
1048            DataType::Date => "date",
1049            DataType::Datetime(tu, tz) => {
1050                let s = match tz {
1051                    None => format!("datetime[{tu}]"),
1052                    Some(tz) => format!("datetime[{tu}, {tz}]"),
1053                };
1054                return f.write_str(&s);
1055            },
1056            DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
1057            DataType::Time => "time",
1058            #[cfg(feature = "dtype-array")]
1059            DataType::Array(_, _) => {
1060                let tp = self.array_leaf_dtype().unwrap();
1061
1062                let dims = self.get_shape().unwrap();
1063                let shape = if dims.len() == 1 {
1064                    format!("{}", dims[0])
1065                } else {
1066                    format_tuple!(dims)
1067                };
1068                return write!(f, "array[{tp}, {shape}]");
1069            },
1070            DataType::List(tp) => return write!(f, "list[{tp}]"),
1071            #[cfg(feature = "object")]
1072            DataType::Object(s) => s,
1073            #[cfg(feature = "dtype-categorical")]
1074            DataType::Categorical(_, _) => "cat",
1075            #[cfg(feature = "dtype-categorical")]
1076            DataType::Enum(_, _) => "enum",
1077            #[cfg(feature = "dtype-struct")]
1078            DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
1079            DataType::Unknown(kind) => match kind {
1080                UnknownKind::Ufunc => "unknown ufunc",
1081                UnknownKind::Any => "unknown",
1082                UnknownKind::Int(_) => "dyn int",
1083                UnknownKind::Float => "dyn float",
1084                UnknownKind::Str => "dyn str",
1085            },
1086            DataType::BinaryOffset => "binary[offset]",
1087        };
1088        f.write_str(s)
1089    }
1090}
1091
1092pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
1093    use DataType::*;
1094    Ok(match (left, right) {
1095        #[cfg(feature = "dtype-categorical")]
1096        (Categorical(cats_l, map), Categorical(cats_r, _)) => {
1097            ensure_same_categories(cats_l, cats_r)?;
1098            Categorical(cats_l.clone(), map.clone())
1099        },
1100        #[cfg(feature = "dtype-categorical")]
1101        (Enum(fcats_l, map), Enum(fcats_r, _)) => {
1102            ensure_same_frozen_categories(fcats_l, fcats_r)?;
1103            Enum(fcats_l.clone(), map.clone())
1104        },
1105        (List(inner_l), List(inner_r)) => {
1106            let merged = merge_dtypes(inner_l, inner_r)?;
1107            List(Box::new(merged))
1108        },
1109        #[cfg(feature = "dtype-struct")]
1110        (Struct(inner_l), Struct(inner_r)) => {
1111            polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1112            let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1113                polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1114                let merged = merge_dtypes(l.dtype(), r.dtype())?;
1115                Ok(Field::new(l.name().clone(), merged))
1116            }).collect::<PolarsResult<Vec<_>>>()?;
1117            Struct(fields)
1118        },
1119        #[cfg(feature = "dtype-array")]
1120        (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1121            polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1122            let merged = merge_dtypes(inner_l, inner_r)?;
1123            Array(Box::new(merged), *width_l)
1124        },
1125        (left, right) if left == right => left.clone(),
1126        _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1127    })
1128}
1129
1130fn collect_nested_types(
1131    dtype: &DataType,
1132    result: &mut PlHashSet<DataType>,
1133    include_compound_types: bool,
1134) {
1135    match dtype {
1136        DataType::List(inner) => {
1137            if include_compound_types {
1138                result.insert(dtype.clone());
1139            }
1140            collect_nested_types(inner, result, include_compound_types);
1141        },
1142        #[cfg(feature = "dtype-array")]
1143        DataType::Array(inner, _) => {
1144            if include_compound_types {
1145                result.insert(dtype.clone());
1146            }
1147            collect_nested_types(inner, result, include_compound_types);
1148        },
1149        #[cfg(feature = "dtype-struct")]
1150        DataType::Struct(fields) => {
1151            if include_compound_types {
1152                result.insert(dtype.clone());
1153            }
1154            for field in fields {
1155                collect_nested_types(field.dtype(), result, include_compound_types);
1156            }
1157        },
1158        _ => {
1159            result.insert(dtype.clone());
1160        },
1161    }
1162}
1163
1164pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1165    let mut result = PlHashSet::new();
1166    collect_nested_types(dtype, &mut result, include_compound_types);
1167    result
1168}
1169
/// Compatibility level, stored as a plain `u16`.
///
/// The associated constructors in this file define level `0` as the oldest
/// and level `1` as the newest. In the Arrow dtype conversion in this file a
/// level `>= 1` selects the view types (`Utf8View` / `BinaryView`), while a
/// lower level selects `LargeUtf8` / `LargeBinary`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CompatLevel(pub(crate) u16);
1174
1175impl CompatLevel {
1176    pub const fn newest() -> CompatLevel {
1177        CompatLevel(1)
1178    }
1179
1180    pub const fn oldest() -> CompatLevel {
1181        CompatLevel(0)
1182    }
1183
1184    // The following methods are only used internally
1185
1186    #[doc(hidden)]
1187    pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1188        if level > CompatLevel::newest().0 {
1189            polars_bail!(InvalidOperation: "invalid compat level");
1190        }
1191        Ok(CompatLevel(level))
1192    }
1193
1194    #[doc(hidden)]
1195    pub fn get_level(&self) -> u16 {
1196        self.0
1197    }
1198}
1199
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `list[array[f64, 10]]` and returns it together with the
    /// intermediate array dtype.
    #[cfg(feature = "dtype-array")]
    fn nested_fixture() -> (DataType, DataType) {
        let array_type = DataType::Array(Box::new(DataType::Float64), 10);
        let list_type = DataType::List(Box::new(array_type.clone()));
        (list_type, array_type)
    }

    /// Without compound types only the leaf dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (list_type, _) = nested_fixture();

        let expected: PlHashSet<DataType> = [DataType::Float64].into_iter().collect();

        assert_eq!(unpack_dtypes(&list_type, false), expected)
    }

    /// With compound types every level of the nesting is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (list_type, array_type) = nested_fixture();

        let result = unpack_dtypes(&list_type, true);

        let expected: PlHashSet<DataType> = [list_type, array_type, DataType::Float64]
            .into_iter()
            .collect();

        assert_eq!(result, expected)
    }
}