polars_core/datatypes/
dtype.rs

1use std::collections::BTreeMap;
2
3use arrow::datatypes::{DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Metadata};
4#[cfg(feature = "dtype-array")]
5use polars_utils::format_tuple;
6use polars_utils::itertools::Itertools;
7#[cfg(any(feature = "serde-lazy", feature = "serde"))]
8use serde::{Deserialize, Serialize};
9use strum_macros::IntoStaticStr;
10
11use super::*;
12#[cfg(feature = "object")]
13use crate::chunked_array::object::registry::get_object_physical_type;
14use crate::utils::materialize_dyn_int;
15
16pub type TimeZone = PlSmallStr;
17
18static MAINTAIN_PL_TYPE: &str = "maintain_type";
19static PL_KEY: &str = "pl";
20
21pub trait MetaDataExt: IntoMetadata {
22    fn is_enum(&self) -> bool {
23        let metadata = self.into_metadata_ref();
24        metadata.get(DTYPE_ENUM_VALUES).is_some()
25    }
26
27    fn categorical(&self) -> Option<CategoricalOrdering> {
28        let metadata = self.into_metadata_ref();
29        match metadata.get(DTYPE_CATEGORICAL)?.as_str() {
30            "lexical" => Some(CategoricalOrdering::Lexical),
31            // Default is Physical
32            _ => Some(CategoricalOrdering::Physical),
33        }
34    }
35
36    fn maintain_type(&self) -> bool {
37        let metadata = self.into_metadata_ref();
38        metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
39    }
40}
41
// `Metadata` uses the provided `MetaDataExt` methods as-is; nothing is overridden.
impl MetaDataExt for Metadata {}
/// Gives borrowed access to an underlying [`Metadata`] value.
pub trait IntoMetadata {
    /// Borrows the metadata. Despite the `into_` prefix nothing is consumed,
    /// which is why the naming lint is silenced here.
    #[allow(clippy::wrong_self_convention)]
    fn into_metadata_ref(&self) -> &Metadata;
}
47
impl IntoMetadata for Metadata {
    // `Metadata` is its own metadata, so the reference is handed back unchanged.
    fn into_metadata_ref(&self) -> &Metadata {
        self
    }
}
53
/// Payload of [`DataType::Unknown`]: what is known so far about a dtype that
/// has not been resolved yet.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
pub enum UnknownKind {
    // Hold the value to determine the concrete size.
    Int(i128),
    /// Some floating point type; `materialize` resolves it to `Float64`.
    Float,
    // Can be Categorical or String
    Str,
    /// Nothing is known. This is the default kind and `materialize` returns
    /// `None` for it.
    #[default]
    Any,
}
68
69impl UnknownKind {
70    pub fn materialize(&self) -> Option<DataType> {
71        let dtype = match self {
72            UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
73            UnknownKind::Float => DataType::Float64,
74            UnknownKind::Str => DataType::String,
75            UnknownKind::Any => return None,
76        };
77        Some(dtype)
78    }
79}
80
/// Ordering mode attached to the `Categorical` and `Enum` dtypes.
///
/// The `strum` attribute below makes `IntoStaticStr` render the variant names
/// in snake_case.
#[derive(Debug, Copy, Clone, PartialEq, Default, IntoStaticStr)]
#[cfg_attr(
    any(feature = "serde-lazy", feature = "serde"),
    derive(Serialize, Deserialize)
)]
#[strum(serialize_all = "snake_case")]
pub enum CategoricalOrdering {
    /// The default ordering.
    #[default]
    Physical,
    Lexical,
}
92
/// The data types a column can have.
///
/// Only `Clone` and `Debug` are derived; `Hash`, `PartialEq` and `Eq` are
/// implemented by hand further down in this file. Several variants only exist
/// when the corresponding cargo feature is enabled.
#[derive(Clone, Debug)]
pub enum DataType {
    Boolean,
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Float32,
    Float64,
    /// Fixed point decimal type optional precision and non-negative scale.
    /// This is backed by a signed 128-bit integer which allows for up to 38 significant digits.
    /// Meaning max precision is 38.
    #[cfg(feature = "dtype-decimal")]
    Decimal(Option<usize>, Option<usize>), // precision/scale; scale being None means "infer"
    /// String data
    String,
    Binary,
    BinaryOffset,
    /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in days (32 bits).
    Date,
    /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
    /// in the given timeunit (64 bits).
    Datetime(TimeUnit, Option<TimeZone>),
    /// 64-bit integer representing difference between times in milliseconds or nanoseconds
    Duration(TimeUnit),
    /// A 64-bit time representing the elapsed time since midnight in nanoseconds
    Time,
    /// A nested list with a fixed size in each row
    #[cfg(feature = "dtype-array")]
    Array(Box<DataType>, usize),
    /// A nested list with a variable size in each row
    List(Box<DataType>),
    /// A generic type that can be used in a `Series`
    /// &'static str can be used to determine/set inner type
    #[cfg(feature = "object")]
    Object(&'static str),
    Null,
    // The RevMapping has the internal state.
    // This is ignored with comparisons, hashing etc.
    #[cfg(feature = "dtype-categorical")]
    Categorical(Option<Arc<RevMapping>>, CategoricalOrdering),
    // It is an Option, so that matching Enum/Categoricals can take the same guards.
    #[cfg(feature = "dtype-categorical")]
    Enum(Option<Arc<RevMapping>>, CategoricalOrdering),
    /// A struct made of named, typed fields.
    #[cfg(feature = "dtype-struct")]
    Struct(Vec<Field>),
    // some logical types we cannot know statically, e.g. Datetime
    Unknown(UnknownKind),
}
148
149impl Default for DataType {
150    fn default() -> Self {
151        DataType::Unknown(UnknownKind::Any)
152    }
153}
154
/// Implemented by types that can hand out a reference to a [`DataType`].
pub trait AsRefDataType {
    /// Borrows the data type.
    fn as_ref_dtype(&self) -> &DataType;
}
158
159impl Hash for DataType {
160    fn hash<H: Hasher>(&self, state: &mut H) {
161        std::mem::discriminant(self).hash(state)
162    }
163}
164
165impl PartialEq for DataType {
166    fn eq(&self, other: &Self) -> bool {
167        use DataType::*;
168        {
169            match (self, other) {
170                #[cfg(feature = "dtype-categorical")]
171                // Don't include rev maps in comparisons
172                // TODO: include ordering in comparison
173                (Categorical(_, _ordering_l), Categorical(_, _ordering_r)) => true,
174                #[cfg(feature = "dtype-categorical")]
175                // None means select all Enum dtypes. This is for operation `pl.col(pl.Enum)`
176                (Enum(None, _), Enum(_, _)) | (Enum(_, _), Enum(None, _)) => true,
177                #[cfg(feature = "dtype-categorical")]
178                (Enum(Some(cat_lhs), _), Enum(Some(cat_rhs), _)) => {
179                    cat_lhs.get_categories() == cat_rhs.get_categories()
180                },
181                (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
182                (List(left_inner), List(right_inner)) => left_inner == right_inner,
183                #[cfg(feature = "dtype-duration")]
184                (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
185                #[cfg(feature = "dtype-decimal")]
186                (Decimal(l_prec, l_scale), Decimal(r_prec, r_scale)) => {
187                    let is_prec_eq = l_prec.is_none() || r_prec.is_none() || l_prec == r_prec;
188                    let is_scale_eq = l_scale.is_none() || r_scale.is_none() || l_scale == r_scale;
189
190                    is_prec_eq && is_scale_eq
191                },
192                #[cfg(feature = "object")]
193                (Object(lhs), Object(rhs)) => lhs == rhs,
194                #[cfg(feature = "dtype-struct")]
195                (Struct(lhs), Struct(rhs)) => Vec::as_ptr(lhs) == Vec::as_ptr(rhs) || lhs == rhs,
196                #[cfg(feature = "dtype-array")]
197                (Array(left_inner, left_width), Array(right_inner, right_width)) => {
198                    left_width == right_width && left_inner == right_inner
199                },
200                (Unknown(l), Unknown(r)) => match (l, r) {
201                    (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
202                    _ => l == r,
203                },
204                _ => std::mem::discriminant(self) == std::mem::discriminant(other),
205            }
206        }
207    }
208}
209
// Marker impl so `DataType` can be used where `Eq` is required.
// NOTE(review): the wildcard cases in the `PartialEq` impl (`Enum(None, _)`,
// unset decimal precision/scale) are not transitive — confirm no caller
// relies on strict equivalence.
impl Eq for DataType {}
211
212impl DataType {
213    pub fn new_idxsize() -> Self {
214        #[cfg(feature = "bigidx")]
215        {
216            Self::UInt64
217        }
218        #[cfg(not(feature = "bigidx"))]
219        {
220            Self::UInt32
221        }
222    }
223
224    /// Standardize timezones to consistent values.
225    pub(crate) fn canonical_timezone(tz: &Option<PlSmallStr>) -> Option<TimeZone> {
226        match tz.as_deref() {
227            Some("") | None => None,
228            #[cfg(feature = "timezones")]
229            Some("+00:00") | Some("00:00") | Some("utc") => Some(PlSmallStr::from_static("UTC")),
230            Some(v) => Some(PlSmallStr::from_str(v)),
231        }
232    }
233
234    pub fn value_within_range(&self, other: AnyValue) -> bool {
235        use DataType::*;
236        match self {
237            UInt8 => other.extract::<u8>().is_some(),
238            #[cfg(feature = "dtype-u16")]
239            UInt16 => other.extract::<u16>().is_some(),
240            UInt32 => other.extract::<u32>().is_some(),
241            UInt64 => other.extract::<u64>().is_some(),
242            #[cfg(feature = "dtype-i8")]
243            Int8 => other.extract::<i8>().is_some(),
244            #[cfg(feature = "dtype-i16")]
245            Int16 => other.extract::<i16>().is_some(),
246            Int32 => other.extract::<i32>().is_some(),
247            Int64 => other.extract::<i64>().is_some(),
248            _ => false,
249        }
250    }
251
252    /// Check if the whole dtype is known.
253    pub fn is_known(&self) -> bool {
254        match self {
255            DataType::List(inner) => inner.is_known(),
256            #[cfg(feature = "dtype-array")]
257            DataType::Array(inner, _) => inner.is_known(),
258            #[cfg(feature = "dtype-struct")]
259            DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
260            DataType::Unknown(_) => false,
261            _ => true,
262        }
263    }
264
265    /// Materialize this datatype if it is unknown. All other datatypes
266    /// are left unchanged.
267    pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
268        match self {
269            DataType::Unknown(u) => match u.materialize() {
270                Some(known) => Ok(known),
271                None => {
272                    if allow_unknown {
273                        Ok(DataType::Unknown(u))
274                    } else {
275                        polars_bail!(SchemaMismatch: "failed to materialize unknown type")
276                    }
277                },
278            },
279            DataType::List(inner) => Ok(DataType::List(Box::new(
280                inner.materialize_unknown(allow_unknown)?,
281            ))),
282            #[cfg(feature = "dtype-array")]
283            DataType::Array(inner, size) => Ok(DataType::Array(
284                Box::new(inner.materialize_unknown(allow_unknown)?),
285                size,
286            )),
287            #[cfg(feature = "dtype-struct")]
288            DataType::Struct(fields) => Ok(DataType::Struct(
289                fields
290                    .into_iter()
291                    .map(|f| {
292                        PolarsResult::Ok(Field::new(
293                            f.name,
294                            f.dtype.materialize_unknown(allow_unknown)?,
295                        ))
296                    })
297                    .try_collect_vec()?,
298            )),
299            _ => Ok(self),
300        }
301    }
302
303    #[cfg(feature = "dtype-array")]
304    /// Get the full shape of a multidimensional array.
305    pub fn get_shape(&self) -> Option<Vec<usize>> {
306        fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
307            if let DataType::Array(inner, size) = dt {
308                shape.push(*size);
309                get_shape_impl(inner, shape);
310            }
311        }
312
313        if let DataType::Array(inner, size) = self {
314            let mut shape = vec![*size];
315            get_shape_impl(inner, &mut shape);
316            Some(shape)
317        } else {
318            None
319        }
320    }
321
322    /// Get the inner data type of a nested type.
323    pub fn inner_dtype(&self) -> Option<&DataType> {
324        match self {
325            DataType::List(inner) => Some(inner),
326            #[cfg(feature = "dtype-array")]
327            DataType::Array(inner, _) => Some(inner),
328            _ => None,
329        }
330    }
331
332    /// Get the absolute inner data type of a nested type.
333    pub fn leaf_dtype(&self) -> &DataType {
334        let mut prev = self;
335        while let Some(dtype) = prev.inner_dtype() {
336            prev = dtype
337        }
338        prev
339    }
340
341    #[cfg(feature = "dtype-array")]
342    /// Get the inner data type of a multidimensional array.
343    pub fn array_leaf_dtype(&self) -> Option<&DataType> {
344        let mut prev = self;
345        match prev {
346            DataType::Array(_, _) => {
347                while let DataType::Array(inner, _) = &prev {
348                    prev = inner;
349                }
350                Some(prev)
351            },
352            _ => None,
353        }
354    }
355
356    /// Cast the leaf types of Lists/Arrays and keep the nesting.
357    pub fn cast_leaf(&self, to: DataType) -> DataType {
358        use DataType::*;
359        match self {
360            List(inner) => List(Box::new(inner.cast_leaf(to))),
361            #[cfg(feature = "dtype-array")]
362            Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
363            _ => to,
364        }
365    }
366
    /// Return whether the cast to `to` makes sense.
    ///
    /// `Some(true)` / `Some(false)` is a definite answer. If it is `None`, we
    /// are not sure.
    pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
        // Identical dtypes, numeric-to-numeric, and anything from Null can always be cast.
        if self == to {
            return Some(true);
        }
        if self.is_primitive_numeric() && to.is_primitive_numeric() {
            return Some(true);
        }

        if self.is_null() {
            return Some(true);
        }

        use DataType as D;
        Some(match (self, to) {
            #[cfg(feature = "dtype-categorical")]
            (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
            | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false,

            // Objects only cast to objects.
            #[cfg(feature = "object")]
            (D::Object(_), D::Object(_)) => true,
            #[cfg(feature = "object")]
            (D::Object(_), _) | (_, D::Object(_)) => false,

            // Booleans cast to and from numerics, decimals, strings and binary.
            (D::Boolean, dt) | (dt, D::Boolean) => match dt {
                dt if dt.is_primitive_numeric() => true,
                #[cfg(feature = "dtype-decimal")]
                D::Decimal(_, _) => true,
                D::String | D::Binary => true,
                _ => false,
            },

            // Nested dtypes defer to their inner dtypes; an unsure inner answer
            // (`None`) propagates out through `?`.
            (D::List(from), D::List(to)) => from.can_cast_to(to)?,
            #[cfg(feature = "dtype-array")]
            (D::Array(from, l_width), D::Array(to, r_width)) => {
                l_width == r_width && from.can_cast_to(to)?
            },
            #[cfg(feature = "dtype-struct")]
            (D::Struct(l_fields), D::Struct(r_fields)) => {
                if l_fields.is_empty() {
                    return Some(true);
                }

                if l_fields.len() != r_fields.len() {
                    return Some(false);
                }

                // Fields are compared pairwise, in order.
                for (l, r) in l_fields.iter().zip(r_fields) {
                    if !l.dtype().can_cast_to(r.dtype())? {
                        return Some(false);
                    }
                }

                true
            },

            // @NOTE: we are being conservative
            _ => return None,
        })
    }
429
430    pub fn implode(self) -> DataType {
431        DataType::List(Box::new(self))
432    }
433
434    /// Convert to the physical data type
435    #[must_use]
436    pub fn to_physical(&self) -> DataType {
437        use DataType::*;
438        match self {
439            Date => Int32,
440            Datetime(_, _) => Int64,
441            Duration(_) => Int64,
442            Time => Int64,
443            #[cfg(feature = "dtype-decimal")]
444            Decimal(_, _) => Int128,
445            #[cfg(feature = "dtype-categorical")]
446            Categorical(_, _) | Enum(_, _) => UInt32,
447            #[cfg(feature = "dtype-array")]
448            Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
449            List(dt) => List(Box::new(dt.to_physical())),
450            #[cfg(feature = "dtype-struct")]
451            Struct(fields) => {
452                let new_fields = fields
453                    .iter()
454                    .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
455                    .collect();
456                Struct(new_fields)
457            },
458            _ => self.clone(),
459        }
460    }
461
462    pub fn is_supported_list_arithmetic_input(&self) -> bool {
463        self.is_primitive_numeric() || self.is_bool() || self.is_null()
464    }
465
466    /// Check if this [`DataType`] is a logical type
467    pub fn is_logical(&self) -> bool {
468        self != &self.to_physical()
469    }
470
471    /// Check if this [`DataType`] is a temporal type
472    pub fn is_temporal(&self) -> bool {
473        use DataType::*;
474        matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
475    }
476
477    /// Check if datatype is a primitive type. By that we mean that
478    /// it is not a nested or logical type.
479    pub fn is_primitive(&self) -> bool {
480        self.is_primitive_numeric()
481            | matches!(
482                self,
483                DataType::Boolean | DataType::String | DataType::Binary
484            )
485    }
486
487    /// Check if this [`DataType`] is a primitive numeric type (excludes Decimal).
488    pub fn is_primitive_numeric(&self) -> bool {
489        self.is_float() || self.is_integer()
490    }
491
492    /// Check if this [`DataType`] is a boolean.
493    pub fn is_bool(&self) -> bool {
494        matches!(self, DataType::Boolean)
495    }
496
497    /// Check if this [`DataType`] is a list.
498    pub fn is_list(&self) -> bool {
499        matches!(self, DataType::List(_))
500    }
501
502    /// Check if this [`DataType`] is an array.
503    pub fn is_array(&self) -> bool {
504        #[cfg(feature = "dtype-array")]
505        {
506            matches!(self, DataType::Array(_, _))
507        }
508        #[cfg(not(feature = "dtype-array"))]
509        {
510            false
511        }
512    }
513
514    pub fn is_nested(&self) -> bool {
515        self.is_list() || self.is_struct() || self.is_array()
516    }
517
518    /// Check if this [`DataType`] is a struct
519    pub fn is_struct(&self) -> bool {
520        #[cfg(feature = "dtype-struct")]
521        {
522            matches!(self, DataType::Struct(_))
523        }
524        #[cfg(not(feature = "dtype-struct"))]
525        {
526            false
527        }
528    }
529
530    pub fn is_binary(&self) -> bool {
531        matches!(self, DataType::Binary)
532    }
533
534    pub fn is_date(&self) -> bool {
535        matches!(self, DataType::Date)
536    }
537    pub fn is_datetime(&self) -> bool {
538        matches!(self, DataType::Datetime(..))
539    }
540
541    pub fn is_object(&self) -> bool {
542        #[cfg(feature = "object")]
543        {
544            matches!(self, DataType::Object(_))
545        }
546        #[cfg(not(feature = "object"))]
547        {
548            false
549        }
550    }
551
552    pub fn is_null(&self) -> bool {
553        matches!(self, DataType::Null)
554    }
555
556    pub fn contains_views(&self) -> bool {
557        use DataType::*;
558        match self {
559            Binary | String => true,
560            #[cfg(feature = "dtype-categorical")]
561            Categorical(_, _) | Enum(_, _) => true,
562            List(inner) => inner.contains_views(),
563            #[cfg(feature = "dtype-array")]
564            Array(inner, _) => inner.contains_views(),
565            #[cfg(feature = "dtype-struct")]
566            Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
567            _ => false,
568        }
569    }
570
571    pub fn contains_categoricals(&self) -> bool {
572        use DataType::*;
573        match self {
574            #[cfg(feature = "dtype-categorical")]
575            Categorical(_, _) | Enum(_, _) => true,
576            List(inner) => inner.contains_categoricals(),
577            #[cfg(feature = "dtype-array")]
578            Array(inner, _) => inner.contains_categoricals(),
579            #[cfg(feature = "dtype-struct")]
580            Struct(fields) => fields
581                .iter()
582                .any(|field| field.dtype.contains_categoricals()),
583            _ => false,
584        }
585    }
586
587    pub fn contains_objects(&self) -> bool {
588        use DataType::*;
589        match self {
590            #[cfg(feature = "object")]
591            Object(_) => true,
592            List(inner) => inner.contains_objects(),
593            #[cfg(feature = "dtype-array")]
594            Array(inner, _) => inner.contains_objects(),
595            #[cfg(feature = "dtype-struct")]
596            Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
597            _ => false,
598        }
599    }
600
601    /// Check if type is sortable
602    pub fn is_ord(&self) -> bool {
603        #[cfg(feature = "dtype-categorical")]
604        let is_cat = matches!(self, DataType::Categorical(_, _) | DataType::Enum(_, _));
605        #[cfg(not(feature = "dtype-categorical"))]
606        let is_cat = false;
607
608        let phys = self.to_physical();
609        (phys.is_primitive_numeric()
610            || self.is_decimal()
611            || matches!(
612                phys,
613                DataType::Binary | DataType::String | DataType::Boolean
614            ))
615            && !is_cat
616    }
617
618    /// Check if this [`DataType`] is a Decimal type (of any scale/precision).
619    pub fn is_decimal(&self) -> bool {
620        match self {
621            #[cfg(feature = "dtype-decimal")]
622            DataType::Decimal(_, _) => true,
623            _ => false,
624        }
625    }
626
627    /// Check if this [`DataType`] is a basic floating point type (excludes Decimal).
628    /// Note, this also includes `Unknown(UnknownKind::Float)`.
629    pub fn is_float(&self) -> bool {
630        matches!(
631            self,
632            DataType::Float32 | DataType::Float64 | DataType::Unknown(UnknownKind::Float)
633        )
634    }
635
636    /// Check if this [`DataType`] is an integer. Note, this also includes `Unknown(UnknownKind::Int)`.
637    pub fn is_integer(&self) -> bool {
638        matches!(
639            self,
640            DataType::Int8
641                | DataType::Int16
642                | DataType::Int32
643                | DataType::Int64
644                | DataType::Int128
645                | DataType::UInt8
646                | DataType::UInt16
647                | DataType::UInt32
648                | DataType::UInt64
649                | DataType::Unknown(UnknownKind::Int(_))
650        )
651    }
652
653    pub fn is_signed_integer(&self) -> bool {
654        // allow because it cannot be replaced when object feature is activated
655        matches!(
656            self,
657            DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
658        )
659    }
660
661    pub fn is_unsigned_integer(&self) -> bool {
662        matches!(
663            self,
664            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64,
665        )
666    }
667
668    pub fn is_string(&self) -> bool {
669        matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
670    }
671
672    pub fn is_categorical(&self) -> bool {
673        #[cfg(feature = "dtype-categorical")]
674        {
675            matches!(self, DataType::Categorical(_, _))
676        }
677        #[cfg(not(feature = "dtype-categorical"))]
678        {
679            false
680        }
681    }
682
683    pub fn is_enum(&self) -> bool {
684        #[cfg(feature = "dtype-categorical")]
685        {
686            matches!(self, DataType::Enum(_, _))
687        }
688        #[cfg(not(feature = "dtype-categorical"))]
689        {
690            false
691        }
692    }
693
694    /// Convert to an Arrow Field
695    pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
696        let metadata = match self {
697            #[cfg(feature = "dtype-categorical")]
698            DataType::Enum(Some(revmap), _) => {
699                let cats = revmap.get_categories();
700                let mut encoded = String::with_capacity(cats.len() * 10);
701                for cat in cats.values_iter() {
702                    encoded.push_str(itoa::Buffer::new().format(cat.len()));
703                    encoded.push(';');
704                    encoded.push_str(cat);
705                }
706                Some(BTreeMap::from([(
707                    PlSmallStr::from_static(DTYPE_ENUM_VALUES),
708                    PlSmallStr::from_string(encoded),
709                )]))
710            },
711            #[cfg(feature = "dtype-categorical")]
712            DataType::Categorical(_, ordering) => Some(BTreeMap::from([(
713                PlSmallStr::from_static(DTYPE_CATEGORICAL),
714                PlSmallStr::from_static(ordering.into()),
715            )])),
716            DataType::BinaryOffset => Some(BTreeMap::from([(
717                PlSmallStr::from_static(PL_KEY),
718                PlSmallStr::from_static(MAINTAIN_PL_TYPE),
719            )])),
720            _ => None,
721        };
722
723        let field = ArrowField::new(name, self.to_arrow(compat_level), true);
724
725        if let Some(metadata) = metadata {
726            field.with_metadata(metadata)
727        } else {
728            field
729        }
730    }
731
732    /// Try to get the maximum value for this datatype.
733    pub fn max(&self) -> PolarsResult<Scalar> {
734        use DataType::*;
735        let v = match self {
736            Int8 => Scalar::from(i8::MAX),
737            Int16 => Scalar::from(i16::MAX),
738            Int32 => Scalar::from(i32::MAX),
739            Int64 => Scalar::from(i64::MAX),
740            Int128 => Scalar::from(i128::MAX),
741            UInt8 => Scalar::from(u8::MAX),
742            UInt16 => Scalar::from(u16::MAX),
743            UInt32 => Scalar::from(u32::MAX),
744            UInt64 => Scalar::from(u64::MAX),
745            Float32 => Scalar::from(f32::INFINITY),
746            Float64 => Scalar::from(f64::INFINITY),
747            #[cfg(feature = "dtype-time")]
748            Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
749            dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
750        };
751        Ok(v)
752    }
753
754    /// Try to get the minimum value for this datatype.
755    pub fn min(&self) -> PolarsResult<Scalar> {
756        use DataType::*;
757        let v = match self {
758            Int8 => Scalar::from(i8::MIN),
759            Int16 => Scalar::from(i16::MIN),
760            Int32 => Scalar::from(i32::MIN),
761            Int64 => Scalar::from(i64::MIN),
762            Int128 => Scalar::from(i128::MIN),
763            UInt8 => Scalar::from(u8::MIN),
764            UInt16 => Scalar::from(u16::MIN),
765            UInt32 => Scalar::from(u32::MIN),
766            UInt64 => Scalar::from(u64::MIN),
767            Float32 => Scalar::from(f32::NEG_INFINITY),
768            Float64 => Scalar::from(f64::NEG_INFINITY),
769            #[cfg(feature = "dtype-time")]
770            Time => Scalar::new(Time, AnyValue::Time(0)),
771            dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
772        };
773        Ok(v)
774    }
775
776    /// Convert to an Arrow data type.
777    #[inline]
778    pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
779        self.try_to_arrow(compat_level).unwrap()
780    }
781
782    #[inline]
783    pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
784        use DataType::*;
785        match self {
786            Boolean => Ok(ArrowDataType::Boolean),
787            UInt8 => Ok(ArrowDataType::UInt8),
788            UInt16 => Ok(ArrowDataType::UInt16),
789            UInt32 => Ok(ArrowDataType::UInt32),
790            UInt64 => Ok(ArrowDataType::UInt64),
791            Int8 => Ok(ArrowDataType::Int8),
792            Int16 => Ok(ArrowDataType::Int16),
793            Int32 => Ok(ArrowDataType::Int32),
794            Int64 => Ok(ArrowDataType::Int64),
795            Int128 => Ok(ArrowDataType::Int128),
796            Float32 => Ok(ArrowDataType::Float32),
797            Float64 => Ok(ArrowDataType::Float64),
798            #[cfg(feature = "dtype-decimal")]
799            Decimal(precision, scale) => {
800                let precision = (*precision).unwrap_or(38);
801                polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1");
802
803                Ok(ArrowDataType::Decimal(
804                    precision,
805                    scale.unwrap_or(0), // and what else can we do here?
806                ))
807            },
808            String => {
809                let dt = if compat_level.0 >= 1 {
810                    ArrowDataType::Utf8View
811                } else {
812                    ArrowDataType::LargeUtf8
813                };
814                Ok(dt)
815            },
816            Binary => {
817                let dt = if compat_level.0 >= 1 {
818                    ArrowDataType::BinaryView
819                } else {
820                    ArrowDataType::LargeBinary
821                };
822                Ok(dt)
823            },
824            Date => Ok(ArrowDataType::Date32),
825            Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(unit.to_arrow(), tz.clone())),
826            Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
827            Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
828            #[cfg(feature = "dtype-array")]
829            Array(dt, size) => Ok(dt
830                .try_to_arrow(compat_level)?
831                .to_fixed_size_list(*size, true)),
832            List(dt) => Ok(ArrowDataType::LargeList(Box::new(
833                dt.to_arrow_field(PlSmallStr::from_static("item"), compat_level),
834            ))),
835            Null => Ok(ArrowDataType::Null),
836            #[cfg(feature = "object")]
837            Object(_) => Ok(get_object_physical_type()),
838            #[cfg(feature = "dtype-categorical")]
839            Categorical(_, _) | Enum(_, _) => {
840                let values = if compat_level.0 >= 1 {
841                    ArrowDataType::Utf8View
842                } else {
843                    ArrowDataType::LargeUtf8
844                };
845                Ok(ArrowDataType::Dictionary(
846                    IntegerType::UInt32,
847                    Box::new(values),
848                    false,
849                ))
850            },
851            #[cfg(feature = "dtype-struct")]
852            Struct(fields) => {
853                let fields = fields
854                    .iter()
855                    .map(|fld| fld.to_arrow(compat_level))
856                    .collect();
857                Ok(ArrowDataType::Struct(fields))
858            },
859            BinaryOffset => Ok(ArrowDataType::LargeBinary),
860            Unknown(kind) => {
861                let dt = match kind {
862                    UnknownKind::Any => ArrowDataType::Unknown,
863                    UnknownKind::Float => ArrowDataType::Float64,
864                    UnknownKind::Str => ArrowDataType::Utf8View,
865                    UnknownKind::Int(v) => {
866                        return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
867                    },
868                };
869                Ok(dt)
870            },
871        }
872    }
873
874    pub fn is_nested_null(&self) -> bool {
875        use DataType::*;
876        match self {
877            Null => true,
878            List(field) => field.is_nested_null(),
879            #[cfg(feature = "dtype-array")]
880            Array(field, _) => field.is_nested_null(),
881            #[cfg(feature = "dtype-struct")]
882            Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
883            _ => false,
884        }
885    }
886
887    /// Answers if this type matches the given type of a schema.
888    ///
889    /// Allows (nested) Null types in this type to match any type in the schema,
890    /// but not vice versa. In such a case Ok(true) is returned, because a cast
891    /// is necessary. If no cast is necessary Ok(false) is returned, and an
892    /// error is returned if the types are incompatible.
893    pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
894        match (self, schema_type) {
895            (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
896            #[cfg(feature = "dtype-array")]
897            (DataType::Array(l, sl), DataType::Array(r, sr)) => {
898                Ok(l.matches_schema_type(r)? && sl == sr)
899            },
900            #[cfg(feature = "dtype-struct")]
901            (DataType::Struct(l), DataType::Struct(r)) => {
902                let mut must_cast = false;
903                for (l, r) in l.iter().zip(r.iter()) {
904                    must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
905                }
906                Ok(must_cast)
907            },
908            (DataType::Null, DataType::Null) => Ok(false),
909            #[cfg(feature = "dtype-decimal")]
910            (DataType::Decimal(_, s1), DataType::Decimal(_, s2)) => Ok(s1 != s2),
911            // We don't allow the other way around, only if our current type is
912            // null and the schema isn't we allow it.
913            (DataType::Null, _) => Ok(true),
914            (l, r) if l == r => Ok(false),
915            (l, r) => {
916                polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
917            },
918        }
919    }
920}
921
922impl Display for DataType {
923    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
924        let s = match self {
925            DataType::Null => "null",
926            DataType::Boolean => "bool",
927            DataType::UInt8 => "u8",
928            DataType::UInt16 => "u16",
929            DataType::UInt32 => "u32",
930            DataType::UInt64 => "u64",
931            DataType::Int8 => "i8",
932            DataType::Int16 => "i16",
933            DataType::Int32 => "i32",
934            DataType::Int64 => "i64",
935            DataType::Int128 => "i128",
936            DataType::Float32 => "f32",
937            DataType::Float64 => "f64",
938            #[cfg(feature = "dtype-decimal")]
939            DataType::Decimal(precision, scale) => {
940                return match (precision, scale) {
941                    (Some(precision), Some(scale)) => {
942                        f.write_str(&format!("decimal[{precision},{scale}]"))
943                    },
944                    (None, Some(scale)) => f.write_str(&format!("decimal[*,{scale}]")),
945                    _ => f.write_str("decimal[?]"), // shouldn't happen
946                };
947            },
948            DataType::String => "str",
949            DataType::Binary => "binary",
950            DataType::Date => "date",
951            DataType::Datetime(tu, tz) => {
952                let s = match tz {
953                    None => format!("datetime[{tu}]"),
954                    Some(tz) => format!("datetime[{tu}, {tz}]"),
955                };
956                return f.write_str(&s);
957            },
958            DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
959            DataType::Time => "time",
960            #[cfg(feature = "dtype-array")]
961            DataType::Array(_, _) => {
962                let tp = self.array_leaf_dtype().unwrap();
963
964                let dims = self.get_shape().unwrap();
965                let shape = if dims.len() == 1 {
966                    format!("{}", dims[0])
967                } else {
968                    format_tuple!(dims)
969                };
970                return write!(f, "array[{tp}, {}]", shape);
971            },
972            DataType::List(tp) => return write!(f, "list[{tp}]"),
973            #[cfg(feature = "object")]
974            DataType::Object(s) => s,
975            #[cfg(feature = "dtype-categorical")]
976            DataType::Categorical(_, _) => "cat",
977            #[cfg(feature = "dtype-categorical")]
978            DataType::Enum(_, _) => "enum",
979            #[cfg(feature = "dtype-struct")]
980            DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
981            DataType::Unknown(kind) => match kind {
982                UnknownKind::Any => "unknown",
983                UnknownKind::Int(_) => "dyn int",
984                UnknownKind::Float => "dyn float",
985                UnknownKind::Str => "dyn str",
986            },
987            DataType::BinaryOffset => "binary[offset]",
988        };
989        f.write_str(s)
990    }
991}
992
993pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
994    use DataType::*;
995    Ok(match (left, right) {
996        #[cfg(feature = "dtype-categorical")]
997        (Categorical(Some(rev_map_l), ordering), Categorical(Some(rev_map_r), _)) => {
998            match (&**rev_map_l, &**rev_map_r) {
999                (RevMapping::Global(_, _, idl), RevMapping::Global(_, _, idr)) if idl == idr => {
1000                    let mut merger = GlobalRevMapMerger::new(rev_map_l.clone());
1001                    merger.merge_map(rev_map_r)?;
1002                    Categorical(Some(merger.finish()), *ordering)
1003                },
1004                (RevMapping::Local(_, idl), RevMapping::Local(_, idr)) if idl == idr => {
1005                    left.clone()
1006                },
1007                _ => polars_bail!(string_cache_mismatch),
1008            }
1009        },
1010        #[cfg(feature = "dtype-categorical")]
1011        (Enum(Some(rev_map_l), _), Enum(Some(rev_map_r), _)) => {
1012            match (&**rev_map_l, &**rev_map_r) {
1013                (RevMapping::Local(_, idl), RevMapping::Local(_, idr)) if idl == idr => {
1014                    left.clone()
1015                },
1016                _ => polars_bail!(ComputeError: "can not combine with different categories"),
1017            }
1018        },
1019        (List(inner_l), List(inner_r)) => {
1020            let merged = merge_dtypes(inner_l, inner_r)?;
1021            List(Box::new(merged))
1022        },
1023        #[cfg(feature = "dtype-struct")]
1024        (Struct(inner_l), Struct(inner_r)) => {
1025            polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1026            let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1027                polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1028                let merged = merge_dtypes(l.dtype(), r.dtype())?;
1029                Ok(Field::new(l.name().clone(), merged))
1030            }).collect::<PolarsResult<Vec<_>>>()?;
1031            Struct(fields)
1032        },
1033        #[cfg(feature = "dtype-array")]
1034        (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1035            polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1036            let merged = merge_dtypes(inner_l, inner_r)?;
1037            Array(Box::new(merged), *width_l)
1038        },
1039        (left, right) if left == right => left.clone(),
1040        _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1041    })
1042}
1043
1044fn collect_nested_types(
1045    dtype: &DataType,
1046    result: &mut PlHashSet<DataType>,
1047    include_compound_types: bool,
1048) {
1049    match dtype {
1050        DataType::List(inner) => {
1051            if include_compound_types {
1052                result.insert(dtype.clone());
1053            }
1054            collect_nested_types(inner, result, include_compound_types);
1055        },
1056        #[cfg(feature = "dtype-array")]
1057        DataType::Array(inner, _) => {
1058            if include_compound_types {
1059                result.insert(dtype.clone());
1060            }
1061            collect_nested_types(inner, result, include_compound_types);
1062        },
1063        #[cfg(feature = "dtype-struct")]
1064        DataType::Struct(fields) => {
1065            if include_compound_types {
1066                result.insert(dtype.clone());
1067            }
1068            for field in fields {
1069                collect_nested_types(field.dtype(), result, include_compound_types);
1070            }
1071        },
1072        _ => {
1073            result.insert(dtype.clone());
1074        },
1075    }
1076}
1077
1078pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1079    let mut result = PlHashSet::new();
1080    collect_nested_types(dtype, &mut result, include_compound_types);
1081    result
1082}
1083
/// Builds a `DataType::Enum` whose categories are given by `categories`.
///
/// The categories are turned into a local reverse mapping via
/// `RevMapping::build_local`; the second `Enum` component is left at its
/// default value.
#[cfg(feature = "dtype-categorical")]
pub fn create_enum_dtype(categories: Utf8ViewArray) -> DataType {
    let local_mapping = Arc::new(RevMapping::build_local(categories));
    DataType::Enum(Some(local_mapping), Default::default())
}
1089
1090#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
1091#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
1092pub struct CompatLevel(pub(crate) u16);
1093
1094impl CompatLevel {
1095    pub const fn newest() -> CompatLevel {
1096        CompatLevel(1)
1097    }
1098
1099    pub const fn oldest() -> CompatLevel {
1100        CompatLevel(0)
1101    }
1102
1103    // The following methods are only used internally
1104
1105    #[doc(hidden)]
1106    pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1107        if level > CompatLevel::newest().0 {
1108            polars_bail!(InvalidOperation: "invalid compat level");
1109        }
1110        Ok(CompatLevel(level))
1111    }
1112
1113    #[doc(hidden)]
1114    pub fn get_level(&self) -> u16 {
1115        self.0
1116    }
1117}
1118
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a list of width-10 `Float64` arrays and returns `(list, array)`.
    #[cfg(feature = "dtype-array")]
    fn nested_fixture() -> (DataType, DataType) {
        let array_type = DataType::Array(Box::new(DataType::Float64), 10);
        let list_type = DataType::List(Box::new(array_type.clone()));
        (list_type, array_type)
    }

    /// Without compound types only the innermost dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (list_type, _array_type) = nested_fixture();

        let expected: PlHashSet<DataType> = std::iter::once(DataType::Float64).collect();

        assert_eq!(unpack_dtypes(&list_type, false), expected)
    }

    /// With compound types every nesting level is reported as well.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (list_type, array_type) = nested_fixture();

        let result = unpack_dtypes(&list_type, true);

        let expected: PlHashSet<DataType> = [list_type, array_type, DataType::Float64]
            .into_iter()
            .collect();

        assert_eq!(result, expected)
    }
}