1use std::collections::BTreeMap;
2
3use arrow::datatypes::{
4 DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, Metadata,
5};
6#[cfg(feature = "dtype-array")]
7use polars_utils::format_tuple;
8use polars_utils::itertools::Itertools;
9#[cfg(any(feature = "serde-lazy", feature = "serde"))]
10use serde::{Deserialize, Serialize};
11pub use temporal::time_zone::TimeZone;
12
13use super::*;
14#[cfg(feature = "object")]
15use crate::chunked_array::object::registry::get_object_physical_type;
16use crate::utils::materialize_dyn_int;
17
// Metadata value stored under `PL_KEY`; `MetaDataExt::maintain_type` compares against it.
static MAINTAIN_PL_TYPE: &str = "maintain_type";
// Metadata key that `DataType::to_arrow_field` writes for `BinaryOffset` fields.
static PL_KEY: &str = "pl";
20
21pub trait MetaDataExt: IntoMetadata {
22 fn pl_enum_metadata(&self) -> Option<&str> {
23 let md = self.into_metadata_ref();
24 let values = md
25 .get(DTYPE_ENUM_VALUES_NEW)
26 .or_else(|| md.get(DTYPE_ENUM_VALUES_LEGACY));
27 Some(values?.as_str())
28 }
29
30 fn pl_categorical_metadata(&self) -> Option<&str> {
31 Some(
36 self.into_metadata_ref()
37 .get(DTYPE_CATEGORICAL_NEW)?
38 .as_str(),
39 )
40 }
41
42 fn maintain_type(&self) -> bool {
43 let metadata = self.into_metadata_ref();
44 metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
45 }
46}
47
// `Metadata` relies entirely on the trait's default method implementations.
impl MetaDataExt for Metadata {}
/// Gives borrowed access to an underlying [`Metadata`] map.
pub trait IntoMetadata {
    /// Returns a reference to the metadata.
    // The method only borrows `self` despite its `into_` prefix, hence the lint exemption.
    #[allow(clippy::wrong_self_convention)]
    fn into_metadata_ref(&self) -> &Metadata;
}
53
impl IntoMetadata for Metadata {
    /// A `Metadata` value is its own metadata: returns `self` unchanged.
    fn into_metadata_ref(&self) -> &Metadata {
        self
    }
}
59
/// The flavour of a not-yet-resolved [`DataType::Unknown`].
///
/// See [`UnknownKind::materialize`] for how each kind resolves to a concrete dtype.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnknownKind {
    /// Displayed as `unknown ufunc`; never materializes to a concrete dtype.
    Ufunc,
    /// Dynamic integer. The carried value is handed to `materialize_dyn_int`
    /// to pick the concrete integer dtype.
    Int(i128),
    /// Dynamic float; materializes to `Float64`.
    Float,
    /// Dynamic string; materializes to `String`.
    Str,
    /// Fully unknown (the default); never materializes to a concrete dtype.
    #[default]
    Any,
}
76
77impl UnknownKind {
78 pub fn materialize(&self) -> Option<DataType> {
79 let dtype = match self {
80 UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
81 UnknownKind::Float => DataType::Float64,
82 UnknownKind::Str => DataType::String,
83 UnknownKind::Any | UnknownKind::Ufunc => return None,
84 };
85 Some(dtype)
86 }
87}
88
/// The data types handled by this module.
///
/// `PartialEq`, `Eq` and `Hash` are implemented by hand for this type rather than derived.
#[derive(Clone, Debug)]
pub enum DataType {
    /// Boolean values.
    Boolean,
    /// Unsigned 8-bit integer.
    UInt8,
    /// Unsigned 16-bit integer.
    UInt16,
    /// Unsigned 32-bit integer.
    UInt32,
    /// Unsigned 64-bit integer.
    UInt64,
    /// Signed 8-bit integer.
    Int8,
    /// Signed 16-bit integer.
    Int16,
    /// Signed 32-bit integer.
    Int32,
    /// Signed 64-bit integer.
    Int64,
    /// Signed 128-bit integer.
    Int128,
    /// 32-bit float.
    Float32,
    /// 64-bit float.
    Float64,
    /// Decimal with optional `(precision, scale)`; physically an `Int128`.
    #[cfg(feature = "dtype-decimal")]
    Decimal(Option<usize>, Option<usize>),
    /// String data.
    String,
    /// Binary data.
    Binary,
    /// Binary data that always converts to Arrow `LargeBinary`.
    BinaryOffset,
    /// Date; physically an `Int32`.
    Date,
    /// Datetime with a time unit and optional time zone; physically an `Int64`.
    Datetime(TimeUnit, Option<TimeZone>),
    /// Duration with a time unit; physically an `Int64`.
    Duration(TimeUnit),
    /// Time of day; physically an `Int64`, converted to Arrow as nanosecond `Time64`.
    Time,
    /// Fixed-width nested array: `(inner dtype, width)`.
    #[cfg(feature = "dtype-array")]
    Array(Box<DataType>, usize),
    /// Nested list of the given inner dtype.
    List(Box<DataType>),
    /// Opaque object type; the string is what `Display` prints for it.
    #[cfg(feature = "object")]
    Object(&'static str),
    /// The null type.
    Null,
    /// Categorical backed by shared `Categories` and their mapping.
    /// Two categoricals compare equal only if they share the same `Arc<Categories>`.
    #[cfg(feature = "dtype-categorical")]
    Categorical(Arc<Categories>, Arc<CategoricalMapping>),
    /// Enum backed by `FrozenCategories` and their mapping.
    /// Two enums compare equal only if they share the same `Arc<FrozenCategories>`.
    #[cfg(feature = "dtype-categorical")]
    Enum(Arc<FrozenCategories>, Arc<CategoricalMapping>),
    /// Struct made of named, typed fields.
    #[cfg(feature = "dtype-struct")]
    Struct(Vec<Field>),
    /// A dtype that is not resolved yet; see [`UnknownKind`].
    Unknown(UnknownKind),
}
142
143impl Default for DataType {
144 fn default() -> Self {
145 DataType::Unknown(UnknownKind::Any)
146 }
147}
148
/// Implemented by values that can hand out a reference to a [`DataType`].
pub trait AsRefDataType {
    /// Borrows the dtype.
    fn as_ref_dtype(&self) -> &DataType;
}
152
153impl Hash for DataType {
154 fn hash<H: Hasher>(&self, state: &mut H) {
155 std::mem::discriminant(self).hash(state)
156 }
157}
158
159impl PartialEq for DataType {
160 fn eq(&self, other: &Self) -> bool {
161 use DataType::*;
162 {
163 match (self, other) {
164 #[cfg(feature = "dtype-categorical")]
165 (Categorical(cats_l, _), Categorical(cats_r, _)) => Arc::ptr_eq(cats_l, cats_r),
166 #[cfg(feature = "dtype-categorical")]
167 (Enum(fcats_l, _), Enum(fcats_r, _)) => Arc::ptr_eq(fcats_l, fcats_r),
168 (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
169 (List(left_inner), List(right_inner)) => left_inner == right_inner,
170 #[cfg(feature = "dtype-duration")]
171 (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
172 #[cfg(feature = "dtype-decimal")]
173 (Decimal(l_prec, l_scale), Decimal(r_prec, r_scale)) => {
174 let is_prec_eq = l_prec.is_none() || r_prec.is_none() || l_prec == r_prec;
175 let is_scale_eq = l_scale.is_none() || r_scale.is_none() || l_scale == r_scale;
176
177 is_prec_eq && is_scale_eq
178 },
179 #[cfg(feature = "object")]
180 (Object(lhs), Object(rhs)) => lhs == rhs,
181 #[cfg(feature = "dtype-struct")]
182 (Struct(lhs), Struct(rhs)) => {
183 std::ptr::eq(Vec::as_ptr(lhs), Vec::as_ptr(rhs)) || lhs == rhs
184 },
185 #[cfg(feature = "dtype-array")]
186 (Array(left_inner, left_width), Array(right_inner, right_width)) => {
187 left_width == right_width && left_inner == right_inner
188 },
189 (Unknown(l), Unknown(r)) => match (l, r) {
190 (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
191 _ => l == r,
192 },
193 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
194 }
195 }
196 }
197}
198
// Marker impl; the comparison itself is defined by the manual `PartialEq` impl.
impl Eq for DataType {}
200
201impl DataType {
202 pub const IDX_DTYPE: Self = {
203 #[cfg(not(feature = "bigidx"))]
204 {
205 DataType::UInt32
206 }
207 #[cfg(feature = "bigidx")]
208 {
209 DataType::UInt64
210 }
211 };
212
213 pub fn value_within_range(&self, other: AnyValue) -> bool {
214 use DataType::*;
215 match self {
216 UInt8 => other.extract::<u8>().is_some(),
217 #[cfg(feature = "dtype-u16")]
218 UInt16 => other.extract::<u16>().is_some(),
219 UInt32 => other.extract::<u32>().is_some(),
220 UInt64 => other.extract::<u64>().is_some(),
221 #[cfg(feature = "dtype-i8")]
222 Int8 => other.extract::<i8>().is_some(),
223 #[cfg(feature = "dtype-i16")]
224 Int16 => other.extract::<i16>().is_some(),
225 Int32 => other.extract::<i32>().is_some(),
226 Int64 => other.extract::<i64>().is_some(),
227 _ => false,
228 }
229 }
230
231 pub fn is_known(&self) -> bool {
233 match self {
234 DataType::List(inner) => inner.is_known(),
235 #[cfg(feature = "dtype-array")]
236 DataType::Array(inner, _) => inner.is_known(),
237 #[cfg(feature = "dtype-struct")]
238 DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
239 DataType::Unknown(_) => false,
240 _ => true,
241 }
242 }
243
244 pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
247 match self {
248 DataType::Unknown(u) => match u.materialize() {
249 Some(known) => Ok(known),
250 None => {
251 if allow_unknown {
252 Ok(DataType::Unknown(u))
253 } else {
254 polars_bail!(SchemaMismatch: "failed to materialize unknown type")
255 }
256 },
257 },
258 DataType::List(inner) => Ok(DataType::List(Box::new(
259 inner.materialize_unknown(allow_unknown)?,
260 ))),
261 #[cfg(feature = "dtype-array")]
262 DataType::Array(inner, size) => Ok(DataType::Array(
263 Box::new(inner.materialize_unknown(allow_unknown)?),
264 size,
265 )),
266 #[cfg(feature = "dtype-struct")]
267 DataType::Struct(fields) => Ok(DataType::Struct(
268 fields
269 .into_iter()
270 .map(|f| {
271 PolarsResult::Ok(Field::new(
272 f.name,
273 f.dtype.materialize_unknown(allow_unknown)?,
274 ))
275 })
276 .try_collect_vec()?,
277 )),
278 _ => Ok(self),
279 }
280 }
281
282 #[cfg(feature = "dtype-array")]
283 pub fn get_shape(&self) -> Option<Vec<usize>> {
285 fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
286 if let DataType::Array(inner, size) = dt {
287 shape.push(*size);
288 get_shape_impl(inner, shape);
289 }
290 }
291
292 if let DataType::Array(inner, size) = self {
293 let mut shape = vec![*size];
294 get_shape_impl(inner, &mut shape);
295 Some(shape)
296 } else {
297 None
298 }
299 }
300
301 pub fn inner_dtype(&self) -> Option<&DataType> {
303 match self {
304 DataType::List(inner) => Some(inner),
305 #[cfg(feature = "dtype-array")]
306 DataType::Array(inner, _) => Some(inner),
307 _ => None,
308 }
309 }
310
311 pub fn leaf_dtype(&self) -> &DataType {
313 let mut prev = self;
314 while let Some(dtype) = prev.inner_dtype() {
315 prev = dtype
316 }
317 prev
318 }
319
320 #[cfg(feature = "dtype-array")]
321 pub fn array_leaf_dtype(&self) -> Option<&DataType> {
323 let mut prev = self;
324 match prev {
325 DataType::Array(_, _) => {
326 while let DataType::Array(inner, _) = &prev {
327 prev = inner;
328 }
329 Some(prev)
330 },
331 _ => None,
332 }
333 }
334
335 pub fn cast_leaf(&self, to: DataType) -> DataType {
337 use DataType::*;
338 match self {
339 List(inner) => List(Box::new(inner.cast_leaf(to))),
340 #[cfg(feature = "dtype-array")]
341 Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
342 _ => to,
343 }
344 }
345
346 pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
350 if self == to {
351 return Some(true);
352 }
353 if self.is_primitive_numeric() && to.is_primitive_numeric() {
354 return Some(true);
355 }
356
357 if self.is_null() {
358 return Some(true);
359 }
360
361 use DataType as D;
362 Some(match (self, to) {
363 #[cfg(feature = "dtype-categorical")]
364 (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
365 | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false, #[cfg(feature = "object")]
368 (D::Object(_), D::Object(_)) => true,
369 #[cfg(feature = "object")]
370 (D::Object(_), _) | (_, D::Object(_)) => false,
371
372 (D::Boolean, dt) | (dt, D::Boolean) => match dt {
373 dt if dt.is_primitive_numeric() => true,
374 #[cfg(feature = "dtype-decimal")]
375 D::Decimal(_, _) => true,
376 D::String | D::Binary => true,
377 _ => false,
378 },
379
380 (D::List(from), D::List(to)) => from.can_cast_to(to)?,
381 #[cfg(feature = "dtype-array")]
382 (D::Array(from, l_width), D::Array(to, r_width)) => {
383 l_width == r_width && from.can_cast_to(to)?
384 },
385 #[cfg(feature = "dtype-struct")]
386 (D::Struct(l_fields), D::Struct(r_fields)) => {
387 if l_fields.is_empty() {
388 return Some(true);
389 }
390
391 if l_fields.len() != r_fields.len() {
392 return Some(false);
393 }
394
395 for (l, r) in l_fields.iter().zip(r_fields) {
396 if !l.dtype().can_cast_to(r.dtype())? {
397 return Some(false);
398 }
399 }
400
401 true
402 },
403
404 _ => return None,
406 })
407 }
408
409 pub fn implode(self) -> DataType {
410 DataType::List(Box::new(self))
411 }
412
413 #[must_use]
415 pub fn to_physical(&self) -> DataType {
416 use DataType::*;
417 match self {
418 Date => Int32,
419 Datetime(_, _) => Int64,
420 Duration(_) => Int64,
421 Time => Int64,
422 #[cfg(feature = "dtype-decimal")]
423 Decimal(_, _) => Int128,
424 #[cfg(feature = "dtype-categorical")]
425 Categorical(cats, _) => cats.physical().dtype(),
426 #[cfg(feature = "dtype-categorical")]
427 Enum(fcats, _) => fcats.physical().dtype(),
428 #[cfg(feature = "dtype-array")]
429 Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
430 List(dt) => List(Box::new(dt.to_physical())),
431 #[cfg(feature = "dtype-struct")]
432 Struct(fields) => {
433 let new_fields = fields
434 .iter()
435 .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
436 .collect();
437 Struct(new_fields)
438 },
439 _ => self.clone(),
440 }
441 }
442
443 pub fn is_supported_list_arithmetic_input(&self) -> bool {
444 self.is_primitive_numeric() || self.is_bool() || self.is_null()
445 }
446
447 pub fn is_logical(&self) -> bool {
449 self != &self.to_physical()
450 }
451
452 pub fn is_temporal(&self) -> bool {
454 use DataType::*;
455 matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
456 }
457
458 pub fn is_primitive(&self) -> bool {
461 self.is_primitive_numeric()
462 | matches!(
463 self,
464 DataType::Boolean | DataType::String | DataType::Binary
465 )
466 }
467
468 pub fn is_primitive_numeric(&self) -> bool {
470 self.is_float() || self.is_integer()
471 }
472
473 pub fn is_bool(&self) -> bool {
475 matches!(self, DataType::Boolean)
476 }
477
478 pub fn is_list(&self) -> bool {
480 matches!(self, DataType::List(_))
481 }
482
483 pub fn is_array(&self) -> bool {
485 #[cfg(feature = "dtype-array")]
486 {
487 matches!(self, DataType::Array(_, _))
488 }
489 #[cfg(not(feature = "dtype-array"))]
490 {
491 false
492 }
493 }
494
495 pub fn is_nested(&self) -> bool {
496 self.is_list() || self.is_struct() || self.is_array()
497 }
498
499 pub fn is_struct(&self) -> bool {
501 #[cfg(feature = "dtype-struct")]
502 {
503 matches!(self, DataType::Struct(_))
504 }
505 #[cfg(not(feature = "dtype-struct"))]
506 {
507 false
508 }
509 }
510
511 pub fn is_binary(&self) -> bool {
512 matches!(self, DataType::Binary)
513 }
514
515 pub fn is_date(&self) -> bool {
516 matches!(self, DataType::Date)
517 }
518 pub fn is_datetime(&self) -> bool {
519 matches!(self, DataType::Datetime(..))
520 }
521
522 pub fn is_duration(&self) -> bool {
523 matches!(self, DataType::Duration(..))
524 }
525
526 pub fn is_object(&self) -> bool {
527 #[cfg(feature = "object")]
528 {
529 matches!(self, DataType::Object(_))
530 }
531 #[cfg(not(feature = "object"))]
532 {
533 false
534 }
535 }
536
537 pub fn is_null(&self) -> bool {
538 matches!(self, DataType::Null)
539 }
540
541 pub fn contains_views(&self) -> bool {
542 use DataType::*;
543 match self {
544 Binary | String => true,
545 List(inner) => inner.contains_views(),
546 #[cfg(feature = "dtype-array")]
547 Array(inner, _) => inner.contains_views(),
548 #[cfg(feature = "dtype-struct")]
549 Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
550 _ => false,
551 }
552 }
553
554 pub fn contains_categoricals(&self) -> bool {
555 use DataType::*;
556 match self {
557 #[cfg(feature = "dtype-categorical")]
558 Categorical(_, _) | Enum(_, _) => true,
559 List(inner) => inner.contains_categoricals(),
560 #[cfg(feature = "dtype-array")]
561 Array(inner, _) => inner.contains_categoricals(),
562 #[cfg(feature = "dtype-struct")]
563 Struct(fields) => fields
564 .iter()
565 .any(|field| field.dtype.contains_categoricals()),
566 _ => false,
567 }
568 }
569
570 pub fn contains_objects(&self) -> bool {
571 use DataType::*;
572 match self {
573 #[cfg(feature = "object")]
574 Object(_) => true,
575 List(inner) => inner.contains_objects(),
576 #[cfg(feature = "dtype-array")]
577 Array(inner, _) => inner.contains_objects(),
578 #[cfg(feature = "dtype-struct")]
579 Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
580 _ => false,
581 }
582 }
583
584 pub fn contains_list_recursive(&self) -> bool {
585 use DataType as D;
586 match self {
587 D::List(_) => true,
588 #[cfg(feature = "dtype-array")]
589 D::Array(inner, _) => inner.contains_list_recursive(),
590 #[cfg(feature = "dtype-struct")]
591 D::Struct(fields) => fields
592 .iter()
593 .any(|field| field.dtype.contains_list_recursive()),
594 _ => false,
595 }
596 }
597
598 pub fn contains_unknown(&self) -> bool {
599 use DataType as D;
600 match self {
601 D::Unknown(_) => true,
602 D::List(inner) => inner.contains_unknown(),
603 #[cfg(feature = "dtype-array")]
604 D::Array(inner, _) => inner.contains_unknown(),
605 #[cfg(feature = "dtype-struct")]
606 D::Struct(fields) => fields.iter().any(|field| field.dtype.contains_unknown()),
607 _ => false,
608 }
609 }
610
611 pub fn is_ord(&self) -> bool {
613 let phys = self.to_physical();
614 phys.is_primitive_numeric()
615 || self.is_decimal()
616 || matches!(
617 phys,
618 DataType::Binary | DataType::String | DataType::Boolean
619 )
620 }
621
622 pub fn is_decimal(&self) -> bool {
624 match self {
625 #[cfg(feature = "dtype-decimal")]
626 DataType::Decimal(_, _) => true,
627 _ => false,
628 }
629 }
630
631 pub fn is_float(&self) -> bool {
634 matches!(
635 self,
636 DataType::Float32 | DataType::Float64 | DataType::Unknown(UnknownKind::Float)
637 )
638 }
639
640 pub fn is_integer(&self) -> bool {
642 matches!(
643 self,
644 DataType::Int8
645 | DataType::Int16
646 | DataType::Int32
647 | DataType::Int64
648 | DataType::Int128
649 | DataType::UInt8
650 | DataType::UInt16
651 | DataType::UInt32
652 | DataType::UInt64
653 | DataType::Unknown(UnknownKind::Int(_))
654 )
655 }
656
657 pub fn is_signed_integer(&self) -> bool {
658 matches!(
660 self,
661 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
662 )
663 }
664
665 pub fn is_unsigned_integer(&self) -> bool {
666 matches!(
667 self,
668 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64,
669 )
670 }
671
672 pub fn is_string(&self) -> bool {
673 matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
674 }
675
676 pub fn is_categorical(&self) -> bool {
677 #[cfg(feature = "dtype-categorical")]
678 {
679 matches!(self, DataType::Categorical(_, _))
680 }
681 #[cfg(not(feature = "dtype-categorical"))]
682 {
683 false
684 }
685 }
686
687 pub fn is_enum(&self) -> bool {
688 #[cfg(feature = "dtype-categorical")]
689 {
690 matches!(self, DataType::Enum(_, _))
691 }
692 #[cfg(not(feature = "dtype-categorical"))]
693 {
694 false
695 }
696 }
697
698 pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
700 let metadata = match self {
701 #[cfg(feature = "dtype-categorical")]
702 DataType::Enum(fcats, _map) => {
703 let cats = fcats.categories();
704 let strings_size: usize = cats
705 .values_iter()
706 .map(|s| (s.len() + 1).ilog10() as usize + 1 + s.len())
707 .sum();
708 let mut encoded = String::with_capacity(strings_size);
709 for cat in cats.values_iter() {
710 encoded.push_str(itoa::Buffer::new().format(cat.len()));
711 encoded.push(';');
712 encoded.push_str(cat);
713 }
714 Some(BTreeMap::from([(
715 PlSmallStr::from_static(DTYPE_ENUM_VALUES_NEW),
716 PlSmallStr::from_string(encoded),
717 )]))
718 },
719 #[cfg(feature = "dtype-categorical")]
720 DataType::Categorical(cats, _) => {
721 let mut encoded = String::new();
722 encoded.push_str(itoa::Buffer::new().format(cats.name().len()));
723 encoded.push(';');
724 encoded.push_str(cats.name());
725 encoded.push_str(itoa::Buffer::new().format(cats.namespace().len()));
726 encoded.push(';');
727 encoded.push_str(cats.namespace());
728 encoded.push_str(cats.physical().as_str());
729 encoded.push(';');
730
731 Some(BTreeMap::from([(
732 PlSmallStr::from_static(DTYPE_CATEGORICAL_NEW),
733 PlSmallStr::from_string(encoded),
734 )]))
735 },
736 DataType::BinaryOffset => Some(BTreeMap::from([(
737 PlSmallStr::from_static(PL_KEY),
738 PlSmallStr::from_static(MAINTAIN_PL_TYPE),
739 )])),
740 _ => None,
741 };
742
743 let field = ArrowField::new(name, self.to_arrow(compat_level), true);
744
745 if let Some(metadata) = metadata {
746 field.with_metadata(metadata)
747 } else {
748 field
749 }
750 }
751
752 pub fn max(&self) -> PolarsResult<Scalar> {
754 use DataType::*;
755 let v = match self {
756 Int8 => Scalar::from(i8::MAX),
757 Int16 => Scalar::from(i16::MAX),
758 Int32 => Scalar::from(i32::MAX),
759 Int64 => Scalar::from(i64::MAX),
760 Int128 => Scalar::from(i128::MAX),
761 UInt8 => Scalar::from(u8::MAX),
762 UInt16 => Scalar::from(u16::MAX),
763 UInt32 => Scalar::from(u32::MAX),
764 UInt64 => Scalar::from(u64::MAX),
765 Float32 => Scalar::from(f32::INFINITY),
766 Float64 => Scalar::from(f64::INFINITY),
767 #[cfg(feature = "dtype-time")]
768 Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
769 dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
770 };
771 Ok(v)
772 }
773
774 pub fn min(&self) -> PolarsResult<Scalar> {
776 use DataType::*;
777 let v = match self {
778 Int8 => Scalar::from(i8::MIN),
779 Int16 => Scalar::from(i16::MIN),
780 Int32 => Scalar::from(i32::MIN),
781 Int64 => Scalar::from(i64::MIN),
782 Int128 => Scalar::from(i128::MIN),
783 UInt8 => Scalar::from(u8::MIN),
784 UInt16 => Scalar::from(u16::MIN),
785 UInt32 => Scalar::from(u32::MIN),
786 UInt64 => Scalar::from(u64::MIN),
787 Float32 => Scalar::from(f32::NEG_INFINITY),
788 Float64 => Scalar::from(f64::NEG_INFINITY),
789 #[cfg(feature = "dtype-time")]
790 Time => Scalar::new(Time, AnyValue::Time(0)),
791 dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
792 };
793 Ok(v)
794 }
795
796 #[inline]
798 pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
799 self.try_to_arrow(compat_level).unwrap()
800 }
801
802 #[inline]
803 pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
804 use DataType::*;
805 match self {
806 Boolean => Ok(ArrowDataType::Boolean),
807 UInt8 => Ok(ArrowDataType::UInt8),
808 UInt16 => Ok(ArrowDataType::UInt16),
809 UInt32 => Ok(ArrowDataType::UInt32),
810 UInt64 => Ok(ArrowDataType::UInt64),
811 Int8 => Ok(ArrowDataType::Int8),
812 Int16 => Ok(ArrowDataType::Int16),
813 Int32 => Ok(ArrowDataType::Int32),
814 Int64 => Ok(ArrowDataType::Int64),
815 Int128 => Ok(ArrowDataType::Int128),
816 Float32 => Ok(ArrowDataType::Float32),
817 Float64 => Ok(ArrowDataType::Float64),
818 #[cfg(feature = "dtype-decimal")]
819 Decimal(precision, scale) => {
820 let precision = (*precision).unwrap_or(38);
821 polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1");
822
823 Ok(ArrowDataType::Decimal(
824 precision,
825 scale.unwrap_or(0), ))
827 },
828 String => {
829 let dt = if compat_level.0 >= 1 {
830 ArrowDataType::Utf8View
831 } else {
832 ArrowDataType::LargeUtf8
833 };
834 Ok(dt)
835 },
836 Binary => {
837 let dt = if compat_level.0 >= 1 {
838 ArrowDataType::BinaryView
839 } else {
840 ArrowDataType::LargeBinary
841 };
842 Ok(dt)
843 },
844 Date => Ok(ArrowDataType::Date32),
845 Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(
846 unit.to_arrow(),
847 tz.as_deref().cloned(),
848 )),
849 Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
850 Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
851 #[cfg(feature = "dtype-array")]
852 Array(dt, size) => Ok(dt
853 .try_to_arrow(compat_level)?
854 .to_fixed_size_list(*size, true)),
855 List(dt) => Ok(ArrowDataType::LargeList(Box::new(
856 dt.to_arrow_field(LIST_VALUES_NAME, compat_level),
857 ))),
858 Null => Ok(ArrowDataType::Null),
859 #[cfg(feature = "object")]
860 Object(_) => Ok(get_object_physical_type()),
861 #[cfg(feature = "dtype-categorical")]
862 Categorical(_, _) | Enum(_, _) => {
863 let arrow_phys = match self.cat_physical().unwrap() {
864 CategoricalPhysical::U8 => IntegerType::UInt8,
865 CategoricalPhysical::U16 => IntegerType::UInt16,
866 CategoricalPhysical::U32 => IntegerType::UInt32,
867 };
868
869 let values = if compat_level.0 >= 1 {
870 ArrowDataType::Utf8View
871 } else {
872 ArrowDataType::LargeUtf8
873 };
874
875 Ok(ArrowDataType::Dictionary(
876 arrow_phys,
877 Box::new(values),
878 false,
879 ))
880 },
881 #[cfg(feature = "dtype-struct")]
882 Struct(fields) => {
883 let fields = fields
884 .iter()
885 .map(|fld| fld.to_arrow(compat_level))
886 .collect();
887 Ok(ArrowDataType::Struct(fields))
888 },
889 BinaryOffset => Ok(ArrowDataType::LargeBinary),
890 Unknown(kind) => {
891 let dt = match kind {
892 UnknownKind::Any | UnknownKind::Ufunc => ArrowDataType::Unknown,
893 UnknownKind::Float => ArrowDataType::Float64,
894 UnknownKind::Str => ArrowDataType::Utf8View,
895 UnknownKind::Int(v) => {
896 return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
897 },
898 };
899 Ok(dt)
900 },
901 }
902 }
903
904 pub fn is_nested_null(&self) -> bool {
905 use DataType::*;
906 match self {
907 Null => true,
908 List(field) => field.is_nested_null(),
909 #[cfg(feature = "dtype-array")]
910 Array(field, _) => field.is_nested_null(),
911 #[cfg(feature = "dtype-struct")]
912 Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
913 _ => false,
914 }
915 }
916
917 pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
924 match (self, schema_type) {
925 (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
926 #[cfg(feature = "dtype-array")]
927 (DataType::Array(l, sl), DataType::Array(r, sr)) => {
928 Ok(l.matches_schema_type(r)? && sl == sr)
929 },
930 #[cfg(feature = "dtype-struct")]
931 (DataType::Struct(l), DataType::Struct(r)) => {
932 if l.len() != r.len() {
933 polars_bail!(SchemaMismatch: "structs have different number of fields: {} vs {}", l.len(), r.len());
934 }
935 let mut must_cast = false;
936 for (l, r) in l.iter().zip(r.iter()) {
937 must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
938 }
939 Ok(must_cast)
940 },
941 (DataType::Null, DataType::Null) => Ok(false),
942 #[cfg(feature = "dtype-decimal")]
943 (DataType::Decimal(_, s1), DataType::Decimal(_, s2)) => Ok(s1 != s2),
944 (DataType::Null, _) => Ok(true),
947 #[cfg(feature = "dtype-categorical")]
948 (DataType::Categorical(l, _), DataType::Categorical(r, _)) => {
949 ensure_same_categories(l, r)?;
950 Ok(false)
951 },
952 #[cfg(feature = "dtype-categorical")]
953 (DataType::Enum(l, _), DataType::Enum(r, _)) => {
954 ensure_same_frozen_categories(l, r)?;
955 Ok(false)
956 },
957
958 (l, r) if l == r => Ok(false),
959 (l, r) => {
960 polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
961 },
962 }
963 }
964
965 #[inline]
966 pub fn is_unknown(&self) -> bool {
967 matches!(self, DataType::Unknown(_))
968 }
969
970 pub fn nesting_level(&self) -> usize {
971 let mut level = 0;
972 let mut slf = self;
973 while let Some(inner_dtype) = slf.inner_dtype() {
974 level += 1;
975 slf = inner_dtype;
976 }
977 level
978 }
979
980 #[cfg(feature = "dtype-categorical")]
982 pub fn cat_physical(&self) -> PolarsResult<CategoricalPhysical> {
983 match self {
984 DataType::Categorical(cats, _) => Ok(cats.physical()),
985 DataType::Enum(fcats, _) => Ok(fcats.physical()),
986 _ => {
987 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
988 },
989 }
990 }
991
992 #[cfg(feature = "dtype-categorical")]
994 pub fn cat_mapping(&self) -> PolarsResult<&Arc<CategoricalMapping>> {
995 match self {
996 DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => Ok(mapping),
997 _ => {
998 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
999 },
1000 }
1001 }
1002
1003 #[cfg(feature = "dtype-categorical")]
1004 pub fn from_categories(cats: Arc<Categories>) -> Self {
1005 let mapping = cats.mapping();
1006 Self::Categorical(cats, mapping)
1007 }
1008
1009 #[cfg(feature = "dtype-categorical")]
1010 pub fn from_frozen_categories(fcats: Arc<FrozenCategories>) -> Self {
1011 let mapping = fcats.mapping().clone();
1012 Self::Enum(fcats, mapping)
1013 }
1014
1015 pub fn is_numeric(&self) -> bool {
1016 self.is_integer() || self.is_float() || self.is_decimal()
1017 }
1018}
1019
1020impl Display for DataType {
1021 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1022 let s = match self {
1023 DataType::Null => "null",
1024 DataType::Boolean => "bool",
1025 DataType::UInt8 => "u8",
1026 DataType::UInt16 => "u16",
1027 DataType::UInt32 => "u32",
1028 DataType::UInt64 => "u64",
1029 DataType::Int8 => "i8",
1030 DataType::Int16 => "i16",
1031 DataType::Int32 => "i32",
1032 DataType::Int64 => "i64",
1033 DataType::Int128 => "i128",
1034 DataType::Float32 => "f32",
1035 DataType::Float64 => "f64",
1036 #[cfg(feature = "dtype-decimal")]
1037 DataType::Decimal(precision, scale) => {
1038 return match (precision, scale) {
1039 (Some(precision), Some(scale)) => {
1040 f.write_str(&format!("decimal[{precision},{scale}]"))
1041 },
1042 (None, Some(scale)) => f.write_str(&format!("decimal[*,{scale}]")),
1043 _ => f.write_str("decimal[?]"), };
1045 },
1046 DataType::String => "str",
1047 DataType::Binary => "binary",
1048 DataType::Date => "date",
1049 DataType::Datetime(tu, tz) => {
1050 let s = match tz {
1051 None => format!("datetime[{tu}]"),
1052 Some(tz) => format!("datetime[{tu}, {tz}]"),
1053 };
1054 return f.write_str(&s);
1055 },
1056 DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
1057 DataType::Time => "time",
1058 #[cfg(feature = "dtype-array")]
1059 DataType::Array(_, _) => {
1060 let tp = self.array_leaf_dtype().unwrap();
1061
1062 let dims = self.get_shape().unwrap();
1063 let shape = if dims.len() == 1 {
1064 format!("{}", dims[0])
1065 } else {
1066 format_tuple!(dims)
1067 };
1068 return write!(f, "array[{tp}, {shape}]");
1069 },
1070 DataType::List(tp) => return write!(f, "list[{tp}]"),
1071 #[cfg(feature = "object")]
1072 DataType::Object(s) => s,
1073 #[cfg(feature = "dtype-categorical")]
1074 DataType::Categorical(_, _) => "cat",
1075 #[cfg(feature = "dtype-categorical")]
1076 DataType::Enum(_, _) => "enum",
1077 #[cfg(feature = "dtype-struct")]
1078 DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
1079 DataType::Unknown(kind) => match kind {
1080 UnknownKind::Ufunc => "unknown ufunc",
1081 UnknownKind::Any => "unknown",
1082 UnknownKind::Int(_) => "dyn int",
1083 UnknownKind::Float => "dyn float",
1084 UnknownKind::Str => "dyn str",
1085 },
1086 DataType::BinaryOffset => "binary[offset]",
1087 };
1088 f.write_str(s)
1089 }
1090}
1091
1092pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
1093 use DataType::*;
1094 Ok(match (left, right) {
1095 #[cfg(feature = "dtype-categorical")]
1096 (Categorical(cats_l, map), Categorical(cats_r, _)) => {
1097 ensure_same_categories(cats_l, cats_r)?;
1098 Categorical(cats_l.clone(), map.clone())
1099 },
1100 #[cfg(feature = "dtype-categorical")]
1101 (Enum(fcats_l, map), Enum(fcats_r, _)) => {
1102 ensure_same_frozen_categories(fcats_l, fcats_r)?;
1103 Enum(fcats_l.clone(), map.clone())
1104 },
1105 (List(inner_l), List(inner_r)) => {
1106 let merged = merge_dtypes(inner_l, inner_r)?;
1107 List(Box::new(merged))
1108 },
1109 #[cfg(feature = "dtype-struct")]
1110 (Struct(inner_l), Struct(inner_r)) => {
1111 polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1112 let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1113 polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1114 let merged = merge_dtypes(l.dtype(), r.dtype())?;
1115 Ok(Field::new(l.name().clone(), merged))
1116 }).collect::<PolarsResult<Vec<_>>>()?;
1117 Struct(fields)
1118 },
1119 #[cfg(feature = "dtype-array")]
1120 (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1121 polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1122 let merged = merge_dtypes(inner_l, inner_r)?;
1123 Array(Box::new(merged), *width_l)
1124 },
1125 (left, right) if left == right => left.clone(),
1126 _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1127 })
1128}
1129
1130fn collect_nested_types(
1131 dtype: &DataType,
1132 result: &mut PlHashSet<DataType>,
1133 include_compound_types: bool,
1134) {
1135 match dtype {
1136 DataType::List(inner) => {
1137 if include_compound_types {
1138 result.insert(dtype.clone());
1139 }
1140 collect_nested_types(inner, result, include_compound_types);
1141 },
1142 #[cfg(feature = "dtype-array")]
1143 DataType::Array(inner, _) => {
1144 if include_compound_types {
1145 result.insert(dtype.clone());
1146 }
1147 collect_nested_types(inner, result, include_compound_types);
1148 },
1149 #[cfg(feature = "dtype-struct")]
1150 DataType::Struct(fields) => {
1151 if include_compound_types {
1152 result.insert(dtype.clone());
1153 }
1154 for field in fields {
1155 collect_nested_types(field.dtype(), result, include_compound_types);
1156 }
1157 },
1158 _ => {
1159 result.insert(dtype.clone());
1160 },
1161 }
1162}
1163
1164pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1165 let mut result = PlHashSet::new();
1166 collect_nested_types(dtype, &mut result, include_compound_types);
1167 result
1168}
1169
/// Compatibility level used when converting dtypes to Arrow.
///
/// In `DataType::try_to_arrow`, a level `>= 1` selects the view types for
/// string and binary data and level `0` selects the `Large*` types.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CompatLevel(pub(crate) u16);
1174
1175impl CompatLevel {
1176 pub const fn newest() -> CompatLevel {
1177 CompatLevel(1)
1178 }
1179
1180 pub const fn oldest() -> CompatLevel {
1181 CompatLevel(0)
1182 }
1183
1184 #[doc(hidden)]
1187 pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1188 if level > CompatLevel::newest().0 {
1189 polars_bail!(InvalidOperation: "invalid compat level");
1190 }
1191 Ok(CompatLevel(level))
1192 }
1193
1194 #[doc(hidden)]
1195 pub fn get_level(&self) -> u16 {
1196 self.0
1197 }
1198}
1199
#[cfg(test)]
mod tests {
    use super::*;

    /// With `include_compound_types = false` only the leaf dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let array = DataType::Array(Box::new(DataType::Float64), 10);
        let list = DataType::List(Box::new(array));

        let unpacked = unpack_dtypes(&list, false);

        let expected: PlHashSet<DataType> = [DataType::Float64].into_iter().collect();
        assert_eq!(unpacked, expected);
    }

    /// With `include_compound_types = true` every nesting level is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let array = DataType::Array(Box::new(DataType::Float64), 10);
        let list = DataType::List(Box::new(array.clone()));

        let unpacked = unpack_dtypes(&list, true);

        let expected: PlHashSet<DataType> =
            [list, array, DataType::Float64].into_iter().collect();
        assert_eq!(unpacked, expected);
    }
}