1use std::collections::BTreeMap;
2
3use arrow::datatypes::{
4 DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, MAINTAIN_PL_TYPE,
5 Metadata, PL_KEY,
6};
7#[cfg(feature = "dtype-array")]
8use polars_utils::format_tuple;
9use polars_utils::itertools::Itertools;
10#[cfg(any(feature = "serde-lazy", feature = "serde"))]
11use serde::{Deserialize, Serialize};
12pub use temporal::time_zone::TimeZone;
13
14use super::*;
15#[cfg(feature = "object")]
16use crate::chunked_array::object::registry::get_object_physical_type;
17#[cfg(feature = "dtype-extension")]
18pub use crate::datatypes::extension::ExtensionTypeInstance;
19use crate::utils::materialize_dyn_int;
20
21pub trait MetaDataExt: IntoMetadata {
22 fn pl_enum_metadata(&self) -> Option<&str> {
23 let md = self.into_metadata_ref();
24 let values = md
25 .get(DTYPE_ENUM_VALUES_NEW)
26 .or_else(|| md.get(DTYPE_ENUM_VALUES_LEGACY));
27 Some(values?.as_str())
28 }
29
30 fn pl_categorical_metadata(&self) -> Option<&str> {
31 Some(
36 self.into_metadata_ref()
37 .get(DTYPE_CATEGORICAL_NEW)?
38 .as_str(),
39 )
40 }
41
42 fn maintain_type(&self) -> bool {
43 let metadata = self.into_metadata_ref();
44 metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
45 }
46}
47
48impl MetaDataExt for Metadata {}
49pub trait IntoMetadata {
50 #[allow(clippy::wrong_self_convention)]
51 fn into_metadata_ref(&self) -> &Metadata;
52}
53
54impl IntoMetadata for Metadata {
55 fn into_metadata_ref(&self) -> &Metadata {
56 self
57 }
58}
59
// Kind of a not-yet-resolved ("dynamic") data type.
//
// Plain `//` comments are used on purpose: this type can derive a JSON schema,
// and doc comments could end up in the generated schema.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnknownKind {
    // Dynamic integer; carries the value that `materialize` uses to pick a concrete dtype.
    Int(i128),
    // Dynamic float; materializes to `Float64`.
    Float,
    // Dynamic string; materializes to `String`.
    Str,
    // Nothing is known about the type; cannot be materialized.
    #[default]
    Any,
}
75
76impl UnknownKind {
77 pub fn materialize(&self) -> Option<DataType> {
78 let dtype = match self {
79 UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
80 UnknownKind::Float => DataType::Float64,
81 UnknownKind::Str => DataType::String,
82 UnknownKind::Any => return None,
83 };
84 Some(dtype)
85 }
86}
87
88#[derive(Clone)]
89pub enum DataType {
90 Boolean,
91 UInt8,
92 UInt16,
93 UInt32,
94 UInt64,
95 UInt128,
96 Int8,
97 Int16,
98 Int32,
99 Int64,
100 Int128,
101 Float16,
102 Float32,
103 Float64,
104 #[cfg(feature = "dtype-decimal")]
108 Decimal(usize, usize), String,
111 Binary,
112 BinaryOffset,
113 Date,
116 Datetime(TimeUnit, Option<TimeZone>),
119 Duration(TimeUnit),
121 Time,
123 #[cfg(feature = "dtype-array")]
125 Array(Box<DataType>, usize),
126 List(Box<DataType>),
128 #[cfg(feature = "object")]
131 Object(&'static str),
132 Null,
133 #[cfg(feature = "dtype-categorical")]
134 Categorical(Arc<Categories>, Arc<CategoricalMapping>),
135 #[cfg(feature = "dtype-categorical")]
137 Enum(Arc<FrozenCategories>, Arc<CategoricalMapping>),
138 #[cfg(feature = "dtype-struct")]
139 Struct(Vec<Field>),
140 #[cfg(feature = "dtype-extension")]
141 Extension(ExtensionTypeInstance, Box<DataType>),
142 Unknown(UnknownKind),
144}
145
146impl Default for DataType {
147 fn default() -> Self {
148 DataType::Unknown(UnknownKind::Any)
149 }
150}
151
152pub trait AsRefDataType {
153 fn as_ref_dtype(&self) -> &DataType;
154}
155
156impl Hash for DataType {
157 fn hash<H: Hasher>(&self, state: &mut H) {
158 std::mem::discriminant(self).hash(state)
159 }
160}
161
162impl PartialEq for DataType {
163 fn eq(&self, other: &Self) -> bool {
164 use DataType::*;
165 {
166 match (self, other) {
167 #[cfg(feature = "dtype-categorical")]
168 (Categorical(cats_l, _), Categorical(cats_r, _)) => Arc::ptr_eq(cats_l, cats_r),
169 #[cfg(feature = "dtype-categorical")]
170 (Enum(fcats_l, _), Enum(fcats_r, _)) => Arc::ptr_eq(fcats_l, fcats_r),
171 (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
172 (List(left_inner), List(right_inner)) => left_inner == right_inner,
173 #[cfg(feature = "dtype-duration")]
174 (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
175 #[cfg(feature = "dtype-decimal")]
176 (Decimal(p1, s1), Decimal(p2, s2)) => (p1, s1) == (p2, s2),
177 #[cfg(feature = "object")]
178 (Object(lhs), Object(rhs)) => lhs == rhs,
179 #[cfg(feature = "dtype-struct")]
180 (Struct(lhs), Struct(rhs)) => {
181 std::ptr::eq(Vec::as_ptr(lhs), Vec::as_ptr(rhs)) || lhs == rhs
182 },
183 #[cfg(feature = "dtype-array")]
184 (Array(left_inner, left_width), Array(right_inner, right_width)) => {
185 left_width == right_width && left_inner == right_inner
186 },
187 (Unknown(l), Unknown(r)) => match (l, r) {
188 (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
189 _ => l == r,
190 },
191 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
192 }
193 }
194 }
195}
196
197impl Eq for DataType {}
198
199impl DataType {
200 pub const IDX_DTYPE: Self = {
201 #[cfg(not(feature = "bigidx"))]
202 {
203 DataType::UInt32
204 }
205 #[cfg(feature = "bigidx")]
206 {
207 DataType::UInt64
208 }
209 };
210
211 pub fn value_within_range(&self, other: AnyValue) -> bool {
212 use DataType::*;
213 match self {
214 UInt8 => other.extract::<u8>().is_some(),
215 #[cfg(feature = "dtype-u16")]
216 UInt16 => other.extract::<u16>().is_some(),
217 UInt32 => other.extract::<u32>().is_some(),
218 UInt64 => other.extract::<u64>().is_some(),
219 #[cfg(feature = "dtype-u128")]
220 UInt128 => other.extract::<u128>().is_some(),
221 #[cfg(feature = "dtype-i8")]
222 Int8 => other.extract::<i8>().is_some(),
223 #[cfg(feature = "dtype-i16")]
224 Int16 => other.extract::<i16>().is_some(),
225 Int32 => other.extract::<i32>().is_some(),
226 Int64 => other.extract::<i64>().is_some(),
227 #[cfg(feature = "dtype-i128")]
228 Int128 => other.extract::<i128>().is_some(),
229 _ => false,
230 }
231 }
232
233 #[cfg(feature = "dtype-struct")]
235 pub fn _month_days_ns_struct_type() -> Self {
236 DataType::Struct(vec![
237 Field::new(PlSmallStr::from_static("months"), DataType::Int32),
238 Field::new(PlSmallStr::from_static("days"), DataType::Int32),
239 Field::new(
240 PlSmallStr::from_static("nanoseconds"),
241 DataType::Duration(TimeUnit::Nanoseconds),
242 ),
243 ])
244 }
245
246 pub fn is_known(&self) -> bool {
248 match self {
249 DataType::List(inner) => inner.is_known(),
250 #[cfg(feature = "dtype-array")]
251 DataType::Array(inner, _) => inner.is_known(),
252 #[cfg(feature = "dtype-struct")]
253 DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
254 DataType::Unknown(_) => false,
255 _ => true,
256 }
257 }
258
259 pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
262 match self {
263 DataType::Unknown(u) => match u.materialize() {
264 Some(known) => Ok(known),
265 None => {
266 if allow_unknown {
267 Ok(DataType::Unknown(u))
268 } else {
269 polars_bail!(SchemaMismatch: "failed to materialize unknown type")
270 }
271 },
272 },
273 DataType::List(inner) => Ok(DataType::List(Box::new(
274 inner.materialize_unknown(allow_unknown)?,
275 ))),
276 #[cfg(feature = "dtype-array")]
277 DataType::Array(inner, size) => Ok(DataType::Array(
278 Box::new(inner.materialize_unknown(allow_unknown)?),
279 size,
280 )),
281 #[cfg(feature = "dtype-struct")]
282 DataType::Struct(fields) => Ok(DataType::Struct(
283 fields
284 .into_iter()
285 .map(|f| {
286 PolarsResult::Ok(Field::new(
287 f.name,
288 f.dtype.materialize_unknown(allow_unknown)?,
289 ))
290 })
291 .try_collect_vec()?,
292 )),
293 _ => Ok(self),
294 }
295 }
296
297 #[cfg(feature = "dtype-array")]
298 pub fn get_shape(&self) -> Option<Vec<usize>> {
300 fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
301 if let DataType::Array(inner, size) = dt {
302 shape.push(*size);
303 get_shape_impl(inner, shape);
304 }
305 }
306
307 if let DataType::Array(inner, size) = self {
308 let mut shape = vec![*size];
309 get_shape_impl(inner, &mut shape);
310 Some(shape)
311 } else {
312 None
313 }
314 }
315
316 pub fn inner_dtype(&self) -> Option<&DataType> {
318 match self {
319 DataType::List(inner) => Some(inner),
320 #[cfg(feature = "dtype-array")]
321 DataType::Array(inner, _) => Some(inner),
322 _ => None,
323 }
324 }
325
326 pub fn into_inner_dtype(self) -> Option<DataType> {
328 match self {
329 DataType::List(inner) => Some(*inner),
330 #[cfg(feature = "dtype-array")]
331 DataType::Array(inner, _) => Some(*inner),
332 _ => None,
333 }
334 }
335
336 pub fn try_into_inner_dtype(self) -> PolarsResult<DataType> {
338 match self {
339 DataType::List(inner) => Ok(*inner),
340 #[cfg(feature = "dtype-array")]
341 DataType::Array(inner, _) => Ok(*inner),
342 dt => polars_bail!(InvalidOperation: "cannot get inner datatype of `{dt}`"),
343 }
344 }
345
346 pub fn leaf_dtype(&self) -> &DataType {
348 let mut prev = self;
349 while let Some(dtype) = prev.inner_dtype() {
350 prev = dtype
351 }
352 prev
353 }
354
355 #[cfg(feature = "dtype-array")]
356 pub fn array_leaf_dtype(&self) -> Option<&DataType> {
358 let mut prev = self;
359 match prev {
360 DataType::Array(_, _) => {
361 while let DataType::Array(inner, _) = &prev {
362 prev = inner;
363 }
364 Some(prev)
365 },
366 _ => None,
367 }
368 }
369
370 pub fn cast_leaf(&self, to: DataType) -> DataType {
372 use DataType::*;
373 match self {
374 List(inner) => List(Box::new(inner.cast_leaf(to))),
375 #[cfg(feature = "dtype-array")]
376 Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
377 _ => to,
378 }
379 }
380
381 pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
385 if self == to {
386 return Some(true);
387 }
388 if self.is_primitive_numeric() && to.is_primitive_numeric() {
389 return Some(true);
390 }
391
392 if self.is_null() {
393 return Some(true);
394 }
395
396 use DataType as D;
397 Some(match (self, to) {
398 #[cfg(feature = "dtype-categorical")]
399 (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
400 | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false, #[cfg(feature = "object")]
403 (D::Object(_), D::Object(_)) => true,
404 #[cfg(feature = "object")]
405 (D::Object(_), _) | (_, D::Object(_)) => false,
406
407 (D::Boolean, dt) | (dt, D::Boolean) => match dt {
408 dt if dt.is_primitive_numeric() => true,
409 #[cfg(feature = "dtype-decimal")]
410 D::Decimal(_, _) => true,
411 D::String | D::Binary => true,
412 _ => false,
413 },
414
415 (D::List(from), D::List(to)) => from.can_cast_to(to)?,
416 #[cfg(feature = "dtype-array")]
417 (D::Array(from, l_width), D::Array(to, r_width)) => {
418 l_width == r_width && from.can_cast_to(to)?
419 },
420 #[cfg(feature = "dtype-struct")]
421 (D::Struct(l_fields), D::Struct(r_fields)) => {
422 if l_fields.is_empty() {
423 return Some(true);
424 }
425
426 if l_fields.len() != r_fields.len() {
427 return Some(false);
428 }
429
430 for (l, r) in l_fields.iter().zip(r_fields) {
431 if !l.dtype().can_cast_to(r.dtype())? {
432 return Some(false);
433 }
434 }
435
436 true
437 },
438
439 _ => return None,
441 })
442 }
443
444 pub fn implode(self) -> DataType {
445 DataType::List(Box::new(self))
446 }
447
448 #[must_use]
450 pub fn to_physical(&self) -> DataType {
451 use DataType::*;
452 match self {
453 Date => Int32,
454 Datetime(_, _) => Int64,
455 Duration(_) => Int64,
456 Time => Int64,
457 #[cfg(feature = "dtype-decimal")]
458 Decimal(_, _) => Int128,
459 #[cfg(feature = "dtype-categorical")]
460 Categorical(cats, _) => cats.physical().dtype(),
461 #[cfg(feature = "dtype-categorical")]
462 Enum(fcats, _) => fcats.physical().dtype(),
463 #[cfg(feature = "dtype-array")]
464 Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
465 List(dt) => List(Box::new(dt.to_physical())),
466 #[cfg(feature = "dtype-struct")]
467 Struct(fields) => {
468 let new_fields = fields
469 .iter()
470 .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
471 .collect();
472 Struct(new_fields)
473 },
474 #[cfg(feature = "dtype-extension")]
475 Extension(_, storage) => storage.to_physical(),
476 _ => self.clone(),
477 }
478 }
479
480 #[must_use]
481 pub fn to_storage(&self) -> DataType {
482 use DataType::*;
483 match self {
484 #[cfg(feature = "dtype-extension")]
485 Extension(_, storage) => storage.to_storage(),
486 _ => self.clone(),
487 }
488 }
489
490 pub fn is_supported_list_arithmetic_input(&self) -> bool {
491 self.is_primitive_numeric() || self.is_bool() || self.is_null()
492 }
493
494 pub fn is_logical(&self) -> bool {
496 self != &self.to_physical()
497 }
498
499 pub fn is_temporal(&self) -> bool {
501 use DataType::*;
502 matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
503 }
504
505 pub fn is_primitive(&self) -> bool {
508 self.is_primitive_numeric()
509 | matches!(
510 self,
511 DataType::Boolean | DataType::String | DataType::Binary
512 )
513 }
514
515 pub fn is_primitive_numeric(&self) -> bool {
517 self.is_float() || self.is_integer()
518 }
519
520 pub fn is_bool(&self) -> bool {
522 matches!(self, DataType::Boolean)
523 }
524
525 pub fn is_list(&self) -> bool {
527 matches!(self, DataType::List(_))
528 }
529
530 pub fn is_array(&self) -> bool {
532 #[cfg(feature = "dtype-array")]
533 {
534 matches!(self, DataType::Array(_, _))
535 }
536 #[cfg(not(feature = "dtype-array"))]
537 {
538 false
539 }
540 }
541
542 pub fn is_nested(&self) -> bool {
543 match self {
544 DataType::List(_) => true,
545 #[cfg(feature = "dtype-array")]
546 DataType::Array(_, _) => true,
547 #[cfg(feature = "dtype-struct")]
548 DataType::Struct(_) => true,
549 #[cfg(feature = "dtype-extension")]
550 DataType::Extension(_, storage) => storage.is_nested(),
551 _ => false,
552 }
553 }
554
555 pub fn is_struct(&self) -> bool {
557 #[cfg(feature = "dtype-struct")]
558 {
559 matches!(self, DataType::Struct(_))
560 }
561 #[cfg(not(feature = "dtype-struct"))]
562 {
563 false
564 }
565 }
566
567 pub fn is_binary(&self) -> bool {
568 matches!(self, DataType::Binary)
569 }
570
571 pub fn is_date(&self) -> bool {
572 matches!(self, DataType::Date)
573 }
574 pub fn is_datetime(&self) -> bool {
575 matches!(self, DataType::Datetime(..))
576 }
577
578 pub fn is_duration(&self) -> bool {
579 matches!(self, DataType::Duration(..))
580 }
581
582 pub fn is_object(&self) -> bool {
583 #[cfg(feature = "object")]
584 {
585 matches!(self, DataType::Object(_))
586 }
587 #[cfg(not(feature = "object"))]
588 {
589 false
590 }
591 }
592
593 pub fn is_null(&self) -> bool {
594 matches!(self, DataType::Null)
595 }
596
597 pub fn contains_views(&self) -> bool {
598 use DataType::*;
599 match self {
600 Binary | String => true,
601 List(inner) => inner.contains_views(),
602 #[cfg(feature = "dtype-array")]
603 Array(inner, _) => inner.contains_views(),
604 #[cfg(feature = "dtype-struct")]
605 Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
606 _ => false,
607 }
608 }
609
610 pub fn contains_categoricals(&self) -> bool {
611 use DataType::*;
612 match self {
613 #[cfg(feature = "dtype-categorical")]
614 Categorical(_, _) | Enum(_, _) => true,
615 List(inner) => inner.contains_categoricals(),
616 #[cfg(feature = "dtype-array")]
617 Array(inner, _) => inner.contains_categoricals(),
618 #[cfg(feature = "dtype-struct")]
619 Struct(fields) => fields
620 .iter()
621 .any(|field| field.dtype.contains_categoricals()),
622 _ => false,
623 }
624 }
625
626 pub fn contains_objects(&self) -> bool {
627 use DataType::*;
628 match self {
629 #[cfg(feature = "object")]
630 Object(_) => true,
631 List(inner) => inner.contains_objects(),
632 #[cfg(feature = "dtype-array")]
633 Array(inner, _) => inner.contains_objects(),
634 #[cfg(feature = "dtype-struct")]
635 Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
636 _ => false,
637 }
638 }
639
640 pub fn contains_list_recursive(&self) -> bool {
641 use DataType as D;
642 match self {
643 D::List(_) => true,
644 #[cfg(feature = "dtype-array")]
645 D::Array(inner, _) => inner.contains_list_recursive(),
646 #[cfg(feature = "dtype-struct")]
647 D::Struct(fields) => fields
648 .iter()
649 .any(|field| field.dtype.contains_list_recursive()),
650 _ => false,
651 }
652 }
653
654 pub fn contains_unknown(&self) -> bool {
655 use DataType as D;
656 match self {
657 D::Unknown(_) => true,
658 D::List(inner) => inner.contains_unknown(),
659 #[cfg(feature = "dtype-array")]
660 D::Array(inner, _) => inner.contains_unknown(),
661 #[cfg(feature = "dtype-struct")]
662 D::Struct(fields) => fields.iter().any(|field| field.dtype.contains_unknown()),
663 _ => false,
664 }
665 }
666
667 pub fn is_ord(&self) -> bool {
669 let phys = self.to_physical();
670 phys.is_primitive_numeric()
671 || self.is_decimal()
672 || matches!(
673 phys,
674 DataType::Binary | DataType::String | DataType::Boolean
675 )
676 }
677
678 pub fn is_decimal(&self) -> bool {
680 match self {
681 #[cfg(feature = "dtype-decimal")]
682 DataType::Decimal(_, _) => true,
683 _ => false,
684 }
685 }
686
687 pub fn is_float(&self) -> bool {
690 matches!(
691 self,
692 DataType::Float16
693 | DataType::Float32
694 | DataType::Float64
695 | DataType::Unknown(UnknownKind::Float)
696 )
697 }
698
699 pub fn is_integer(&self) -> bool {
701 matches!(
702 self,
703 DataType::Int8
704 | DataType::Int16
705 | DataType::Int32
706 | DataType::Int64
707 | DataType::Int128
708 | DataType::UInt8
709 | DataType::UInt16
710 | DataType::UInt32
711 | DataType::UInt64
712 | DataType::UInt128
713 | DataType::Unknown(UnknownKind::Int(_))
714 )
715 }
716
717 pub fn is_signed_integer(&self) -> bool {
718 matches!(
720 self,
721 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
722 )
723 }
724
725 pub fn is_unsigned_integer(&self) -> bool {
726 matches!(
727 self,
728 DataType::UInt8
729 | DataType::UInt16
730 | DataType::UInt32
731 | DataType::UInt64
732 | DataType::UInt128,
733 )
734 }
735
736 pub fn is_string(&self) -> bool {
737 matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
738 }
739
740 pub fn is_categorical(&self) -> bool {
741 #[cfg(feature = "dtype-categorical")]
742 {
743 matches!(self, DataType::Categorical(_, _))
744 }
745 #[cfg(not(feature = "dtype-categorical"))]
746 {
747 false
748 }
749 }
750
751 pub fn is_enum(&self) -> bool {
752 #[cfg(feature = "dtype-categorical")]
753 {
754 matches!(self, DataType::Enum(_, _))
755 }
756 #[cfg(not(feature = "dtype-categorical"))]
757 {
758 false
759 }
760 }
761
762 pub fn is_extension(&self) -> bool {
763 #[cfg(feature = "dtype-extension")]
764 {
765 matches!(self, DataType::Extension(_, _))
766 }
767 #[cfg(not(feature = "dtype-extension"))]
768 {
769 false
770 }
771 }
772
773 pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
775 let metadata = match self {
776 #[cfg(feature = "dtype-categorical")]
777 DataType::Enum(fcats, _map) => {
778 let cats = fcats.categories();
779 let strings_size: usize = cats
780 .values_iter()
781 .map(|s| (s.len() + 1).ilog10() as usize + 1 + s.len())
782 .sum();
783 let mut encoded = String::with_capacity(strings_size);
784 for cat in cats.values_iter() {
785 encoded.push_str(itoa::Buffer::new().format(cat.len()));
786 encoded.push(';');
787 encoded.push_str(cat);
788 }
789 Some(BTreeMap::from([(
790 PlSmallStr::from_static(DTYPE_ENUM_VALUES_NEW),
791 PlSmallStr::from_string(encoded),
792 )]))
793 },
794 #[cfg(feature = "dtype-categorical")]
795 DataType::Categorical(cats, _) => {
796 let mut encoded = String::new();
797 encoded.push_str(itoa::Buffer::new().format(cats.name().len()));
798 encoded.push(';');
799 encoded.push_str(cats.name());
800 encoded.push_str(itoa::Buffer::new().format(cats.namespace().len()));
801 encoded.push(';');
802 encoded.push_str(cats.namespace());
803 encoded.push_str(cats.physical().as_str());
804 encoded.push(';');
805
806 Some(BTreeMap::from([(
807 PlSmallStr::from_static(DTYPE_CATEGORICAL_NEW),
808 PlSmallStr::from_string(encoded),
809 )]))
810 },
811 DataType::BinaryOffset => Some(BTreeMap::from([(
812 PlSmallStr::from_static(PL_KEY),
813 PlSmallStr::from_static(MAINTAIN_PL_TYPE),
814 )])),
815 _ => None,
816 };
817
818 let field = ArrowField::new(name, self.to_arrow(compat_level), true);
819
820 if let Some(metadata) = metadata {
821 field.with_metadata(metadata)
822 } else {
823 field
824 }
825 }
826
827 pub fn max(&self) -> PolarsResult<Scalar> {
829 use DataType::*;
830 let v = match self {
831 Int8 => Scalar::from(i8::MAX),
832 Int16 => Scalar::from(i16::MAX),
833 Int32 => Scalar::from(i32::MAX),
834 Int64 => Scalar::from(i64::MAX),
835 Int128 => Scalar::from(i128::MAX),
836 UInt8 => Scalar::from(u8::MAX),
837 UInt16 => Scalar::from(u16::MAX),
838 UInt32 => Scalar::from(u32::MAX),
839 UInt64 => Scalar::from(u64::MAX),
840 UInt128 => Scalar::from(u128::MAX),
841 Float16 => Scalar::from(pf16::INFINITY),
842 Float32 => Scalar::from(f32::INFINITY),
843 Float64 => Scalar::from(f64::INFINITY),
844 #[cfg(feature = "dtype-time")]
845 Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
846 dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
847 };
848 Ok(v)
849 }
850
851 pub fn min(&self) -> PolarsResult<Scalar> {
853 use DataType::*;
854 let v = match self {
855 Int8 => Scalar::from(i8::MIN),
856 Int16 => Scalar::from(i16::MIN),
857 Int32 => Scalar::from(i32::MIN),
858 Int64 => Scalar::from(i64::MIN),
859 Int128 => Scalar::from(i128::MIN),
860 UInt8 => Scalar::from(u8::MIN),
861 UInt16 => Scalar::from(u16::MIN),
862 UInt32 => Scalar::from(u32::MIN),
863 UInt64 => Scalar::from(u64::MIN),
864 UInt128 => Scalar::from(u128::MIN),
865 Float16 => Scalar::from(pf16::NEG_INFINITY),
866 Float32 => Scalar::from(f32::NEG_INFINITY),
867 Float64 => Scalar::from(f64::NEG_INFINITY),
868 #[cfg(feature = "dtype-time")]
869 Time => Scalar::new(Time, AnyValue::Time(0)),
870 dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
871 };
872 Ok(v)
873 }
874
875 #[inline]
877 pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
878 self.try_to_arrow(compat_level).unwrap()
879 }
880
881 #[inline]
882 pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
883 use DataType::*;
884 match self {
885 Boolean => Ok(ArrowDataType::Boolean),
886 UInt8 => Ok(ArrowDataType::UInt8),
887 UInt16 => Ok(ArrowDataType::UInt16),
888 UInt32 => Ok(ArrowDataType::UInt32),
889 UInt64 => Ok(ArrowDataType::UInt64),
890 UInt128 => Ok(ArrowDataType::UInt128),
891 Int8 => Ok(ArrowDataType::Int8),
892 Int16 => Ok(ArrowDataType::Int16),
893 Int32 => Ok(ArrowDataType::Int32),
894 Int64 => Ok(ArrowDataType::Int64),
895 Int128 => Ok(ArrowDataType::Int128),
896 Float16 => Ok(ArrowDataType::Float16),
897 Float32 => Ok(ArrowDataType::Float32),
898 Float64 => Ok(ArrowDataType::Float64),
899 #[cfg(feature = "dtype-decimal")]
900 Decimal(precision, scale) => {
901 assert!(*precision >= 1 && *precision <= 38);
902 Ok(ArrowDataType::Decimal(*precision, *scale))
903 },
904 String => {
905 let dt = if compat_level.0 >= 1 {
906 ArrowDataType::Utf8View
907 } else {
908 ArrowDataType::LargeUtf8
909 };
910 Ok(dt)
911 },
912 Binary => {
913 let dt = if compat_level.0 >= 1 {
914 ArrowDataType::BinaryView
915 } else {
916 ArrowDataType::LargeBinary
917 };
918 Ok(dt)
919 },
920 Date => Ok(ArrowDataType::Date32),
921 Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(
922 unit.to_arrow(),
923 tz.as_deref().cloned(),
924 )),
925 Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
926 Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
927 #[cfg(feature = "dtype-array")]
928 Array(dt, width) => Ok(ArrowDataType::FixedSizeList(
929 Box::new(dt.to_arrow_field(LIST_VALUES_NAME, compat_level)),
930 *width,
931 )),
932 List(dt) => Ok(ArrowDataType::LargeList(Box::new(
933 dt.to_arrow_field(LIST_VALUES_NAME, compat_level),
934 ))),
935 Null => Ok(ArrowDataType::Null),
936 #[cfg(feature = "object")]
937 Object(_) => Ok(get_object_physical_type()),
938 #[cfg(feature = "dtype-categorical")]
939 Categorical(_, _) | Enum(_, _) => {
940 let arrow_phys = match self.cat_physical().unwrap() {
941 CategoricalPhysical::U8 => IntegerType::UInt8,
942 CategoricalPhysical::U16 => IntegerType::UInt16,
943 CategoricalPhysical::U32 => IntegerType::UInt32,
944 };
945
946 let values = if compat_level.0 >= 1 {
947 ArrowDataType::Utf8View
948 } else {
949 ArrowDataType::LargeUtf8
950 };
951
952 Ok(ArrowDataType::Dictionary(
953 arrow_phys,
954 Box::new(values),
955 false,
956 ))
957 },
958 #[cfg(feature = "dtype-struct")]
959 Struct(fields) => {
960 let fields = fields
961 .iter()
962 .map(|fld| fld.to_arrow(compat_level))
963 .collect();
964 Ok(ArrowDataType::Struct(fields))
965 },
966 BinaryOffset => Ok(ArrowDataType::LargeBinary),
967 #[cfg(feature = "dtype-extension")]
968 Extension(typ, inner) => Ok(ArrowDataType::Extension(Box::new(
969 arrow::datatypes::ExtensionType {
970 name: typ.name().into(),
971 inner: inner.try_to_arrow(compat_level)?,
972 metadata: typ.serialize_metadata().map(|m| m.into()),
973 },
974 ))),
975 Unknown(kind) => {
976 let dt = match kind {
977 UnknownKind::Any => ArrowDataType::Unknown,
978 UnknownKind::Float => ArrowDataType::Float64,
979 UnknownKind::Str => ArrowDataType::Utf8View,
980 UnknownKind::Int(v) => {
981 return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
982 },
983 };
984 Ok(dt)
985 },
986 }
987 }
988
989 pub fn is_nested_null(&self) -> bool {
990 use DataType::*;
991 match self {
992 Null => true,
993 List(field) => field.is_nested_null(),
994 #[cfg(feature = "dtype-array")]
995 Array(field, _) => field.is_nested_null(),
996 #[cfg(feature = "dtype-struct")]
997 Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
998 _ => false,
999 }
1000 }
1001
1002 pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
1009 match (self, schema_type) {
1010 (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
1011 #[cfg(feature = "dtype-array")]
1012 (DataType::Array(l, sl), DataType::Array(r, sr)) => {
1013 Ok(l.matches_schema_type(r)? && sl == sr)
1014 },
1015 #[cfg(feature = "dtype-struct")]
1016 (DataType::Struct(l), DataType::Struct(r)) => {
1017 if l.len() != r.len() {
1018 polars_bail!(SchemaMismatch: "structs have different number of fields: {} vs {}", l.len(), r.len());
1019 }
1020 let mut must_cast = false;
1021 for (l, r) in l.iter().zip(r.iter()) {
1022 must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
1023 }
1024 Ok(must_cast)
1025 },
1026 (DataType::Null, DataType::Null) => Ok(false),
1027 #[cfg(feature = "dtype-decimal")]
1028 (DataType::Decimal(p1, s1), DataType::Decimal(p2, s2)) => Ok((p1, s1) != (p2, s2)),
1029 (DataType::Null, _) => Ok(true),
1032 #[cfg(feature = "dtype-categorical")]
1033 (DataType::Categorical(l, _), DataType::Categorical(r, _)) => {
1034 ensure_same_categories(l, r)?;
1035 Ok(false)
1036 },
1037 #[cfg(feature = "dtype-categorical")]
1038 (DataType::Enum(l, _), DataType::Enum(r, _)) => {
1039 ensure_same_frozen_categories(l, r)?;
1040 Ok(false)
1041 },
1042
1043 (l, r) if l == r => Ok(false),
1044 (l, r) => {
1045 polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
1046 },
1047 }
1048 }
1049
1050 #[inline]
1051 pub fn is_unknown(&self) -> bool {
1052 matches!(self, DataType::Unknown(_))
1053 }
1054
1055 pub fn nesting_level(&self) -> usize {
1056 let mut level = 0;
1057 let mut slf = self;
1058 while let Some(inner_dtype) = slf.inner_dtype() {
1059 level += 1;
1060 slf = inner_dtype;
1061 }
1062 level
1063 }
1064
1065 #[cfg(feature = "dtype-categorical")]
1067 pub fn cat_physical(&self) -> PolarsResult<CategoricalPhysical> {
1068 match self {
1069 DataType::Categorical(cats, _) => Ok(cats.physical()),
1070 DataType::Enum(fcats, _) => Ok(fcats.physical()),
1071 _ => {
1072 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1073 },
1074 }
1075 }
1076
1077 #[cfg(feature = "dtype-categorical")]
1079 pub fn cat_mapping(&self) -> PolarsResult<&Arc<CategoricalMapping>> {
1080 match self {
1081 DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => Ok(mapping),
1082 _ => {
1083 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1084 },
1085 }
1086 }
1087
1088 #[cfg(feature = "dtype-categorical")]
1089 pub fn from_categories(cats: Arc<Categories>) -> Self {
1090 let mapping = cats.mapping();
1091 Self::Categorical(cats, mapping)
1092 }
1093
1094 #[cfg(feature = "dtype-categorical")]
1095 pub fn from_frozen_categories(fcats: Arc<FrozenCategories>) -> Self {
1096 let mapping = fcats.mapping().clone();
1097 Self::Enum(fcats, mapping)
1098 }
1099
1100 pub fn is_numeric(&self) -> bool {
1101 self.is_integer() || self.is_float() || self.is_decimal()
1102 }
1103}
1104
1105impl Display for DataType {
1106 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1107 let s = match self {
1108 DataType::Null => "null",
1109 DataType::Boolean => "bool",
1110 DataType::UInt8 => "u8",
1111 DataType::UInt16 => "u16",
1112 DataType::UInt32 => "u32",
1113 DataType::UInt64 => "u64",
1114 DataType::UInt128 => "u128",
1115 DataType::Int8 => "i8",
1116 DataType::Int16 => "i16",
1117 DataType::Int32 => "i32",
1118 DataType::Int64 => "i64",
1119 DataType::Int128 => "i128",
1120 DataType::Float16 => "f16",
1121 DataType::Float32 => "f32",
1122 DataType::Float64 => "f64",
1123 #[cfg(feature = "dtype-decimal")]
1124 DataType::Decimal(p, s) => return write!(f, "decimal[{p},{s}]"),
1125 DataType::String => "str",
1126 DataType::Binary => "binary",
1127 DataType::BinaryOffset => "binary[offset]",
1128 DataType::Date => "date",
1129 DataType::Datetime(tu, None) => return write!(f, "datetime[{tu}]"),
1130 DataType::Datetime(tu, Some(tz)) => return write!(f, "datetime[{tu}, {tz}]"),
1131 DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
1132 DataType::Time => "time",
1133 #[cfg(feature = "dtype-array")]
1134 DataType::Array(_, _) => {
1135 let tp = self.array_leaf_dtype().unwrap();
1136
1137 let dims = self.get_shape().unwrap();
1138 let shape = if dims.len() == 1 {
1139 format!("{}", dims[0])
1140 } else {
1141 format_tuple!(dims)
1142 };
1143 return write!(f, "array[{tp}, {shape}]");
1144 },
1145 DataType::List(tp) => return write!(f, "list[{tp}]"),
1146 #[cfg(feature = "object")]
1147 DataType::Object(s) => s,
1148 #[cfg(feature = "dtype-categorical")]
1149 DataType::Categorical(_, _) => "cat",
1150 #[cfg(feature = "dtype-categorical")]
1151 DataType::Enum(_, _) => "enum",
1152 #[cfg(feature = "dtype-struct")]
1153 DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
1154 #[cfg(feature = "dtype-extension")]
1155 DataType::Extension(typ, _) => return write!(f, "ext[{}]", typ.0.dyn_display()),
1156 DataType::Unknown(kind) => match kind {
1157 UnknownKind::Any => "unknown",
1158 UnknownKind::Int(_) => "dyn int",
1159 UnknownKind::Float => "dyn float",
1160 UnknownKind::Str => "dyn str",
1161 },
1162 };
1163 f.write_str(s)
1164 }
1165}
1166
1167impl std::fmt::Debug for DataType {
1168 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1169 use DataType::*;
1170 match self {
1171 Boolean => write!(f, "Boolean"),
1172 UInt8 => write!(f, "UInt8"),
1173 UInt16 => write!(f, "UInt16"),
1174 UInt32 => write!(f, "UInt32"),
1175 UInt64 => write!(f, "UInt64"),
1176 UInt128 => write!(f, "UInt128"),
1177 Int8 => write!(f, "Int8"),
1178 Int16 => write!(f, "Int16"),
1179 Int32 => write!(f, "Int32"),
1180 Int64 => write!(f, "Int64"),
1181 Int128 => write!(f, "Int128"),
1182 Float16 => write!(f, "Float16"),
1183 Float32 => write!(f, "Float32"),
1184 Float64 => write!(f, "Float64"),
1185 String => write!(f, "String"),
1186 Binary => write!(f, "Binary"),
1187 BinaryOffset => write!(f, "BinaryOffset"),
1188 Date => write!(f, "Date"),
1189 Time => write!(f, "Time"),
1190 Duration(unit) => write!(f, "Duration('{unit}')"),
1191 Datetime(unit, opt_tz) => {
1192 if let Some(tz) = opt_tz {
1193 write!(f, "Datetime('{unit}', '{tz}')")
1194 } else {
1195 write!(f, "Datetime('{unit}')")
1196 }
1197 },
1198 #[cfg(feature = "dtype-decimal")]
1199 Decimal(p, s) => write!(f, "Decimal({p}, {s})"),
1200 #[cfg(feature = "dtype-array")]
1201 Array(inner, size) => write!(f, "Array({inner:?}, {size})"),
1202 List(inner) => write!(f, "List({inner:?})"),
1203 #[cfg(feature = "dtype-struct")]
1204 Struct(fields) => {
1205 let mut first = true;
1206 write!(f, "Struct({{")?;
1207 for field in fields {
1208 if !first {
1209 write!(f, ", ")?;
1210 }
1211 write!(f, "'{}': {:?}", field.name(), field.dtype())?;
1212 first = false;
1213 }
1214 write!(f, "}})")
1215 },
1216 #[cfg(feature = "dtype-categorical")]
1217 Categorical(cats, _) => {
1218 if cats.is_global() {
1219 write!(f, "Categorical")
1220 } else if cats.namespace().is_empty() && cats.physical() == CategoricalPhysical::U32
1221 {
1222 write!(f, "Categorical('{}')", cats.name())
1223 } else {
1224 write!(
1225 f,
1226 "Categorical('{}', '{}', {:?})",
1227 cats.name(),
1228 cats.namespace(),
1229 cats.physical()
1230 )
1231 }
1232 },
1233 #[cfg(feature = "dtype-categorical")]
1234 Enum(_, _) => write!(f, "Enum([...])"),
1235 #[cfg(feature = "object")]
1236 Object(_) => write!(f, "Object"),
1237 Null => write!(f, "Null"),
1238 #[cfg(feature = "dtype-extension")]
1239 Extension(typ, inner) => write!(f, "Extension({}, {inner:?})", typ.0.dyn_debug()),
1240 Unknown(kind) => write!(f, "Unknown({kind:?})"),
1241 }
1242 }
1243}
1244
1245pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
1246 use DataType::*;
1247 Ok(match (left, right) {
1248 #[cfg(feature = "dtype-categorical")]
1249 (Categorical(cats_l, map), Categorical(cats_r, _)) => {
1250 ensure_same_categories(cats_l, cats_r)?;
1251 Categorical(cats_l.clone(), map.clone())
1252 },
1253 #[cfg(feature = "dtype-categorical")]
1254 (Enum(fcats_l, map), Enum(fcats_r, _)) => {
1255 ensure_same_frozen_categories(fcats_l, fcats_r)?;
1256 Enum(fcats_l.clone(), map.clone())
1257 },
1258 (List(inner_l), List(inner_r)) => {
1259 let merged = merge_dtypes(inner_l, inner_r)?;
1260 List(Box::new(merged))
1261 },
1262 #[cfg(feature = "dtype-struct")]
1263 (Struct(inner_l), Struct(inner_r)) => {
1264 polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1265 let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1266 polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1267 let merged = merge_dtypes(l.dtype(), r.dtype())?;
1268 Ok(Field::new(l.name().clone(), merged))
1269 }).collect::<PolarsResult<Vec<_>>>()?;
1270 Struct(fields)
1271 },
1272 #[cfg(feature = "dtype-array")]
1273 (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1274 polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1275 let merged = merge_dtypes(inner_l, inner_r)?;
1276 Array(Box::new(merged), *width_l)
1277 },
1278 (left, right) if left == right => left.clone(),
1279 _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1280 })
1281}
1282
1283fn collect_nested_types(
1284 dtype: &DataType,
1285 result: &mut PlHashSet<DataType>,
1286 include_compound_types: bool,
1287) {
1288 match dtype {
1289 DataType::List(inner) => {
1290 if include_compound_types {
1291 result.insert(dtype.clone());
1292 }
1293 collect_nested_types(inner, result, include_compound_types);
1294 },
1295 #[cfg(feature = "dtype-array")]
1296 DataType::Array(inner, _) => {
1297 if include_compound_types {
1298 result.insert(dtype.clone());
1299 }
1300 collect_nested_types(inner, result, include_compound_types);
1301 },
1302 #[cfg(feature = "dtype-struct")]
1303 DataType::Struct(fields) => {
1304 if include_compound_types {
1305 result.insert(dtype.clone());
1306 }
1307 for field in fields {
1308 collect_nested_types(field.dtype(), result, include_compound_types);
1309 }
1310 },
1311 _ => {
1312 result.insert(dtype.clone());
1313 },
1314 }
1315}
1316
1317pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1318 let mut result = PlHashSet::new();
1319 collect_nested_types(dtype, &mut result, include_compound_types);
1320 result
1321}
1322
1323#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
1324#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
1325#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
1326pub struct CompatLevel(pub(crate) u16);
1327
1328impl CompatLevel {
1329 pub const fn newest() -> CompatLevel {
1330 CompatLevel(1)
1331 }
1332
1333 pub const fn oldest() -> CompatLevel {
1334 CompatLevel(0)
1335 }
1336
1337 #[doc(hidden)]
1340 pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1341 if level > CompatLevel::newest().0 {
1342 polars_bail!(InvalidOperation: "invalid compat level");
1343 }
1344 Ok(CompatLevel(level))
1345 }
1346
1347 #[doc(hidden)]
1348 pub fn get_level(&self) -> u16 {
1349 self.0
1350 }
1351}
1352
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `List(Array(Float64, 10))` and returns it together with its inner array dtype.
    #[cfg(feature = "dtype-array")]
    fn list_of_float_arrays() -> (DataType, DataType) {
        let array = DataType::Array(Box::new(DataType::Float64), 10);
        let list = DataType::List(Box::new(array.clone()));
        (list, array)
    }

    /// Without compound types only the innermost dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (list, _array) = list_of_float_arrays();

        let mut expected = PlHashSet::new();
        expected.insert(DataType::Float64);

        assert_eq!(unpack_dtypes(&list, false), expected);
    }

    /// With compound types every nesting level is reported alongside the innermost dtype.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (list, array) = list_of_float_arrays();

        let unpacked = unpack_dtypes(&list, true);

        let mut expected = PlHashSet::new();
        expected.insert(DataType::Float64);
        expected.insert(array);
        expected.insert(list);

        assert_eq!(unpacked, expected);
    }
}