1use std::collections::BTreeMap;
2
3use arrow::datatypes::{
4 DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY, DTYPE_ENUM_VALUES_NEW, MAINTAIN_PL_TYPE,
5 Metadata, PL_KEY,
6};
7#[cfg(feature = "dtype-array")]
8use polars_utils::format_tuple;
9use polars_utils::itertools::Itertools;
10#[cfg(any(feature = "serde-lazy", feature = "serde"))]
11use serde::{Deserialize, Serialize};
12pub use temporal::time_zone::TimeZone;
13
14use super::*;
15#[cfg(feature = "object")]
16use crate::chunked_array::object::registry::get_object_physical_type;
17#[cfg(feature = "dtype-extension")]
18pub use crate::datatypes::extension::ExtensionTypeInstance;
19use crate::utils::materialize_dyn_int;
20
21pub trait MetaDataExt: IntoMetadata {
22 fn pl_enum_metadata(&self) -> Option<&str> {
23 let md = self.into_metadata_ref();
24 let values = md
25 .get(DTYPE_ENUM_VALUES_NEW)
26 .or_else(|| md.get(DTYPE_ENUM_VALUES_LEGACY));
27 Some(values?.as_str())
28 }
29
30 fn pl_categorical_metadata(&self) -> Option<&str> {
31 Some(
36 self.into_metadata_ref()
37 .get(DTYPE_CATEGORICAL_NEW)?
38 .as_str(),
39 )
40 }
41
42 fn maintain_type(&self) -> bool {
43 let metadata = self.into_metadata_ref();
44 metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
45 }
46}
47
// `Metadata` relies entirely on the default method bodies of `MetaDataExt`.
impl MetaDataExt for Metadata {}
/// Gives borrowed access to an underlying [`Metadata`] value.
pub trait IntoMetadata {
    /// Returns a reference to the metadata.
    // The `into_` prefix on a `&self` method trips clippy's naming lint, hence the allow.
    #[allow(clippy::wrong_self_convention)]
    fn into_metadata_ref(&self) -> &Metadata;
}
53
impl IntoMetadata for Metadata {
    /// `Metadata` is its own metadata: returns `self` unchanged.
    fn into_metadata_ref(&self) -> &Metadata {
        self
    }
}
59
/// Refinement of [`DataType::Unknown`]: what is known about a not-yet-materialized dtype.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum UnknownKind {
    /// Some integer type; the carried value is handed to `materialize_dyn_int` when the
    /// kind is materialized.
    Int(i128),
    /// Some float type; materializes to `Float64`.
    Float,
    /// Some string-like type; materializes to `String`.
    Str,
    /// Nothing is known; this kind cannot be materialized.
    #[default]
    Any,
}
75
76impl UnknownKind {
77 pub fn materialize(&self) -> Option<DataType> {
78 let dtype = match self {
79 UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
80 UnknownKind::Float => DataType::Float64,
81 UnknownKind::Str => DataType::String,
82 UnknownKind::Any => return None,
83 };
84 Some(dtype)
85 }
86}
87
/// The logical data type of a column.
///
/// `PartialEq`, `Hash`, `Debug` and `Display` are implemented by hand rather than derived.
#[derive(Clone)]
pub enum DataType {
    Boolean,
    UInt8,
    UInt16,
    UInt32,
    UInt64,
    UInt128,
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Float16,
    Float32,
    Float64,
    /// Fixed-point decimal as `(precision, scale)`; physically stored as `Int128`.
    #[cfg(feature = "dtype-decimal")]
    Decimal(usize, usize), String,
    Binary,
    BinaryOffset,
    /// Calendar date; physically stored as `Int32`.
    Date,
    /// Timestamp with a time unit and an optional time zone; physically stored as `Int64`.
    Datetime(TimeUnit, Option<TimeZone>),
    /// Elapsed time in the given time unit; physically stored as `Int64`.
    Duration(TimeUnit),
    /// Time of day; physically stored as `Int64`.
    Time,
    /// Fixed-width nested type: element dtype and width.
    #[cfg(feature = "dtype-array")]
    Array(Box<DataType>, usize),
    /// Variable-length nested type with the given element dtype.
    List(Box<DataType>),
    /// Opaque object type; the string is what `Display` prints for it.
    #[cfg(feature = "object")]
    Object(&'static str),
    Null,
    /// Categorical backed by shared `Categories` plus their mapping.
    #[cfg(feature = "dtype-categorical")]
    Categorical(Arc<Categories>, Arc<CategoricalMapping>),
    /// Enum backed by `FrozenCategories` plus their mapping.
    #[cfg(feature = "dtype-categorical")]
    Enum(Arc<FrozenCategories>, Arc<CategoricalMapping>),
    /// Struct made of named, typed fields.
    #[cfg(feature = "dtype-struct")]
    Struct(Vec<Field>),
    /// Extension type instance together with its storage dtype.
    #[cfg(feature = "dtype-extension")]
    Extension(ExtensionTypeInstance, Box<DataType>),
    /// A dtype that is not (fully) determined yet; see [`UnknownKind`].
    Unknown(UnknownKind),
}
145
146impl Default for DataType {
147 fn default() -> Self {
148 DataType::Unknown(UnknownKind::Any)
149 }
150}
151
/// Implemented by types that can hand out a reference to a [`DataType`].
pub trait AsRefDataType {
    /// Returns the borrowed dtype.
    fn as_ref_dtype(&self) -> &DataType;
}
155
156impl Hash for DataType {
157 fn hash<H: Hasher>(&self, state: &mut H) {
158 std::mem::discriminant(self).hash(state)
159 }
160}
161
162impl PartialEq for DataType {
163 fn eq(&self, other: &Self) -> bool {
164 use DataType::*;
165 {
166 match (self, other) {
167 #[cfg(feature = "dtype-categorical")]
168 (Categorical(cats_l, _), Categorical(cats_r, _)) => Arc::ptr_eq(cats_l, cats_r),
169 #[cfg(feature = "dtype-categorical")]
170 (Enum(fcats_l, _), Enum(fcats_r, _)) => Arc::ptr_eq(fcats_l, fcats_r),
171 (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
172 (List(left_inner), List(right_inner)) => left_inner == right_inner,
173 #[cfg(feature = "dtype-duration")]
174 (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
175 #[cfg(feature = "dtype-decimal")]
176 (Decimal(p1, s1), Decimal(p2, s2)) => (p1, s1) == (p2, s2),
177 #[cfg(feature = "object")]
178 (Object(lhs), Object(rhs)) => lhs == rhs,
179 #[cfg(feature = "dtype-struct")]
180 (Struct(lhs), Struct(rhs)) => {
181 std::ptr::eq(Vec::as_ptr(lhs), Vec::as_ptr(rhs)) || lhs == rhs
182 },
183 #[cfg(feature = "dtype-array")]
184 (Array(left_inner, left_width), Array(right_inner, right_width)) => {
185 left_width == right_width && left_inner == right_inner
186 },
187 (Unknown(l), Unknown(r)) => match (l, r) {
188 (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
189 _ => l == r,
190 },
191 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
192 }
193 }
194 }
195}
196
// Marker impl: the hand-written `PartialEq` above is treated as a full equivalence relation.
impl Eq for DataType {}
198
199impl DataType {
    /// Dtype used for row indices: `UInt32` by default, `UInt64` when the `bigidx` feature
    /// is enabled.
    pub const IDX_DTYPE: Self = {
        #[cfg(not(feature = "bigidx"))]
        {
            DataType::UInt32
        }
        #[cfg(feature = "bigidx")]
        {
            DataType::UInt64
        }
    };
210
211 pub fn value_within_range(&self, other: AnyValue) -> bool {
212 use DataType::*;
213 match self {
214 UInt8 => other.extract::<u8>().is_some(),
215 #[cfg(feature = "dtype-u16")]
216 UInt16 => other.extract::<u16>().is_some(),
217 UInt32 => other.extract::<u32>().is_some(),
218 UInt64 => other.extract::<u64>().is_some(),
219 #[cfg(feature = "dtype-u128")]
220 UInt128 => other.extract::<u128>().is_some(),
221 #[cfg(feature = "dtype-i8")]
222 Int8 => other.extract::<i8>().is_some(),
223 #[cfg(feature = "dtype-i16")]
224 Int16 => other.extract::<i16>().is_some(),
225 Int32 => other.extract::<i32>().is_some(),
226 Int64 => other.extract::<i64>().is_some(),
227 #[cfg(feature = "dtype-i128")]
228 Int128 => other.extract::<i128>().is_some(),
229 _ => false,
230 }
231 }
232
233 #[cfg(feature = "dtype-struct")]
235 pub fn _month_days_ns_struct_type() -> Self {
236 DataType::Struct(vec![
237 Field::new(PlSmallStr::from_static("months"), DataType::Int32),
238 Field::new(PlSmallStr::from_static("days"), DataType::Int32),
239 Field::new(
240 PlSmallStr::from_static("nanoseconds"),
241 DataType::Duration(TimeUnit::Nanoseconds),
242 ),
243 ])
244 }
245
246 pub fn is_known(&self) -> bool {
248 match self {
249 DataType::List(inner) => inner.is_known(),
250 #[cfg(feature = "dtype-array")]
251 DataType::Array(inner, _) => inner.is_known(),
252 #[cfg(feature = "dtype-struct")]
253 DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
254 DataType::Unknown(_) => false,
255 _ => true,
256 }
257 }
258
259 pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
262 match self {
263 DataType::Unknown(u) => match u.materialize() {
264 Some(known) => Ok(known),
265 None => {
266 if allow_unknown {
267 Ok(DataType::Unknown(u))
268 } else {
269 polars_bail!(SchemaMismatch: "failed to materialize unknown type")
270 }
271 },
272 },
273 DataType::List(inner) => Ok(DataType::List(Box::new(
274 inner.materialize_unknown(allow_unknown)?,
275 ))),
276 #[cfg(feature = "dtype-array")]
277 DataType::Array(inner, size) => Ok(DataType::Array(
278 Box::new(inner.materialize_unknown(allow_unknown)?),
279 size,
280 )),
281 #[cfg(feature = "dtype-struct")]
282 DataType::Struct(fields) => Ok(DataType::Struct(
283 fields
284 .into_iter()
285 .map(|f| {
286 PolarsResult::Ok(Field::new(
287 f.name,
288 f.dtype.materialize_unknown(allow_unknown)?,
289 ))
290 })
291 .try_collect_vec()?,
292 )),
293 _ => Ok(self),
294 }
295 }
296
297 #[cfg(feature = "dtype-array")]
298 pub fn get_shape(&self) -> Option<Vec<usize>> {
300 fn get_shape_impl(dt: &DataType, shape: &mut Vec<usize>) {
301 if let DataType::Array(inner, size) = dt {
302 shape.push(*size);
303 get_shape_impl(inner, shape);
304 }
305 }
306
307 if let DataType::Array(inner, size) = self {
308 let mut shape = vec![*size];
309 get_shape_impl(inner, &mut shape);
310 Some(shape)
311 } else {
312 None
313 }
314 }
315
316 pub fn inner_dtype(&self) -> Option<&DataType> {
318 match self {
319 DataType::List(inner) => Some(inner),
320 #[cfg(feature = "dtype-array")]
321 DataType::Array(inner, _) => Some(inner),
322 _ => None,
323 }
324 }
325
326 pub fn into_inner_dtype(self) -> Option<DataType> {
328 match self {
329 DataType::List(inner) => Some(*inner),
330 #[cfg(feature = "dtype-array")]
331 DataType::Array(inner, _) => Some(*inner),
332 _ => None,
333 }
334 }
335
336 pub fn try_into_inner_dtype(self) -> PolarsResult<DataType> {
338 match self {
339 DataType::List(inner) => Ok(*inner),
340 #[cfg(feature = "dtype-array")]
341 DataType::Array(inner, _) => Ok(*inner),
342 dt => polars_bail!(InvalidOperation: "cannot get inner datatype of `{dt}`"),
343 }
344 }
345
346 pub fn leaf_dtype(&self) -> &DataType {
348 let mut prev = self;
349 while let Some(dtype) = prev.inner_dtype() {
350 prev = dtype
351 }
352 prev
353 }
354
355 #[cfg(feature = "dtype-array")]
356 pub fn array_leaf_dtype(&self) -> Option<&DataType> {
358 let mut prev = self;
359 match prev {
360 DataType::Array(_, _) => {
361 while let DataType::Array(inner, _) = &prev {
362 prev = inner;
363 }
364 Some(prev)
365 },
366 _ => None,
367 }
368 }
369
370 pub fn cast_leaf(&self, to: DataType) -> DataType {
372 use DataType::*;
373 match self {
374 List(inner) => List(Box::new(inner.cast_leaf(to))),
375 #[cfg(feature = "dtype-array")]
376 Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
377 _ => to,
378 }
379 }
380
381 pub fn map_leaves<F: FnMut(DataType) -> DataType>(self, f: &mut F) -> DataType {
384 use DataType::*;
385 match self {
386 List(inner) => List(Box::new(inner.map_leaves(f))),
387 #[cfg(feature = "dtype-array")]
388 Array(inner, size) => Array(Box::new(inner.map_leaves(f)), size),
389 #[cfg(feature = "dtype-struct")]
390 Struct(fields) => {
391 let new_fields = fields
392 .into_iter()
393 .map(|fld| Field::new(fld.name, fld.dtype.map_leaves(f)))
394 .collect();
395 Struct(new_fields)
396 },
397 #[cfg(feature = "dtype-extension")]
398 Extension(ext, storage) => Extension(ext, Box::new(storage.map_leaves(f))),
399 _ => f(self),
400 }
401 }
402
403 pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
407 if self == to {
408 return Some(true);
409 }
410 if self.is_primitive_numeric() && to.is_primitive_numeric() {
411 return Some(true);
412 }
413
414 if self.is_null() {
415 return Some(true);
416 }
417
418 use DataType as D;
419 Some(match (self, to) {
420 #[cfg(feature = "dtype-categorical")]
421 (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
422 | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false, #[cfg(feature = "object")]
425 (D::Object(_), D::Object(_)) => true,
426 #[cfg(feature = "object")]
427 (D::Object(_), _) | (_, D::Object(_)) => false,
428
429 (D::Boolean, dt) | (dt, D::Boolean) => match dt {
430 dt if dt.is_primitive_numeric() => true,
431 #[cfg(feature = "dtype-decimal")]
432 D::Decimal(_, _) => true,
433 D::String | D::Binary => true,
434 _ => false,
435 },
436
437 (D::List(from), D::List(to)) => from.can_cast_to(to)?,
438 #[cfg(feature = "dtype-array")]
439 (D::Array(from, l_width), D::Array(to, r_width)) => {
440 l_width == r_width && from.can_cast_to(to)?
441 },
442 #[cfg(feature = "dtype-struct")]
443 (D::Struct(l_fields), D::Struct(r_fields)) => {
444 if l_fields.is_empty() {
445 return Some(true);
446 }
447
448 if l_fields.len() != r_fields.len() {
449 return Some(false);
450 }
451
452 for (l, r) in l_fields.iter().zip(r_fields) {
453 if !l.dtype().can_cast_to(r.dtype())? {
454 return Some(false);
455 }
456 }
457
458 true
459 },
460
461 _ => return None,
463 })
464 }
465
466 pub fn implode(self) -> DataType {
467 DataType::List(Box::new(self))
468 }
469
470 #[must_use]
472 pub fn to_physical(&self) -> DataType {
473 use DataType::*;
474 match self {
475 Date => Int32,
476 Datetime(_, _) => Int64,
477 Duration(_) => Int64,
478 Time => Int64,
479 #[cfg(feature = "dtype-decimal")]
480 Decimal(_, _) => Int128,
481 #[cfg(feature = "dtype-categorical")]
482 Categorical(cats, _) => cats.physical().dtype(),
483 #[cfg(feature = "dtype-categorical")]
484 Enum(fcats, _) => fcats.physical().dtype(),
485 #[cfg(feature = "dtype-array")]
486 Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
487 List(dt) => List(Box::new(dt.to_physical())),
488 #[cfg(feature = "dtype-struct")]
489 Struct(fields) => {
490 let new_fields = fields
491 .iter()
492 .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
493 .collect();
494 Struct(new_fields)
495 },
496 #[cfg(feature = "dtype-extension")]
497 Extension(_, storage) => storage.to_physical(),
498 _ => self.clone(),
499 }
500 }
501
502 #[must_use]
503 pub fn to_storage(&self) -> DataType {
504 use DataType::*;
505 match self {
506 #[cfg(feature = "dtype-extension")]
507 Extension(_, storage) => storage.to_storage(),
508 _ => self.clone(),
509 }
510 }
511
512 pub fn is_supported_list_arithmetic_input(&self) -> bool {
513 self.is_primitive_numeric() || self.is_bool() || self.is_null()
514 }
515
516 pub fn is_logical(&self) -> bool {
518 self != &self.to_physical()
519 }
520
521 pub fn is_temporal(&self) -> bool {
523 use DataType::*;
524 matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
525 }
526
527 pub fn is_primitive(&self) -> bool {
530 self.is_primitive_numeric()
531 | matches!(
532 self,
533 DataType::Boolean | DataType::String | DataType::Binary
534 )
535 }
536
537 pub fn is_primitive_numeric(&self) -> bool {
539 self.is_float() || self.is_integer()
540 }
541
542 pub fn is_bool(&self) -> bool {
544 matches!(self, DataType::Boolean)
545 }
546
547 pub fn is_list(&self) -> bool {
549 matches!(self, DataType::List(_))
550 }
551
552 pub fn is_array(&self) -> bool {
554 #[cfg(feature = "dtype-array")]
555 {
556 matches!(self, DataType::Array(_, _))
557 }
558 #[cfg(not(feature = "dtype-array"))]
559 {
560 false
561 }
562 }
563
564 pub fn is_nested(&self) -> bool {
565 match self {
566 DataType::List(_) => true,
567 #[cfg(feature = "dtype-array")]
568 DataType::Array(_, _) => true,
569 #[cfg(feature = "dtype-struct")]
570 DataType::Struct(_) => true,
571 #[cfg(feature = "dtype-extension")]
572 DataType::Extension(_, storage) => storage.is_nested(),
573 _ => false,
574 }
575 }
576
577 pub fn is_struct(&self) -> bool {
579 #[cfg(feature = "dtype-struct")]
580 {
581 matches!(self, DataType::Struct(_))
582 }
583 #[cfg(not(feature = "dtype-struct"))]
584 {
585 false
586 }
587 }
588
589 pub fn is_binary(&self) -> bool {
590 matches!(self, DataType::Binary)
591 }
592
593 pub fn is_date(&self) -> bool {
594 matches!(self, DataType::Date)
595 }
596 pub fn is_datetime(&self) -> bool {
597 matches!(self, DataType::Datetime(..))
598 }
599
600 pub fn is_duration(&self) -> bool {
601 matches!(self, DataType::Duration(..))
602 }
603
604 pub fn is_object(&self) -> bool {
605 #[cfg(feature = "object")]
606 {
607 matches!(self, DataType::Object(_))
608 }
609 #[cfg(not(feature = "object"))]
610 {
611 false
612 }
613 }
614
615 pub fn is_null(&self) -> bool {
616 matches!(self, DataType::Null)
617 }
618
619 pub fn contains_views(&self) -> bool {
620 use DataType::*;
621 match self {
622 Binary | String => true,
623 List(inner) => inner.contains_views(),
624 #[cfg(feature = "dtype-array")]
625 Array(inner, _) => inner.contains_views(),
626 #[cfg(feature = "dtype-struct")]
627 Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
628 _ => false,
629 }
630 }
631
632 pub fn contains_categoricals(&self) -> bool {
633 use DataType::*;
634 match self {
635 #[cfg(feature = "dtype-categorical")]
636 Categorical(_, _) | Enum(_, _) => true,
637 List(inner) => inner.contains_categoricals(),
638 #[cfg(feature = "dtype-array")]
639 Array(inner, _) => inner.contains_categoricals(),
640 #[cfg(feature = "dtype-struct")]
641 Struct(fields) => fields
642 .iter()
643 .any(|field| field.dtype.contains_categoricals()),
644 _ => false,
645 }
646 }
647
648 pub fn contains_objects(&self) -> bool {
649 use DataType::*;
650 match self {
651 #[cfg(feature = "object")]
652 Object(_) => true,
653 List(inner) => inner.contains_objects(),
654 #[cfg(feature = "dtype-array")]
655 Array(inner, _) => inner.contains_objects(),
656 #[cfg(feature = "dtype-struct")]
657 Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
658 _ => false,
659 }
660 }
661
662 pub fn contains_list_recursive(&self) -> bool {
663 use DataType as D;
664 match self {
665 D::List(_) => true,
666 #[cfg(feature = "dtype-array")]
667 D::Array(inner, _) => inner.contains_list_recursive(),
668 #[cfg(feature = "dtype-struct")]
669 D::Struct(fields) => fields
670 .iter()
671 .any(|field| field.dtype.contains_list_recursive()),
672 _ => false,
673 }
674 }
675
676 pub fn contains_unknown(&self) -> bool {
677 use DataType as D;
678 match self {
679 D::Unknown(_) => true,
680 D::List(inner) => inner.contains_unknown(),
681 #[cfg(feature = "dtype-array")]
682 D::Array(inner, _) => inner.contains_unknown(),
683 #[cfg(feature = "dtype-struct")]
684 D::Struct(fields) => fields.iter().any(|field| field.dtype.contains_unknown()),
685 _ => false,
686 }
687 }
688
689 pub fn is_ord(&self) -> bool {
691 let phys = self.to_physical();
692 phys.is_primitive_numeric()
693 || self.is_decimal()
694 || matches!(
695 phys,
696 DataType::Binary | DataType::String | DataType::Boolean
697 )
698 }
699
700 pub fn is_decimal(&self) -> bool {
702 match self {
703 #[cfg(feature = "dtype-decimal")]
704 DataType::Decimal(_, _) => true,
705 _ => false,
706 }
707 }
708
709 pub fn is_float(&self) -> bool {
712 matches!(
713 self,
714 DataType::Float16
715 | DataType::Float32
716 | DataType::Float64
717 | DataType::Unknown(UnknownKind::Float)
718 )
719 }
720
721 pub fn is_integer(&self) -> bool {
723 matches!(
724 self,
725 DataType::Int8
726 | DataType::Int16
727 | DataType::Int32
728 | DataType::Int64
729 | DataType::Int128
730 | DataType::UInt8
731 | DataType::UInt16
732 | DataType::UInt32
733 | DataType::UInt64
734 | DataType::UInt128
735 | DataType::Unknown(UnknownKind::Int(_))
736 )
737 }
738
739 pub fn is_signed_integer(&self) -> bool {
740 matches!(
742 self,
743 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
744 )
745 }
746
747 pub fn is_unsigned_integer(&self) -> bool {
748 matches!(
749 self,
750 DataType::UInt8
751 | DataType::UInt16
752 | DataType::UInt32
753 | DataType::UInt64
754 | DataType::UInt128,
755 )
756 }
757
758 pub fn is_string(&self) -> bool {
759 matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
760 }
761
762 pub fn is_categorical(&self) -> bool {
763 #[cfg(feature = "dtype-categorical")]
764 {
765 matches!(self, DataType::Categorical(_, _))
766 }
767 #[cfg(not(feature = "dtype-categorical"))]
768 {
769 false
770 }
771 }
772
773 pub fn is_enum(&self) -> bool {
774 #[cfg(feature = "dtype-categorical")]
775 {
776 matches!(self, DataType::Enum(_, _))
777 }
778 #[cfg(not(feature = "dtype-categorical"))]
779 {
780 false
781 }
782 }
783
784 pub fn is_extension(&self) -> bool {
785 #[cfg(feature = "dtype-extension")]
786 {
787 matches!(self, DataType::Extension(_, _))
788 }
789 #[cfg(not(feature = "dtype-extension"))]
790 {
791 false
792 }
793 }
794
795 pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
797 let metadata = match self {
798 #[cfg(feature = "dtype-categorical")]
799 DataType::Enum(fcats, _map) => {
800 let cats = fcats.categories();
801 let strings_size: usize = cats
802 .values_iter()
803 .map(|s| (s.len() + 1).ilog10() as usize + 1 + s.len())
804 .sum();
805 let mut encoded = String::with_capacity(strings_size);
806 for cat in cats.values_iter() {
807 encoded.push_str(itoa::Buffer::new().format(cat.len()));
808 encoded.push(';');
809 encoded.push_str(cat);
810 }
811 Some(BTreeMap::from([(
812 PlSmallStr::from_static(DTYPE_ENUM_VALUES_NEW),
813 PlSmallStr::from_string(encoded),
814 )]))
815 },
816 #[cfg(feature = "dtype-categorical")]
817 DataType::Categorical(cats, _) => {
818 let mut encoded = String::new();
819 encoded.push_str(itoa::Buffer::new().format(cats.name().len()));
820 encoded.push(';');
821 encoded.push_str(cats.name());
822 encoded.push_str(itoa::Buffer::new().format(cats.namespace().len()));
823 encoded.push(';');
824 encoded.push_str(cats.namespace());
825 encoded.push_str(cats.physical().as_str());
826 encoded.push(';');
827
828 Some(BTreeMap::from([(
829 PlSmallStr::from_static(DTYPE_CATEGORICAL_NEW),
830 PlSmallStr::from_string(encoded),
831 )]))
832 },
833 DataType::BinaryOffset => Some(BTreeMap::from([(
834 PlSmallStr::from_static(PL_KEY),
835 PlSmallStr::from_static(MAINTAIN_PL_TYPE),
836 )])),
837 _ => None,
838 };
839
840 let field = ArrowField::new(name, self.to_arrow(compat_level), true);
841
842 if let Some(metadata) = metadata {
843 field.with_metadata(metadata)
844 } else {
845 field
846 }
847 }
848
849 pub fn max(&self) -> PolarsResult<Scalar> {
851 use DataType::*;
852 let v = match self {
853 Int8 => Scalar::from(i8::MAX),
854 Int16 => Scalar::from(i16::MAX),
855 Int32 => Scalar::from(i32::MAX),
856 Int64 => Scalar::from(i64::MAX),
857 Int128 => Scalar::from(i128::MAX),
858 UInt8 => Scalar::from(u8::MAX),
859 UInt16 => Scalar::from(u16::MAX),
860 UInt32 => Scalar::from(u32::MAX),
861 UInt64 => Scalar::from(u64::MAX),
862 UInt128 => Scalar::from(u128::MAX),
863 Float16 => Scalar::from(pf16::INFINITY),
864 Float32 => Scalar::from(f32::INFINITY),
865 Float64 => Scalar::from(f64::INFINITY),
866 #[cfg(feature = "dtype-time")]
867 Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
868 dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
869 };
870 Ok(v)
871 }
872
873 pub fn min(&self) -> PolarsResult<Scalar> {
875 use DataType::*;
876 let v = match self {
877 Int8 => Scalar::from(i8::MIN),
878 Int16 => Scalar::from(i16::MIN),
879 Int32 => Scalar::from(i32::MIN),
880 Int64 => Scalar::from(i64::MIN),
881 Int128 => Scalar::from(i128::MIN),
882 UInt8 => Scalar::from(u8::MIN),
883 UInt16 => Scalar::from(u16::MIN),
884 UInt32 => Scalar::from(u32::MIN),
885 UInt64 => Scalar::from(u64::MIN),
886 UInt128 => Scalar::from(u128::MIN),
887 Float16 => Scalar::from(pf16::NEG_INFINITY),
888 Float32 => Scalar::from(f32::NEG_INFINITY),
889 Float64 => Scalar::from(f64::NEG_INFINITY),
890 #[cfg(feature = "dtype-time")]
891 Time => Scalar::new(Time, AnyValue::Time(0)),
892 dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
893 };
894 Ok(v)
895 }
896
897 #[inline]
899 pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
900 self.try_to_arrow(compat_level).unwrap()
901 }
902
903 #[inline]
904 pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
905 use DataType::*;
906 match self {
907 Boolean => Ok(ArrowDataType::Boolean),
908 UInt8 => Ok(ArrowDataType::UInt8),
909 UInt16 => Ok(ArrowDataType::UInt16),
910 UInt32 => Ok(ArrowDataType::UInt32),
911 UInt64 => Ok(ArrowDataType::UInt64),
912 UInt128 => Ok(ArrowDataType::UInt128),
913 Int8 => Ok(ArrowDataType::Int8),
914 Int16 => Ok(ArrowDataType::Int16),
915 Int32 => Ok(ArrowDataType::Int32),
916 Int64 => Ok(ArrowDataType::Int64),
917 Int128 => Ok(ArrowDataType::Int128),
918 Float16 => Ok(ArrowDataType::Float16),
919 Float32 => Ok(ArrowDataType::Float32),
920 Float64 => Ok(ArrowDataType::Float64),
921 #[cfg(feature = "dtype-decimal")]
922 Decimal(precision, scale) => {
923 assert!(*precision >= 1 && *precision <= 38);
924 Ok(ArrowDataType::Decimal(*precision, *scale))
925 },
926 String => {
927 let dt = if compat_level.0 >= 1 {
928 ArrowDataType::Utf8View
929 } else {
930 ArrowDataType::LargeUtf8
931 };
932 Ok(dt)
933 },
934 Binary => {
935 let dt = if compat_level.0 >= 1 {
936 ArrowDataType::BinaryView
937 } else {
938 ArrowDataType::LargeBinary
939 };
940 Ok(dt)
941 },
942 Date => Ok(ArrowDataType::Date32),
943 Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(
944 unit.to_arrow(),
945 tz.as_deref().cloned(),
946 )),
947 Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
948 Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
949 #[cfg(feature = "dtype-array")]
950 Array(dt, width) => Ok(ArrowDataType::FixedSizeList(
951 Box::new(dt.to_arrow_field(LIST_VALUES_NAME, compat_level)),
952 *width,
953 )),
954 List(dt) => Ok(ArrowDataType::LargeList(Box::new(
955 dt.to_arrow_field(LIST_VALUES_NAME, compat_level),
956 ))),
957 Null => Ok(ArrowDataType::Null),
958 #[cfg(feature = "object")]
959 Object(_) => Ok(get_object_physical_type()),
960 #[cfg(feature = "dtype-categorical")]
961 Categorical(_, _) | Enum(_, _) => {
962 let arrow_phys = match self.cat_physical().unwrap() {
963 CategoricalPhysical::U8 => IntegerType::UInt8,
964 CategoricalPhysical::U16 => IntegerType::UInt16,
965 CategoricalPhysical::U32 => IntegerType::UInt32,
966 };
967
968 let values = if compat_level.0 >= 1 {
969 ArrowDataType::Utf8View
970 } else {
971 ArrowDataType::LargeUtf8
972 };
973
974 Ok(ArrowDataType::Dictionary(
975 arrow_phys,
976 Box::new(values),
977 false,
978 ))
979 },
980 #[cfg(feature = "dtype-struct")]
981 Struct(fields) => {
982 let fields = fields
983 .iter()
984 .map(|fld| fld.to_arrow(compat_level))
985 .collect();
986 Ok(ArrowDataType::Struct(fields))
987 },
988 BinaryOffset => Ok(ArrowDataType::LargeBinary),
989 #[cfg(feature = "dtype-extension")]
990 Extension(typ, inner) => Ok(ArrowDataType::Extension(Box::new(
991 arrow::datatypes::ExtensionType {
992 name: typ.name().into(),
993 inner: inner.try_to_arrow(compat_level)?,
994 metadata: typ.serialize_metadata().map(|m| m.into()),
995 },
996 ))),
997 Unknown(kind) => {
998 let dt = match kind {
999 UnknownKind::Any => ArrowDataType::Unknown,
1000 UnknownKind::Float => ArrowDataType::Float64,
1001 UnknownKind::Str => ArrowDataType::Utf8View,
1002 UnknownKind::Int(v) => {
1003 return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
1004 },
1005 };
1006 Ok(dt)
1007 },
1008 }
1009 }
1010
1011 pub fn is_nested_null(&self) -> bool {
1012 use DataType::*;
1013 match self {
1014 Null => true,
1015 List(field) => field.is_nested_null(),
1016 #[cfg(feature = "dtype-array")]
1017 Array(field, _) => field.is_nested_null(),
1018 #[cfg(feature = "dtype-struct")]
1019 Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
1020 _ => false,
1021 }
1022 }
1023
1024 pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
1031 match (self, schema_type) {
1032 (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
1033 #[cfg(feature = "dtype-array")]
1034 (DataType::Array(l, sl), DataType::Array(r, sr)) => {
1035 Ok(l.matches_schema_type(r)? && sl == sr)
1036 },
1037 #[cfg(feature = "dtype-struct")]
1038 (DataType::Struct(l), DataType::Struct(r)) => {
1039 if l.len() != r.len() {
1040 polars_bail!(SchemaMismatch: "structs have different number of fields: {} vs {}", l.len(), r.len());
1041 }
1042 let mut must_cast = false;
1043 for (l, r) in l.iter().zip(r.iter()) {
1044 must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
1045 }
1046 Ok(must_cast)
1047 },
1048 (DataType::Null, DataType::Null) => Ok(false),
1049 #[cfg(feature = "dtype-decimal")]
1050 (DataType::Decimal(p1, s1), DataType::Decimal(p2, s2)) => Ok((p1, s1) != (p2, s2)),
1051 (DataType::Null, _) => Ok(true),
1054 #[cfg(feature = "dtype-categorical")]
1055 (DataType::Categorical(l, _), DataType::Categorical(r, _)) => {
1056 ensure_same_categories(l, r)?;
1057 Ok(false)
1058 },
1059 #[cfg(feature = "dtype-categorical")]
1060 (DataType::Enum(l, _), DataType::Enum(r, _)) => {
1061 ensure_same_frozen_categories(l, r)?;
1062 Ok(false)
1063 },
1064
1065 (l, r) if l == r => Ok(false),
1066 (l, r) => {
1067 polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
1068 },
1069 }
1070 }
1071
1072 #[inline]
1073 pub fn is_unknown(&self) -> bool {
1074 matches!(self, DataType::Unknown(_))
1075 }
1076
1077 pub fn nesting_level(&self) -> usize {
1078 let mut level = 0;
1079 let mut slf = self;
1080 while let Some(inner_dtype) = slf.inner_dtype() {
1081 level += 1;
1082 slf = inner_dtype;
1083 }
1084 level
1085 }
1086
1087 #[cfg(feature = "dtype-categorical")]
1089 pub fn cat_physical(&self) -> PolarsResult<CategoricalPhysical> {
1090 match self {
1091 DataType::Categorical(cats, _) => Ok(cats.physical()),
1092 DataType::Enum(fcats, _) => Ok(fcats.physical()),
1093 _ => {
1094 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1095 },
1096 }
1097 }
1098
1099 #[cfg(feature = "dtype-categorical")]
1101 pub fn cat_mapping(&self) -> PolarsResult<&Arc<CategoricalMapping>> {
1102 match self {
1103 DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => Ok(mapping),
1104 _ => {
1105 polars_bail!(SchemaMismatch: "invalid dtype: expected an Enum or Categorical type, received '{:?}'", self)
1106 },
1107 }
1108 }
1109
1110 #[cfg(feature = "dtype-categorical")]
1111 pub fn from_categories(cats: Arc<Categories>) -> Self {
1112 let mapping = cats.mapping();
1113 Self::Categorical(cats, mapping)
1114 }
1115
1116 #[cfg(feature = "dtype-categorical")]
1117 pub fn from_frozen_categories(fcats: Arc<FrozenCategories>) -> Self {
1118 let mapping = fcats.mapping().clone();
1119 Self::Enum(fcats, mapping)
1120 }
1121
1122 pub fn is_numeric(&self) -> bool {
1123 self.is_integer() || self.is_float() || self.is_decimal()
1124 }
1125}
1126
1127impl Display for DataType {
1128 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1129 let s = match self {
1130 DataType::Null => "null",
1131 DataType::Boolean => "bool",
1132 DataType::UInt8 => "u8",
1133 DataType::UInt16 => "u16",
1134 DataType::UInt32 => "u32",
1135 DataType::UInt64 => "u64",
1136 DataType::UInt128 => "u128",
1137 DataType::Int8 => "i8",
1138 DataType::Int16 => "i16",
1139 DataType::Int32 => "i32",
1140 DataType::Int64 => "i64",
1141 DataType::Int128 => "i128",
1142 DataType::Float16 => "f16",
1143 DataType::Float32 => "f32",
1144 DataType::Float64 => "f64",
1145 #[cfg(feature = "dtype-decimal")]
1146 DataType::Decimal(p, s) => return write!(f, "decimal[{p},{s}]"),
1147 DataType::String => "str",
1148 DataType::Binary => "binary",
1149 DataType::BinaryOffset => "binary[offset]",
1150 DataType::Date => "date",
1151 DataType::Datetime(tu, None) => return write!(f, "datetime[{tu}]"),
1152 DataType::Datetime(tu, Some(tz)) => return write!(f, "datetime[{tu}, {tz}]"),
1153 DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
1154 DataType::Time => "time",
1155 #[cfg(feature = "dtype-array")]
1156 DataType::Array(_, _) => {
1157 let tp = self.array_leaf_dtype().unwrap();
1158
1159 let dims = self.get_shape().unwrap();
1160 let shape = if dims.len() == 1 {
1161 format!("{}", dims[0])
1162 } else {
1163 format_tuple!(dims)
1164 };
1165 return write!(f, "array[{tp}, {shape}]");
1166 },
1167 DataType::List(tp) => return write!(f, "list[{tp}]"),
1168 #[cfg(feature = "object")]
1169 DataType::Object(s) => s,
1170 #[cfg(feature = "dtype-categorical")]
1171 DataType::Categorical(_, _) => "cat",
1172 #[cfg(feature = "dtype-categorical")]
1173 DataType::Enum(_, _) => "enum",
1174 #[cfg(feature = "dtype-struct")]
1175 DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
1176 #[cfg(feature = "dtype-extension")]
1177 DataType::Extension(typ, _) => return write!(f, "ext[{}]", typ.0.dyn_display()),
1178 DataType::Unknown(kind) => match kind {
1179 UnknownKind::Any => "unknown",
1180 UnknownKind::Int(_) => "dyn int",
1181 UnknownKind::Float => "dyn float",
1182 UnknownKind::Str => "dyn str",
1183 },
1184 };
1185 f.write_str(s)
1186 }
1187}
1188
1189impl std::fmt::Debug for DataType {
1190 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1191 use DataType::*;
1192 match self {
1193 Boolean => write!(f, "Boolean"),
1194 UInt8 => write!(f, "UInt8"),
1195 UInt16 => write!(f, "UInt16"),
1196 UInt32 => write!(f, "UInt32"),
1197 UInt64 => write!(f, "UInt64"),
1198 UInt128 => write!(f, "UInt128"),
1199 Int8 => write!(f, "Int8"),
1200 Int16 => write!(f, "Int16"),
1201 Int32 => write!(f, "Int32"),
1202 Int64 => write!(f, "Int64"),
1203 Int128 => write!(f, "Int128"),
1204 Float16 => write!(f, "Float16"),
1205 Float32 => write!(f, "Float32"),
1206 Float64 => write!(f, "Float64"),
1207 String => write!(f, "String"),
1208 Binary => write!(f, "Binary"),
1209 BinaryOffset => write!(f, "BinaryOffset"),
1210 Date => write!(f, "Date"),
1211 Time => write!(f, "Time"),
1212 Duration(unit) => write!(f, "Duration('{unit}')"),
1213 Datetime(unit, opt_tz) => {
1214 if let Some(tz) = opt_tz {
1215 write!(f, "Datetime('{unit}', '{tz}')")
1216 } else {
1217 write!(f, "Datetime('{unit}')")
1218 }
1219 },
1220 #[cfg(feature = "dtype-decimal")]
1221 Decimal(p, s) => write!(f, "Decimal({p}, {s})"),
1222 #[cfg(feature = "dtype-array")]
1223 Array(inner, size) => write!(f, "Array({inner:?}, {size})"),
1224 List(inner) => write!(f, "List({inner:?})"),
1225 #[cfg(feature = "dtype-struct")]
1226 Struct(fields) => {
1227 let mut first = true;
1228 write!(f, "Struct({{")?;
1229 for field in fields {
1230 if !first {
1231 write!(f, ", ")?;
1232 }
1233 write!(f, "'{}': {:?}", field.name(), field.dtype())?;
1234 first = false;
1235 }
1236 write!(f, "}})")
1237 },
1238 #[cfg(feature = "dtype-categorical")]
1239 Categorical(cats, _) => {
1240 if cats.is_global() {
1241 write!(f, "Categorical")
1242 } else if cats.namespace().is_empty() && cats.physical() == CategoricalPhysical::U32
1243 {
1244 write!(f, "Categorical('{}')", cats.name())
1245 } else {
1246 write!(
1247 f,
1248 "Categorical('{}', '{}', {:?})",
1249 cats.name(),
1250 cats.namespace(),
1251 cats.physical()
1252 )
1253 }
1254 },
1255 #[cfg(feature = "dtype-categorical")]
1256 Enum(_, _) => write!(f, "Enum([...])"),
1257 #[cfg(feature = "object")]
1258 Object(_) => write!(f, "Object"),
1259 Null => write!(f, "Null"),
1260 #[cfg(feature = "dtype-extension")]
1261 Extension(typ, inner) => write!(f, "Extension({}, {inner:?})", typ.0.dyn_debug()),
1262 Unknown(kind) => write!(f, "Unknown({kind:?})"),
1263 }
1264 }
1265}
1266
1267pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
1268 use DataType::*;
1269 Ok(match (left, right) {
1270 #[cfg(feature = "dtype-categorical")]
1271 (Categorical(cats_l, map), Categorical(cats_r, _)) => {
1272 ensure_same_categories(cats_l, cats_r)?;
1273 Categorical(cats_l.clone(), map.clone())
1274 },
1275 #[cfg(feature = "dtype-categorical")]
1276 (Enum(fcats_l, map), Enum(fcats_r, _)) => {
1277 ensure_same_frozen_categories(fcats_l, fcats_r)?;
1278 Enum(fcats_l.clone(), map.clone())
1279 },
1280 (List(inner_l), List(inner_r)) => {
1281 let merged = merge_dtypes(inner_l, inner_r)?;
1282 List(Box::new(merged))
1283 },
1284 #[cfg(feature = "dtype-struct")]
1285 (Struct(inner_l), Struct(inner_r)) => {
1286 polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1287 let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1288 polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1289 let merged = merge_dtypes(l.dtype(), r.dtype())?;
1290 Ok(Field::new(l.name().clone(), merged))
1291 }).collect::<PolarsResult<Vec<_>>>()?;
1292 Struct(fields)
1293 },
1294 #[cfg(feature = "dtype-array")]
1295 (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1296 polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1297 let merged = merge_dtypes(inner_l, inner_r)?;
1298 Array(Box::new(merged), *width_l)
1299 },
1300 (left, right) if left == right => left.clone(),
1301 _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1302 })
1303}
1304
1305fn collect_nested_types(
1306 dtype: &DataType,
1307 result: &mut PlHashSet<DataType>,
1308 include_compound_types: bool,
1309) {
1310 match dtype {
1311 DataType::List(inner) => {
1312 if include_compound_types {
1313 result.insert(dtype.clone());
1314 }
1315 collect_nested_types(inner, result, include_compound_types);
1316 },
1317 #[cfg(feature = "dtype-array")]
1318 DataType::Array(inner, _) => {
1319 if include_compound_types {
1320 result.insert(dtype.clone());
1321 }
1322 collect_nested_types(inner, result, include_compound_types);
1323 },
1324 #[cfg(feature = "dtype-struct")]
1325 DataType::Struct(fields) => {
1326 if include_compound_types {
1327 result.insert(dtype.clone());
1328 }
1329 for field in fields {
1330 collect_nested_types(field.dtype(), result, include_compound_types);
1331 }
1332 },
1333 _ => {
1334 result.insert(dtype.clone());
1335 },
1336 }
1337}
1338
1339pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1340 let mut result = PlHashSet::new();
1341 collect_nested_types(dtype, &mut result, include_compound_types);
1342 result
1343}
1344
/// Compatibility level, stored as a plain `u16`.
///
/// The wrapped value is crate-private; the associated functions on this type use `0` for
/// the oldest level and `1` for the newest one.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CompatLevel(pub(crate) u16);
1349
1350impl CompatLevel {
1351 pub const fn newest() -> CompatLevel {
1352 CompatLevel(1)
1353 }
1354
1355 pub const fn oldest() -> CompatLevel {
1356 CompatLevel(0)
1357 }
1358
1359 #[doc(hidden)]
1362 pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1363 if level > CompatLevel::newest().0 {
1364 polars_bail!(InvalidOperation: "invalid compat level");
1365 }
1366 Ok(CompatLevel(level))
1367 }
1368
1369 #[doc(hidden)]
1370 pub fn get_level(&self) -> u16 {
1371 self.0
1372 }
1373}
1374
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `List(Array(Float64, 10))` and returns it together with its `Array` layer.
    #[cfg(feature = "dtype-array")]
    fn nested_list_and_array() -> (DataType, DataType) {
        let array_type = DataType::Array(Box::new(DataType::Float64), 10);
        let list_type = DataType::List(Box::new(array_type.clone()));
        (list_type, array_type)
    }

    // Without compound types only the innermost dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (list_type, _array_type) = nested_list_and_array();

        let mut expected = PlHashSet::new();
        expected.insert(DataType::Float64);

        assert_eq!(unpack_dtypes(&list_type, false), expected);
    }

    // With compound types every layer of the nesting is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (list_type, array_type) = nested_list_and_array();

        let unpacked = unpack_dtypes(&list_type, true);

        let mut expected = PlHashSet::new();
        expected.extend([list_type, array_type, DataType::Float64]);

        assert_eq!(unpacked, expected);
    }
}