1use std::collections::BTreeMap;
2
3use arrow::datatypes::{DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Metadata};
4#[cfg(feature = "dtype-array")]
5use polars_utils::format_tuple;
6use polars_utils::itertools::Itertools;
7#[cfg(any(feature = "serde-lazy", feature = "serde"))]
8use serde::{Deserialize, Serialize};
9use strum_macros::IntoStaticStr;
10
11use super::*;
12#[cfg(feature = "object")]
13use crate::chunked_array::object::registry::get_object_physical_type;
14use crate::utils::materialize_dyn_int;
15
16pub type TimeZone = PlSmallStr;
17
/// Metadata value written under [`PL_KEY`] for `BinaryOffset` fields and
/// checked by `MetaDataExt::maintain_type`.
static MAINTAIN_PL_TYPE: &str = "maintain_type";
/// Metadata key under which [`MAINTAIN_PL_TYPE`] is stored.
static PL_KEY: &str = "pl";
20
21pub trait MetaDataExt: IntoMetadata {
22 fn is_enum(&self) -> bool {
23 let metadata = self.into_metadata_ref();
24 metadata.get(DTYPE_ENUM_VALUES).is_some()
25 }
26
27 fn categorical(&self) -> Option<CategoricalOrdering> {
28 let metadata = self.into_metadata_ref();
29 match metadata.get(DTYPE_CATEGORICAL)?.as_str() {
30 "lexical" => Some(CategoricalOrdering::Lexical),
31 _ => Some(CategoricalOrdering::Physical),
33 }
34 }
35
36 fn maintain_type(&self) -> bool {
37 let metadata = self.into_metadata_ref();
38 metadata.get(PL_KEY).map(|s| s.as_str()) == Some(MAINTAIN_PL_TYPE)
39 }
40}
41
42impl MetaDataExt for Metadata {}
43pub trait IntoMetadata {
44 #[allow(clippy::wrong_self_convention)]
45 fn into_metadata_ref(&self) -> &Metadata;
46}
47
48impl IntoMetadata for Metadata {
49 fn into_metadata_ref(&self) -> &Metadata {
50 self
51 }
52}
53
/// Refinement of a not-yet-resolved dtype (`DataType::Unknown`).
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(
    any(feature = "serde", feature = "serde-lazy"),
    derive(Serialize, Deserialize)
)]
pub enum UnknownKind {
    /// A dynamic integer; carries the integer value.
    Int(i128),
    /// A dynamic float.
    Float,
    /// A dynamic string.
    Str,
    /// Nothing is known about the type (the default).
    #[default]
    Any,
}
68
69impl UnknownKind {
70 pub fn materialize(&self) -> Option<DataType> {
71 let dtype = match self {
72 UnknownKind::Int(v) => materialize_dyn_int(*v).dtype(),
73 UnknownKind::Float => DataType::Float64,
74 UnknownKind::Str => DataType::String,
75 UnknownKind::Any => return None,
76 };
77 Some(dtype)
78 }
79}
80
81#[derive(Debug, Copy, Clone, PartialEq, Default, IntoStaticStr)]
82#[cfg_attr(
83 any(feature = "serde-lazy", feature = "serde"),
84 derive(Serialize, Deserialize)
85)]
86#[strum(serialize_all = "snake_case")]
87pub enum CategoricalOrdering {
88 #[default]
89 Physical,
90 Lexical,
91}
92
93#[derive(Clone, Debug)]
94pub enum DataType {
95 Boolean,
96 UInt8,
97 UInt16,
98 UInt32,
99 UInt64,
100 Int8,
101 Int16,
102 Int32,
103 Int64,
104 Int128,
105 Float32,
106 Float64,
107 #[cfg(feature = "dtype-decimal")]
111 Decimal(Option<usize>, Option<usize>), String,
114 Binary,
115 BinaryOffset,
116 Date,
119 Datetime(TimeUnit, Option<TimeZone>),
122 Duration(TimeUnit),
124 Time,
126 #[cfg(feature = "dtype-array")]
128 Array(Box<DataType>, usize),
129 List(Box<DataType>),
131 #[cfg(feature = "object")]
134 Object(&'static str),
135 Null,
136 #[cfg(feature = "dtype-categorical")]
139 Categorical(Option<Arc<RevMapping>>, CategoricalOrdering),
140 #[cfg(feature = "dtype-categorical")]
142 Enum(Option<Arc<RevMapping>>, CategoricalOrdering),
143 #[cfg(feature = "dtype-struct")]
144 Struct(Vec<Field>),
145 Unknown(UnknownKind),
147}
148
149impl Default for DataType {
150 fn default() -> Self {
151 DataType::Unknown(UnknownKind::Any)
152 }
153}
154
155pub trait AsRefDataType {
156 fn as_ref_dtype(&self) -> &DataType;
157}
158
159impl Hash for DataType {
160 fn hash<H: Hasher>(&self, state: &mut H) {
161 std::mem::discriminant(self).hash(state)
162 }
163}
164
165impl PartialEq for DataType {
166 fn eq(&self, other: &Self) -> bool {
167 use DataType::*;
168 {
169 match (self, other) {
170 #[cfg(feature = "dtype-categorical")]
171 (Categorical(_, _ordering_l), Categorical(_, _ordering_r)) => true,
174 #[cfg(feature = "dtype-categorical")]
175 (Enum(None, _), Enum(_, _)) | (Enum(_, _), Enum(None, _)) => true,
177 #[cfg(feature = "dtype-categorical")]
178 (Enum(Some(cat_lhs), _), Enum(Some(cat_rhs), _)) => {
179 cat_lhs.get_categories() == cat_rhs.get_categories()
180 },
181 (Datetime(tu_l, tz_l), Datetime(tu_r, tz_r)) => tu_l == tu_r && tz_l == tz_r,
182 (List(left_inner), List(right_inner)) => left_inner == right_inner,
183 #[cfg(feature = "dtype-duration")]
184 (Duration(tu_l), Duration(tu_r)) => tu_l == tu_r,
185 #[cfg(feature = "dtype-decimal")]
186 (Decimal(l_prec, l_scale), Decimal(r_prec, r_scale)) => {
187 let is_prec_eq = l_prec.is_none() || r_prec.is_none() || l_prec == r_prec;
188 let is_scale_eq = l_scale.is_none() || r_scale.is_none() || l_scale == r_scale;
189
190 is_prec_eq && is_scale_eq
191 },
192 #[cfg(feature = "object")]
193 (Object(lhs), Object(rhs)) => lhs == rhs,
194 #[cfg(feature = "dtype-struct")]
195 (Struct(lhs), Struct(rhs)) => Vec::as_ptr(lhs) == Vec::as_ptr(rhs) || lhs == rhs,
196 #[cfg(feature = "dtype-array")]
197 (Array(left_inner, left_width), Array(right_inner, right_width)) => {
198 left_width == right_width && left_inner == right_inner
199 },
200 (Unknown(l), Unknown(r)) => match (l, r) {
201 (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
202 _ => l == r,
203 },
204 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
205 }
206 }
207 }
208}
209
210impl Eq for DataType {}
211
212impl DataType {
213 pub fn new_idxsize() -> Self {
214 #[cfg(feature = "bigidx")]
215 {
216 Self::UInt64
217 }
218 #[cfg(not(feature = "bigidx"))]
219 {
220 Self::UInt32
221 }
222 }
223
224 pub(crate) fn canonical_timezone(tz: &Option<PlSmallStr>) -> Option<TimeZone> {
226 match tz.as_deref() {
227 Some("") | None => None,
228 #[cfg(feature = "timezones")]
229 Some("+00:00") | Some("00:00") | Some("utc") => Some(PlSmallStr::from_static("UTC")),
230 Some(v) => Some(PlSmallStr::from_str(v)),
231 }
232 }
233
234 pub fn value_within_range(&self, other: AnyValue) -> bool {
235 use DataType::*;
236 match self {
237 UInt8 => other.extract::<u8>().is_some(),
238 #[cfg(feature = "dtype-u16")]
239 UInt16 => other.extract::<u16>().is_some(),
240 UInt32 => other.extract::<u32>().is_some(),
241 UInt64 => other.extract::<u64>().is_some(),
242 #[cfg(feature = "dtype-i8")]
243 Int8 => other.extract::<i8>().is_some(),
244 #[cfg(feature = "dtype-i16")]
245 Int16 => other.extract::<i16>().is_some(),
246 Int32 => other.extract::<i32>().is_some(),
247 Int64 => other.extract::<i64>().is_some(),
248 _ => false,
249 }
250 }
251
252 pub fn is_known(&self) -> bool {
254 match self {
255 DataType::List(inner) => inner.is_known(),
256 #[cfg(feature = "dtype-array")]
257 DataType::Array(inner, _) => inner.is_known(),
258 #[cfg(feature = "dtype-struct")]
259 DataType::Struct(fields) => fields.iter().all(|fld| fld.dtype.is_known()),
260 DataType::Unknown(_) => false,
261 _ => true,
262 }
263 }
264
265 pub fn materialize_unknown(self, allow_unknown: bool) -> PolarsResult<DataType> {
268 match self {
269 DataType::Unknown(u) => match u.materialize() {
270 Some(known) => Ok(known),
271 None => {
272 if allow_unknown {
273 Ok(DataType::Unknown(u))
274 } else {
275 polars_bail!(SchemaMismatch: "failed to materialize unknown type")
276 }
277 },
278 },
279 DataType::List(inner) => Ok(DataType::List(Box::new(
280 inner.materialize_unknown(allow_unknown)?,
281 ))),
282 #[cfg(feature = "dtype-array")]
283 DataType::Array(inner, size) => Ok(DataType::Array(
284 Box::new(inner.materialize_unknown(allow_unknown)?),
285 size,
286 )),
287 #[cfg(feature = "dtype-struct")]
288 DataType::Struct(fields) => Ok(DataType::Struct(
289 fields
290 .into_iter()
291 .map(|f| {
292 PolarsResult::Ok(Field::new(
293 f.name,
294 f.dtype.materialize_unknown(allow_unknown)?,
295 ))
296 })
297 .try_collect_vec()?,
298 )),
299 _ => Ok(self),
300 }
301 }
302
/// Returns the widths of directly nested `Array` dtypes, outermost first.
///
/// Yields `None` when `self` is not an `Array`.
#[cfg(feature = "dtype-array")]
pub fn get_shape(&self) -> Option<Vec<usize>> {
    if !matches!(self, DataType::Array(..)) {
        return None;
    }

    let mut dims = Vec::new();
    let mut current = self;
    // Walk down while the dtype is still an array, recording each width.
    while let DataType::Array(inner, width) = current {
        dims.push(*width);
        current = inner.as_ref();
    }
    Some(dims)
}
321
322 pub fn inner_dtype(&self) -> Option<&DataType> {
324 match self {
325 DataType::List(inner) => Some(inner),
326 #[cfg(feature = "dtype-array")]
327 DataType::Array(inner, _) => Some(inner),
328 _ => None,
329 }
330 }
331
332 pub fn leaf_dtype(&self) -> &DataType {
334 let mut prev = self;
335 while let Some(dtype) = prev.inner_dtype() {
336 prev = dtype
337 }
338 prev
339 }
340
/// Returns the first non-`Array` dtype found by descending through nested
/// arrays, or `None` when `self` is not an `Array`.
#[cfg(feature = "dtype-array")]
pub fn array_leaf_dtype(&self) -> Option<&DataType> {
    if !matches!(self, DataType::Array(..)) {
        return None;
    }

    let mut leaf = self;
    while let DataType::Array(inner, _) = leaf {
        leaf = inner.as_ref();
    }
    Some(leaf)
}
355
356 pub fn cast_leaf(&self, to: DataType) -> DataType {
358 use DataType::*;
359 match self {
360 List(inner) => List(Box::new(inner.cast_leaf(to))),
361 #[cfg(feature = "dtype-array")]
362 Array(inner, size) => Array(Box::new(inner.cast_leaf(to)), *size),
363 _ => to,
364 }
365 }
366
367 pub fn can_cast_to(&self, to: &DataType) -> Option<bool> {
371 if self == to {
372 return Some(true);
373 }
374 if self.is_primitive_numeric() && to.is_primitive_numeric() {
375 return Some(true);
376 }
377
378 if self.is_null() {
379 return Some(true);
380 }
381
382 use DataType as D;
383 Some(match (self, to) {
384 #[cfg(feature = "dtype-categorical")]
385 (D::Categorical(_, _) | D::Enum(_, _), D::Binary)
386 | (D::Binary, D::Categorical(_, _) | D::Enum(_, _)) => false,
387
388 #[cfg(feature = "object")]
389 (D::Object(_), D::Object(_)) => true,
390 #[cfg(feature = "object")]
391 (D::Object(_), _) | (_, D::Object(_)) => false,
392
393 (D::Boolean, dt) | (dt, D::Boolean) => match dt {
394 dt if dt.is_primitive_numeric() => true,
395 #[cfg(feature = "dtype-decimal")]
396 D::Decimal(_, _) => true,
397 D::String | D::Binary => true,
398 _ => false,
399 },
400
401 (D::List(from), D::List(to)) => from.can_cast_to(to)?,
402 #[cfg(feature = "dtype-array")]
403 (D::Array(from, l_width), D::Array(to, r_width)) => {
404 l_width == r_width && from.can_cast_to(to)?
405 },
406 #[cfg(feature = "dtype-struct")]
407 (D::Struct(l_fields), D::Struct(r_fields)) => {
408 if l_fields.is_empty() {
409 return Some(true);
410 }
411
412 if l_fields.len() != r_fields.len() {
413 return Some(false);
414 }
415
416 for (l, r) in l_fields.iter().zip(r_fields) {
417 if !l.dtype().can_cast_to(r.dtype())? {
418 return Some(false);
419 }
420 }
421
422 true
423 },
424
425 _ => return None,
427 })
428 }
429
430 pub fn implode(self) -> DataType {
431 DataType::List(Box::new(self))
432 }
433
434 #[must_use]
436 pub fn to_physical(&self) -> DataType {
437 use DataType::*;
438 match self {
439 Date => Int32,
440 Datetime(_, _) => Int64,
441 Duration(_) => Int64,
442 Time => Int64,
443 #[cfg(feature = "dtype-decimal")]
444 Decimal(_, _) => Int128,
445 #[cfg(feature = "dtype-categorical")]
446 Categorical(_, _) | Enum(_, _) => UInt32,
447 #[cfg(feature = "dtype-array")]
448 Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
449 List(dt) => List(Box::new(dt.to_physical())),
450 #[cfg(feature = "dtype-struct")]
451 Struct(fields) => {
452 let new_fields = fields
453 .iter()
454 .map(|s| Field::new(s.name().clone(), s.dtype().to_physical()))
455 .collect();
456 Struct(new_fields)
457 },
458 _ => self.clone(),
459 }
460 }
461
462 pub fn is_supported_list_arithmetic_input(&self) -> bool {
463 self.is_primitive_numeric() || self.is_bool() || self.is_null()
464 }
465
466 pub fn is_logical(&self) -> bool {
468 self != &self.to_physical()
469 }
470
471 pub fn is_temporal(&self) -> bool {
473 use DataType::*;
474 matches!(self, Date | Datetime(_, _) | Duration(_) | Time)
475 }
476
477 pub fn is_primitive(&self) -> bool {
480 self.is_primitive_numeric()
481 | matches!(
482 self,
483 DataType::Boolean | DataType::String | DataType::Binary
484 )
485 }
486
487 pub fn is_primitive_numeric(&self) -> bool {
489 self.is_float() || self.is_integer()
490 }
491
492 pub fn is_bool(&self) -> bool {
494 matches!(self, DataType::Boolean)
495 }
496
497 pub fn is_list(&self) -> bool {
499 matches!(self, DataType::List(_))
500 }
501
502 pub fn is_array(&self) -> bool {
504 #[cfg(feature = "dtype-array")]
505 {
506 matches!(self, DataType::Array(_, _))
507 }
508 #[cfg(not(feature = "dtype-array"))]
509 {
510 false
511 }
512 }
513
514 pub fn is_nested(&self) -> bool {
515 self.is_list() || self.is_struct() || self.is_array()
516 }
517
518 pub fn is_struct(&self) -> bool {
520 #[cfg(feature = "dtype-struct")]
521 {
522 matches!(self, DataType::Struct(_))
523 }
524 #[cfg(not(feature = "dtype-struct"))]
525 {
526 false
527 }
528 }
529
530 pub fn is_binary(&self) -> bool {
531 matches!(self, DataType::Binary)
532 }
533
534 pub fn is_date(&self) -> bool {
535 matches!(self, DataType::Date)
536 }
537 pub fn is_datetime(&self) -> bool {
538 matches!(self, DataType::Datetime(..))
539 }
540
541 pub fn is_object(&self) -> bool {
542 #[cfg(feature = "object")]
543 {
544 matches!(self, DataType::Object(_))
545 }
546 #[cfg(not(feature = "object"))]
547 {
548 false
549 }
550 }
551
552 pub fn is_null(&self) -> bool {
553 matches!(self, DataType::Null)
554 }
555
556 pub fn contains_views(&self) -> bool {
557 use DataType::*;
558 match self {
559 Binary | String => true,
560 #[cfg(feature = "dtype-categorical")]
561 Categorical(_, _) | Enum(_, _) => true,
562 List(inner) => inner.contains_views(),
563 #[cfg(feature = "dtype-array")]
564 Array(inner, _) => inner.contains_views(),
565 #[cfg(feature = "dtype-struct")]
566 Struct(fields) => fields.iter().any(|field| field.dtype.contains_views()),
567 _ => false,
568 }
569 }
570
571 pub fn contains_categoricals(&self) -> bool {
572 use DataType::*;
573 match self {
574 #[cfg(feature = "dtype-categorical")]
575 Categorical(_, _) | Enum(_, _) => true,
576 List(inner) => inner.contains_categoricals(),
577 #[cfg(feature = "dtype-array")]
578 Array(inner, _) => inner.contains_categoricals(),
579 #[cfg(feature = "dtype-struct")]
580 Struct(fields) => fields
581 .iter()
582 .any(|field| field.dtype.contains_categoricals()),
583 _ => false,
584 }
585 }
586
587 pub fn contains_objects(&self) -> bool {
588 use DataType::*;
589 match self {
590 #[cfg(feature = "object")]
591 Object(_) => true,
592 List(inner) => inner.contains_objects(),
593 #[cfg(feature = "dtype-array")]
594 Array(inner, _) => inner.contains_objects(),
595 #[cfg(feature = "dtype-struct")]
596 Struct(fields) => fields.iter().any(|field| field.dtype.contains_objects()),
597 _ => false,
598 }
599 }
600
601 pub fn is_ord(&self) -> bool {
603 #[cfg(feature = "dtype-categorical")]
604 let is_cat = matches!(self, DataType::Categorical(_, _) | DataType::Enum(_, _));
605 #[cfg(not(feature = "dtype-categorical"))]
606 let is_cat = false;
607
608 let phys = self.to_physical();
609 (phys.is_primitive_numeric()
610 || self.is_decimal()
611 || matches!(
612 phys,
613 DataType::Binary | DataType::String | DataType::Boolean
614 ))
615 && !is_cat
616 }
617
618 pub fn is_decimal(&self) -> bool {
620 match self {
621 #[cfg(feature = "dtype-decimal")]
622 DataType::Decimal(_, _) => true,
623 _ => false,
624 }
625 }
626
627 pub fn is_float(&self) -> bool {
630 matches!(
631 self,
632 DataType::Float32 | DataType::Float64 | DataType::Unknown(UnknownKind::Float)
633 )
634 }
635
636 pub fn is_integer(&self) -> bool {
638 matches!(
639 self,
640 DataType::Int8
641 | DataType::Int16
642 | DataType::Int32
643 | DataType::Int64
644 | DataType::Int128
645 | DataType::UInt8
646 | DataType::UInt16
647 | DataType::UInt32
648 | DataType::UInt64
649 | DataType::Unknown(UnknownKind::Int(_))
650 )
651 }
652
653 pub fn is_signed_integer(&self) -> bool {
654 matches!(
656 self,
657 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Int128
658 )
659 }
660
661 pub fn is_unsigned_integer(&self) -> bool {
662 matches!(
663 self,
664 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64,
665 )
666 }
667
668 pub fn is_string(&self) -> bool {
669 matches!(self, DataType::String | DataType::Unknown(UnknownKind::Str))
670 }
671
672 pub fn is_categorical(&self) -> bool {
673 #[cfg(feature = "dtype-categorical")]
674 {
675 matches!(self, DataType::Categorical(_, _))
676 }
677 #[cfg(not(feature = "dtype-categorical"))]
678 {
679 false
680 }
681 }
682
683 pub fn is_enum(&self) -> bool {
684 #[cfg(feature = "dtype-categorical")]
685 {
686 matches!(self, DataType::Enum(_, _))
687 }
688 #[cfg(not(feature = "dtype-categorical"))]
689 {
690 false
691 }
692 }
693
694 pub fn to_arrow_field(&self, name: PlSmallStr, compat_level: CompatLevel) -> ArrowField {
696 let metadata = match self {
697 #[cfg(feature = "dtype-categorical")]
698 DataType::Enum(Some(revmap), _) => {
699 let cats = revmap.get_categories();
700 let mut encoded = String::with_capacity(cats.len() * 10);
701 for cat in cats.values_iter() {
702 encoded.push_str(itoa::Buffer::new().format(cat.len()));
703 encoded.push(';');
704 encoded.push_str(cat);
705 }
706 Some(BTreeMap::from([(
707 PlSmallStr::from_static(DTYPE_ENUM_VALUES),
708 PlSmallStr::from_string(encoded),
709 )]))
710 },
711 #[cfg(feature = "dtype-categorical")]
712 DataType::Categorical(_, ordering) => Some(BTreeMap::from([(
713 PlSmallStr::from_static(DTYPE_CATEGORICAL),
714 PlSmallStr::from_static(ordering.into()),
715 )])),
716 DataType::BinaryOffset => Some(BTreeMap::from([(
717 PlSmallStr::from_static(PL_KEY),
718 PlSmallStr::from_static(MAINTAIN_PL_TYPE),
719 )])),
720 _ => None,
721 };
722
723 let field = ArrowField::new(name, self.to_arrow(compat_level), true);
724
725 if let Some(metadata) = metadata {
726 field.with_metadata(metadata)
727 } else {
728 field
729 }
730 }
731
732 pub fn max(&self) -> PolarsResult<Scalar> {
734 use DataType::*;
735 let v = match self {
736 Int8 => Scalar::from(i8::MAX),
737 Int16 => Scalar::from(i16::MAX),
738 Int32 => Scalar::from(i32::MAX),
739 Int64 => Scalar::from(i64::MAX),
740 Int128 => Scalar::from(i128::MAX),
741 UInt8 => Scalar::from(u8::MAX),
742 UInt16 => Scalar::from(u16::MAX),
743 UInt32 => Scalar::from(u32::MAX),
744 UInt64 => Scalar::from(u64::MAX),
745 Float32 => Scalar::from(f32::INFINITY),
746 Float64 => Scalar::from(f64::INFINITY),
747 #[cfg(feature = "dtype-time")]
748 Time => Scalar::new(Time, AnyValue::Time(NS_IN_DAY - 1)),
749 dt => polars_bail!(ComputeError: "cannot determine upper bound for dtype `{}`", dt),
750 };
751 Ok(v)
752 }
753
754 pub fn min(&self) -> PolarsResult<Scalar> {
756 use DataType::*;
757 let v = match self {
758 Int8 => Scalar::from(i8::MIN),
759 Int16 => Scalar::from(i16::MIN),
760 Int32 => Scalar::from(i32::MIN),
761 Int64 => Scalar::from(i64::MIN),
762 Int128 => Scalar::from(i128::MIN),
763 UInt8 => Scalar::from(u8::MIN),
764 UInt16 => Scalar::from(u16::MIN),
765 UInt32 => Scalar::from(u32::MIN),
766 UInt64 => Scalar::from(u64::MIN),
767 Float32 => Scalar::from(f32::NEG_INFINITY),
768 Float64 => Scalar::from(f64::NEG_INFINITY),
769 #[cfg(feature = "dtype-time")]
770 Time => Scalar::new(Time, AnyValue::Time(0)),
771 dt => polars_bail!(ComputeError: "cannot determine lower bound for dtype `{}`", dt),
772 };
773 Ok(v)
774 }
775
776 #[inline]
778 pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType {
779 self.try_to_arrow(compat_level).unwrap()
780 }
781
782 #[inline]
783 pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult<ArrowDataType> {
784 use DataType::*;
785 match self {
786 Boolean => Ok(ArrowDataType::Boolean),
787 UInt8 => Ok(ArrowDataType::UInt8),
788 UInt16 => Ok(ArrowDataType::UInt16),
789 UInt32 => Ok(ArrowDataType::UInt32),
790 UInt64 => Ok(ArrowDataType::UInt64),
791 Int8 => Ok(ArrowDataType::Int8),
792 Int16 => Ok(ArrowDataType::Int16),
793 Int32 => Ok(ArrowDataType::Int32),
794 Int64 => Ok(ArrowDataType::Int64),
795 Int128 => Ok(ArrowDataType::Int128),
796 Float32 => Ok(ArrowDataType::Float32),
797 Float64 => Ok(ArrowDataType::Float64),
798 #[cfg(feature = "dtype-decimal")]
799 Decimal(precision, scale) => {
800 let precision = (*precision).unwrap_or(38);
801 polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1");
802
803 Ok(ArrowDataType::Decimal(
804 precision,
805 scale.unwrap_or(0), ))
807 },
808 String => {
809 let dt = if compat_level.0 >= 1 {
810 ArrowDataType::Utf8View
811 } else {
812 ArrowDataType::LargeUtf8
813 };
814 Ok(dt)
815 },
816 Binary => {
817 let dt = if compat_level.0 >= 1 {
818 ArrowDataType::BinaryView
819 } else {
820 ArrowDataType::LargeBinary
821 };
822 Ok(dt)
823 },
824 Date => Ok(ArrowDataType::Date32),
825 Datetime(unit, tz) => Ok(ArrowDataType::Timestamp(unit.to_arrow(), tz.clone())),
826 Duration(unit) => Ok(ArrowDataType::Duration(unit.to_arrow())),
827 Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)),
828 #[cfg(feature = "dtype-array")]
829 Array(dt, size) => Ok(dt
830 .try_to_arrow(compat_level)?
831 .to_fixed_size_list(*size, true)),
832 List(dt) => Ok(ArrowDataType::LargeList(Box::new(
833 dt.to_arrow_field(PlSmallStr::from_static("item"), compat_level),
834 ))),
835 Null => Ok(ArrowDataType::Null),
836 #[cfg(feature = "object")]
837 Object(_) => Ok(get_object_physical_type()),
838 #[cfg(feature = "dtype-categorical")]
839 Categorical(_, _) | Enum(_, _) => {
840 let values = if compat_level.0 >= 1 {
841 ArrowDataType::Utf8View
842 } else {
843 ArrowDataType::LargeUtf8
844 };
845 Ok(ArrowDataType::Dictionary(
846 IntegerType::UInt32,
847 Box::new(values),
848 false,
849 ))
850 },
851 #[cfg(feature = "dtype-struct")]
852 Struct(fields) => {
853 let fields = fields
854 .iter()
855 .map(|fld| fld.to_arrow(compat_level))
856 .collect();
857 Ok(ArrowDataType::Struct(fields))
858 },
859 BinaryOffset => Ok(ArrowDataType::LargeBinary),
860 Unknown(kind) => {
861 let dt = match kind {
862 UnknownKind::Any => ArrowDataType::Unknown,
863 UnknownKind::Float => ArrowDataType::Float64,
864 UnknownKind::Str => ArrowDataType::Utf8View,
865 UnknownKind::Int(v) => {
866 return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level);
867 },
868 };
869 Ok(dt)
870 },
871 }
872 }
873
874 pub fn is_nested_null(&self) -> bool {
875 use DataType::*;
876 match self {
877 Null => true,
878 List(field) => field.is_nested_null(),
879 #[cfg(feature = "dtype-array")]
880 Array(field, _) => field.is_nested_null(),
881 #[cfg(feature = "dtype-struct")]
882 Struct(fields) => fields.iter().all(|fld| fld.dtype.is_nested_null()),
883 _ => false,
884 }
885 }
886
887 pub fn matches_schema_type(&self, schema_type: &DataType) -> PolarsResult<bool> {
894 match (self, schema_type) {
895 (DataType::List(l), DataType::List(r)) => l.matches_schema_type(r),
896 #[cfg(feature = "dtype-array")]
897 (DataType::Array(l, sl), DataType::Array(r, sr)) => {
898 Ok(l.matches_schema_type(r)? && sl == sr)
899 },
900 #[cfg(feature = "dtype-struct")]
901 (DataType::Struct(l), DataType::Struct(r)) => {
902 let mut must_cast = false;
903 for (l, r) in l.iter().zip(r.iter()) {
904 must_cast |= l.dtype.matches_schema_type(&r.dtype)?;
905 }
906 Ok(must_cast)
907 },
908 (DataType::Null, DataType::Null) => Ok(false),
909 #[cfg(feature = "dtype-decimal")]
910 (DataType::Decimal(_, s1), DataType::Decimal(_, s2)) => Ok(s1 != s2),
911 (DataType::Null, _) => Ok(true),
914 (l, r) if l == r => Ok(false),
915 (l, r) => {
916 polars_bail!(SchemaMismatch: "type {:?} is incompatible with expected type {:?}", l, r)
917 },
918 }
919 }
920}
921
922impl Display for DataType {
923 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
924 let s = match self {
925 DataType::Null => "null",
926 DataType::Boolean => "bool",
927 DataType::UInt8 => "u8",
928 DataType::UInt16 => "u16",
929 DataType::UInt32 => "u32",
930 DataType::UInt64 => "u64",
931 DataType::Int8 => "i8",
932 DataType::Int16 => "i16",
933 DataType::Int32 => "i32",
934 DataType::Int64 => "i64",
935 DataType::Int128 => "i128",
936 DataType::Float32 => "f32",
937 DataType::Float64 => "f64",
938 #[cfg(feature = "dtype-decimal")]
939 DataType::Decimal(precision, scale) => {
940 return match (precision, scale) {
941 (Some(precision), Some(scale)) => {
942 f.write_str(&format!("decimal[{precision},{scale}]"))
943 },
944 (None, Some(scale)) => f.write_str(&format!("decimal[*,{scale}]")),
945 _ => f.write_str("decimal[?]"), };
947 },
948 DataType::String => "str",
949 DataType::Binary => "binary",
950 DataType::Date => "date",
951 DataType::Datetime(tu, tz) => {
952 let s = match tz {
953 None => format!("datetime[{tu}]"),
954 Some(tz) => format!("datetime[{tu}, {tz}]"),
955 };
956 return f.write_str(&s);
957 },
958 DataType::Duration(tu) => return write!(f, "duration[{tu}]"),
959 DataType::Time => "time",
960 #[cfg(feature = "dtype-array")]
961 DataType::Array(_, _) => {
962 let tp = self.array_leaf_dtype().unwrap();
963
964 let dims = self.get_shape().unwrap();
965 let shape = if dims.len() == 1 {
966 format!("{}", dims[0])
967 } else {
968 format_tuple!(dims)
969 };
970 return write!(f, "array[{tp}, {}]", shape);
971 },
972 DataType::List(tp) => return write!(f, "list[{tp}]"),
973 #[cfg(feature = "object")]
974 DataType::Object(s) => s,
975 #[cfg(feature = "dtype-categorical")]
976 DataType::Categorical(_, _) => "cat",
977 #[cfg(feature = "dtype-categorical")]
978 DataType::Enum(_, _) => "enum",
979 #[cfg(feature = "dtype-struct")]
980 DataType::Struct(fields) => return write!(f, "struct[{}]", fields.len()),
981 DataType::Unknown(kind) => match kind {
982 UnknownKind::Any => "unknown",
983 UnknownKind::Int(_) => "dyn int",
984 UnknownKind::Float => "dyn float",
985 UnknownKind::Str => "dyn str",
986 },
987 DataType::BinaryOffset => "binary[offset]",
988 };
989 f.write_str(s)
990 }
991}
992
993pub fn merge_dtypes(left: &DataType, right: &DataType) -> PolarsResult<DataType> {
994 use DataType::*;
995 Ok(match (left, right) {
996 #[cfg(feature = "dtype-categorical")]
997 (Categorical(Some(rev_map_l), ordering), Categorical(Some(rev_map_r), _)) => {
998 match (&**rev_map_l, &**rev_map_r) {
999 (RevMapping::Global(_, _, idl), RevMapping::Global(_, _, idr)) if idl == idr => {
1000 let mut merger = GlobalRevMapMerger::new(rev_map_l.clone());
1001 merger.merge_map(rev_map_r)?;
1002 Categorical(Some(merger.finish()), *ordering)
1003 },
1004 (RevMapping::Local(_, idl), RevMapping::Local(_, idr)) if idl == idr => {
1005 left.clone()
1006 },
1007 _ => polars_bail!(string_cache_mismatch),
1008 }
1009 },
1010 #[cfg(feature = "dtype-categorical")]
1011 (Enum(Some(rev_map_l), _), Enum(Some(rev_map_r), _)) => {
1012 match (&**rev_map_l, &**rev_map_r) {
1013 (RevMapping::Local(_, idl), RevMapping::Local(_, idr)) if idl == idr => {
1014 left.clone()
1015 },
1016 _ => polars_bail!(ComputeError: "can not combine with different categories"),
1017 }
1018 },
1019 (List(inner_l), List(inner_r)) => {
1020 let merged = merge_dtypes(inner_l, inner_r)?;
1021 List(Box::new(merged))
1022 },
1023 #[cfg(feature = "dtype-struct")]
1024 (Struct(inner_l), Struct(inner_r)) => {
1025 polars_ensure!(inner_l.len() == inner_r.len(), ComputeError: "cannot combine structs with differing amounts of fields ({} != {})", inner_l.len(), inner_r.len());
1026 let fields = inner_l.iter().zip(inner_r.iter()).map(|(l, r)| {
1027 polars_ensure!(l.name() == r.name(), ComputeError: "cannot combine structs with different fields ({} != {})", l.name(), r.name());
1028 let merged = merge_dtypes(l.dtype(), r.dtype())?;
1029 Ok(Field::new(l.name().clone(), merged))
1030 }).collect::<PolarsResult<Vec<_>>>()?;
1031 Struct(fields)
1032 },
1033 #[cfg(feature = "dtype-array")]
1034 (Array(inner_l, width_l), Array(inner_r, width_r)) => {
1035 polars_ensure!(width_l == width_r, ComputeError: "widths of FixedSizeWidth Series are not equal");
1036 let merged = merge_dtypes(inner_l, inner_r)?;
1037 Array(Box::new(merged), *width_l)
1038 },
1039 (left, right) if left == right => left.clone(),
1040 _ => polars_bail!(ComputeError: "unable to merge datatypes"),
1041 })
1042}
1043
1044fn collect_nested_types(
1045 dtype: &DataType,
1046 result: &mut PlHashSet<DataType>,
1047 include_compound_types: bool,
1048) {
1049 match dtype {
1050 DataType::List(inner) => {
1051 if include_compound_types {
1052 result.insert(dtype.clone());
1053 }
1054 collect_nested_types(inner, result, include_compound_types);
1055 },
1056 #[cfg(feature = "dtype-array")]
1057 DataType::Array(inner, _) => {
1058 if include_compound_types {
1059 result.insert(dtype.clone());
1060 }
1061 collect_nested_types(inner, result, include_compound_types);
1062 },
1063 #[cfg(feature = "dtype-struct")]
1064 DataType::Struct(fields) => {
1065 if include_compound_types {
1066 result.insert(dtype.clone());
1067 }
1068 for field in fields {
1069 collect_nested_types(field.dtype(), result, include_compound_types);
1070 }
1071 },
1072 _ => {
1073 result.insert(dtype.clone());
1074 },
1075 }
1076}
1077
1078pub fn unpack_dtypes(dtype: &DataType, include_compound_types: bool) -> PlHashSet<DataType> {
1079 let mut result = PlHashSet::new();
1080 collect_nested_types(dtype, &mut result, include_compound_types);
1081 result
1082}
1083
/// Builds an `Enum` dtype from `categories`, using a local rev-map and the
/// default ordering.
#[cfg(feature = "dtype-categorical")]
pub fn create_enum_dtype(categories: Utf8ViewArray) -> DataType {
    let rev_map = Arc::new(RevMapping::build_local(categories));
    DataType::Enum(Some(rev_map), CategoricalOrdering::default())
}
1089
/// Compatibility level used when converting dtypes to Arrow; wraps a `u16`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CompatLevel(pub(crate) u16);
1093
1094impl CompatLevel {
1095 pub const fn newest() -> CompatLevel {
1096 CompatLevel(1)
1097 }
1098
1099 pub const fn oldest() -> CompatLevel {
1100 CompatLevel(0)
1101 }
1102
1103 #[doc(hidden)]
1106 pub fn with_level(level: u16) -> PolarsResult<CompatLevel> {
1107 if level > CompatLevel::newest().0 {
1108 polars_bail!(InvalidOperation: "invalid compat level");
1109 }
1110 Ok(CompatLevel(level))
1111 }
1112
1113 #[doc(hidden)]
1114 pub fn get_level(&self) -> u16 {
1115 self.0
1116 }
1117}
1118
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `(array[f64, 10], list[array[f64, 10]])`, shared by both tests.
    #[cfg(feature = "dtype-array")]
    fn nested_fixture() -> (DataType, DataType) {
        let array_type = DataType::Array(Box::new(DataType::Float64), 10);
        let list_type = DataType::List(Box::new(array_type.clone()));
        (array_type, list_type)
    }

    /// Without compound types only the leaf dtype is reported.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_primitive_dtypes() {
        let (_, list_type) = nested_fixture();

        let expected: PlHashSet<DataType> = [DataType::Float64].into_iter().collect();

        assert_eq!(unpack_dtypes(&list_type, false), expected)
    }

    /// With compound types every nesting level is reported as well.
    #[cfg(feature = "dtype-array")]
    #[test]
    fn test_unpack_compound_dtypes() {
        let (array_type, list_type) = nested_fixture();

        let unpacked = unpack_dtypes(&list_type, true);

        let expected: PlHashSet<DataType> = [list_type, array_type, DataType::Float64]
            .into_iter()
            .collect();

        assert_eq!(unpacked, expected)
    }
}