// polars_core/series/arrow_export/mod.rs

/// Returns early from the enclosing function with the error produced by
/// `unhandled_arrow_conversion_dtype_pair_err` for the given Polars dtype / arrow field pair.
///
/// Both arguments are forwarded unchanged to that function. The expansion is a single block
/// expression, so the macro can be invoked without a trailing semicolon.
macro_rules! bail_unhandled_arrow_conversion_dtype_pair {
    ($pl_dtype:expr, $arrow_field:expr) => {{
        let err = $crate::series::arrow_export::unhandled_arrow_conversion_dtype_pair_err(
            $pl_dtype,
            $arrow_field,
        );
        return Err(err);
    }};
}
11
12#[cfg(feature = "dtype-categorical")]
13pub mod categorical;
14
15use std::borrow::Cow;
16use std::sync::Arc;
17
18use polars_compute::cast::cast_unchecked;
19use polars_error::{PolarsError, PolarsResult, polars_ensure, polars_err};
20
21use crate::prelude::{
22    Array, ArrayRef, ArrowDataType, ArrowField, BinaryViewArray, CompatLevel, DataType, ListArray,
23    PlSmallStr, PrimitiveArray, Series,
24};
25
/// Builds the `InvalidOperation` error reported when `to_arrow()` has no conversion from
/// `input_pl_dtype` to `output_arrow_field`.
///
/// The message embeds the `Debug` representation of both arguments.
fn unhandled_arrow_conversion_dtype_pair_err(
    input_pl_dtype: &DataType,
    output_arrow_field: &ArrowField,
) -> PolarsError {
    polars_err!(
        InvalidOperation:
        "to_arrow() conversion failed: cannot convert \
        ({input_pl_dtype:?}) to ({output_arrow_field:?})",
    )
}
36
/// Downcasts `$arr` to `PrimitiveArray<$native>`, clones it with `$arrow_dtype` as its dtype,
/// and yields the result as a boxed array.
///
/// Panics (via `unwrap`) if `$arr` is not a `PrimitiveArray<$native>`.
macro_rules! primitive_to_boxed_with_logical {
    ($arr:expr, $native:ty, $arrow_dtype:expr) => {{
        let primitive = $arr
            .as_any()
            .downcast_ref::<PrimitiveArray<$native>>()
            .unwrap();
        let relabelled = primitive.clone().to($arrow_dtype);
        relabelled.to_boxed()
    }};
}
44
45fn ensure_no_nulls(array: &dyn Array) -> PolarsResult<()> {
46    polars_ensure!(
47        !array.has_nulls(),
48        SchemaMismatch:
49        "to_arrow() conversion failed: nullable is false but array contained {} NULLs (arrow dtype: {:?})",
50        array.null_count(), array.dtype(),
51    );
52
53    Ok(())
54}
55
56impl Series {
57    /// Export this Series to an arrow array. The dtype of the returned array will be chosen
58    /// according to the provided `compat_level`.
59    pub fn to_arrow(&self, chunk_idx: usize, compat_level: CompatLevel) -> ArrayRef {
60        self.to_arrow_with_field(
61            chunk_idx,
62            Cow::Owned(
63                self.dtype()
64                    .to_arrow_field(self.name().clone(), compat_level),
65            ),
66            true,
67        )
68        .unwrap()
69    }
70
71    /// Export this Series to an arrow array. The dtype of the returned array will match the
72    /// provided arrow field. Returns an error if this Series cannot be exported to the arrow field.
73    pub fn to_arrow_with_field<'a>(
74        &self,
75        chunk_idx: usize,
76        output_arrow_field: Cow<'a, ArrowField>,
77        skip_attach_pl_metadata: bool,
78    ) -> PolarsResult<ArrayRef> {
79        ToArrowConverter {
80            skip_attach_pl_metadata,
81            #[cfg(feature = "dtype-categorical")]
82            categorical_converter: {
83                let mut categorical_converter =
84                    crate::series::arrow_export::categorical::CategoricalToArrowConverter {
85                        converters: Default::default(),
86                        persist_remap: false,
87                    };
88
89                categorical_converter.initialize(self.dtype());
90
91                categorical_converter
92            },
93        }
94        .array_to_arrow(
95            self.chunks().get(chunk_idx).unwrap().as_ref(),
96            self.dtype(),
97            output_arrow_field,
98        )
99    }
100}
101
/// Low-level converter that exports `ArrayRef`s from Polars Series to arrow arrays.
///
/// This can be held to perform repeated categorical exports with persisted indices to ensure
/// the exported chunks use the same set of indices.
pub struct ToArrowConverter {
    /// When `true`, no polars metadata is attached to nested arrow fields during conversion.
    ///
    /// If the `arrow_field` being passed was generated by `DataType::to_arrow_field`,
    /// it will already have polars metadata.
    pub skip_attach_pl_metadata: bool,
    /// Converter that categorical and enum arrays are delegated to.
    #[cfg(feature = "dtype-categorical")]
    pub categorical_converter:
        crate::series::arrow_export::categorical::CategoricalToArrowConverter,
}
114
115impl ToArrowConverter {
116    /// Returns an error if `output_arrow_field` was provided and does not match the output data type.
117    pub fn array_to_arrow<'a>(
118        &mut self,
119        array: &dyn Array,
120        dtype: &DataType,
121        arrow_field: Cow<'a, ArrowField>,
122    ) -> PolarsResult<Box<dyn Array>> {
123        let nullable = arrow_field.is_nullable;
124        let out = self.array_to_arrow_impl(array, dtype, arrow_field)?;
125
126        if !nullable {
127            ensure_no_nulls(array)?
128        }
129
130        Ok(out)
131    }
132
133    fn array_to_arrow_impl<'a>(
134        &mut self,
135        array: &dyn Array,
136        polars_dtype: &DataType,
137        arrow_field: Cow<'a, ArrowField>,
138    ) -> PolarsResult<Box<dyn Array>> {
139        // We perform additional steps where necessary. E.g.
140        // * If we are exporting a logical type, set the array dtype to the corresponding arrow logical type.
141        // * Attach field metadata where necessary (e.g. for categorical and extension types).
142        Ok(match (polars_dtype, arrow_field.dtype()) {
143            #[cfg(feature = "dtype-struct")]
144            (DataType::Struct(struct_fields), ArrowDataType::Struct(arrow_struct_fields)) => {
145                use arrow::array::StructArray;
146                let arr: &StructArray = array.as_any().downcast_ref().unwrap();
147
148                polars_ensure!(
149                    arrow_struct_fields.len() == arr.fields().len()
150                    && arrow_struct_fields
151                        .iter()
152                        .zip(arr.fields())
153                        .all(|(l, r)| l.name() == r.name()),
154                    SchemaMismatch:
155                    "to_arrow() conversion failed: struct field names mismatch: {:?} != expected: {:?}",
156                    arrow_field.dtype(), arr.dtype()
157                );
158
159                let mut arrow_dtype = to_owned_dtype(arrow_field);
160
161                let ArrowDataType::Struct(arrow_struct_fields) = &mut arrow_dtype else {
162                    unreachable!()
163                };
164
165                self.attach_pl_field_metadata(
166                    struct_fields
167                        .iter()
168                        .map(|x| x.dtype())
169                        .zip(arrow_struct_fields.iter_mut()),
170                );
171
172                let values: Vec<ArrayRef> = arr
173                    .values()
174                    .iter()
175                    .zip(struct_fields.iter())
176                    .zip(arrow_struct_fields.iter())
177                    .map(|((values, pl_field), arrow_field)| {
178                        self.array_to_arrow(
179                            values.as_ref(),
180                            pl_field.dtype(),
181                            Cow::Borrowed(arrow_field),
182                        )
183                    })
184                    .collect::<PolarsResult<_>>()?;
185
186                let arr =
187                    StructArray::try_new(arrow_dtype, arr.len(), values, arr.validity().cloned())?;
188
189                Box::new(arr)
190            },
191            (DataType::List(item_dtype), ArrowDataType::LargeList(_)) => {
192                let arr: &ListArray<i64> = array.as_any().downcast_ref().unwrap();
193
194                let mut arrow_dtype = to_owned_dtype(arrow_field);
195
196                let ArrowDataType::LargeList(arrow_item_field) = &mut arrow_dtype else {
197                    unreachable!()
198                };
199
200                self.attach_pl_field_metadata(std::iter::once((
201                    item_dtype.as_ref(),
202                    arrow_item_field.as_mut(),
203                )));
204
205                let new_values = self.array_to_arrow(
206                    arr.values().as_ref(),
207                    item_dtype,
208                    Cow::Borrowed(arrow_item_field.as_ref()),
209                )?;
210
211                let arr = ListArray::<i64>::new(
212                    arrow_dtype,
213                    arr.offsets().clone(),
214                    new_values,
215                    arr.validity().cloned(),
216                );
217
218                Box::new(arr)
219            },
220            #[cfg(feature = "dtype-array")]
221            (DataType::Array(item_dtype, width), ArrowDataType::FixedSizeList(_, arrow_width)) => {
222                use arrow::array::FixedSizeListArray;
223                let arr: &FixedSizeListArray = array.as_any().downcast_ref().unwrap();
224
225                polars_ensure!(
226                    *arrow_width == *width,
227                    SchemaMismatch:
228                    "to_arrow() conversion failed: fixed-size list width mismatch \
229                    ({arrow_width:?} != expected: {width:?})"
230                );
231
232                let mut arrow_dtype = to_owned_dtype(arrow_field);
233
234                let ArrowDataType::FixedSizeList(arrow_item_field, _) = &mut arrow_dtype else {
235                    unreachable!()
236                };
237
238                self.attach_pl_field_metadata(std::iter::once((
239                    item_dtype.as_ref(),
240                    arrow_item_field.as_mut(),
241                )));
242
243                let new_values = self.array_to_arrow(
244                    arr.values().as_ref(),
245                    item_dtype,
246                    Cow::Borrowed(arrow_item_field.as_ref()),
247                )?;
248
249                let arr = FixedSizeListArray::new(
250                    arrow_dtype,
251                    arr.len(),
252                    new_values,
253                    arr.validity().cloned(),
254                );
255
256                Box::new(arr)
257            },
258            #[cfg(feature = "dtype-categorical")]
259            (DataType::Categorical(_, _) | DataType::Enum(_, _), _) => {
260                self.categorical_converter.array_to_arrow(
261                    array,
262                    polars_dtype,
263                    arrow_field.as_ref(),
264                )?
265            },
266            #[cfg(feature = "dtype-date")]
267            (DataType::Date, ArrowDataType::Date32) => {
268                primitive_to_boxed_with_logical!(array, i32, ArrowDataType::Date32)
269            },
270            #[cfg(feature = "dtype-datetime")]
271            (DataType::Datetime(tu, tz), ArrowDataType::Timestamp(atu, atz)) => {
272                use crate::prelude::TimeZone;
273
274                let matching = atu == &tu.to_arrow()
275                    && TimeZone::eq_none_as_utc(
276                        TimeZone::opt_try_new(atz.clone())?.as_ref(),
277                        tz.as_ref(),
278                    );
279
280                if !matching {
281                    bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
282                }
283
284                primitive_to_boxed_with_logical!(array, i64, to_owned_dtype(arrow_field))
285            },
286            #[cfg(feature = "dtype-duration")]
287            (DataType::Duration(tu), ArrowDataType::Duration(atu)) => {
288                let matching = atu == &tu.to_arrow();
289
290                if !matching {
291                    bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
292                }
293
294                primitive_to_boxed_with_logical!(array, i64, to_owned_dtype(arrow_field))
295            },
296            #[cfg(feature = "dtype-time")]
297            (DataType::Time, ArrowDataType::Time64(crate::prelude::ArrowTimeUnit::Nanosecond)) => {
298                primitive_to_boxed_with_logical!(array, i64, to_owned_dtype(arrow_field))
299            },
300            #[cfg(feature = "dtype-time")]
301            (DataType::Time, ArrowDataType::Time64(crate::prelude::ArrowTimeUnit::Microsecond)) => {
302                use polars_compute::cast::time64ns_to_time64us;
303
304                let array: &PrimitiveArray<i64> = array.as_any().downcast_ref().unwrap();
305
306                time64ns_to_time64us(array).boxed()
307            },
308            #[cfg(feature = "dtype-decimal")]
309            (DataType::Decimal(prec, scale), ArrowDataType::Decimal(a_prec, a_scale)) => {
310                let matching = *a_prec == *prec && *a_scale == *scale;
311
312                if !matching {
313                    bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
314                }
315
316                primitive_to_boxed_with_logical!(array, i128, to_owned_dtype(arrow_field))
317            },
318            #[cfg(feature = "object")]
319            (DataType::Object(_), ArrowDataType::FixedSizeBinary(8)) => {
320                use crate::chunked_array::object::builder::object_series_to_arrow_array;
321
322                let out = object_series_to_arrow_array(&unsafe {
323                    Series::from_chunks_and_dtype_unchecked(
324                        PlSmallStr::EMPTY,
325                        vec![array.to_boxed()],
326                        polars_dtype,
327                    )
328                });
329
330                assert_eq!(out.dtype(), &ArrowDataType::FixedSizeBinary(8));
331
332                out
333            },
334            (DataType::String, ArrowDataType::Utf8View) => array.to_boxed(),
335            (DataType::String, ArrowDataType::LargeUtf8) => {
336                cast_unchecked(array, &ArrowDataType::LargeUtf8).unwrap()
337            },
338            (DataType::Binary, ArrowDataType::BinaryView) => array.to_boxed(),
339            (DataType::Binary, ArrowDataType::LargeBinary) => {
340                cast_unchecked(array, &ArrowDataType::LargeBinary).unwrap()
341            },
342            (DataType::Binary, ArrowDataType::FixedSizeBinary(row_width)) => {
343                use polars_compute::cast::binview_to_fixed_binary;
344
345                let array: &BinaryViewArray = array.as_any().downcast_ref().unwrap();
346
347                binview_to_fixed_binary(array, *row_width)?.boxed()
348            },
349            (DataType::Binary, ArrowDataType::Extension(_)) => {
350                let arrow_dtype = to_owned_dtype(arrow_field);
351
352                let ArrowDataType::Extension(ext_type) = &arrow_dtype else {
353                    unreachable!()
354                };
355
356                let storage_field =
357                    ArrowField::new(ext_type.name.clone(), ext_type.inner.clone(), true);
358
359                let mut array =
360                    self.array_to_arrow(array, &DataType::Binary, Cow::Owned(storage_field))?;
361
362                *array.dtype_mut() = arrow_dtype;
363
364                array.to_boxed()
365            },
366            #[cfg(feature = "dtype-extension")]
367            (
368                DataType::Extension(pl_ext_type, storage_dtype),
369                ArrowDataType::Extension(arrow_ext_type),
370            ) => {
371                use arrow::datatypes::ExtensionType;
372
373                let ExtensionType {
374                    name,
375                    inner: _,
376                    metadata,
377                } = arrow_ext_type.as_ref();
378
379                if name != pl_ext_type.name().as_ref() {
380                    bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
381                }
382
383                match (
384                    metadata.as_deref(),
385                    pl_ext_type.serialize_metadata().as_deref(),
386                ) {
387                    (Some("") | None, Some("") | None) => {},
388                    (l, r) => {
389                        if l != r {
390                            bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
391                        }
392                    },
393                };
394
395                let arrow_dtype = to_owned_dtype(arrow_field);
396
397                let ArrowDataType::Extension(arrow_ext_type) = &arrow_dtype else {
398                    unreachable!()
399                };
400
401                let storage_arrow_field = ArrowField::new(
402                    arrow_ext_type.name.clone(),
403                    arrow_ext_type.inner.clone(),
404                    true,
405                );
406
407                let mut arr =
408                    self.array_to_arrow(array, storage_dtype, Cow::Owned(storage_arrow_field))?;
409
410                *arr.dtype_mut() = arrow_dtype;
411
412                arr
413            },
414            (pl_dtype, arrow_dtype) => {
415                if array.dtype() != arrow_dtype {
416                    bail_unhandled_arrow_conversion_dtype_pair!(polars_dtype, &arrow_field)
417                }
418
419                if pl_dtype.is_logical() {
420                    panic!("{pl_dtype:?}");
421                }
422
423                array.to_boxed()
424            },
425        })
426    }
427
428    #[inline]
429    fn attach_pl_field_metadata<'a, 'b, I>(&self, iter: I)
430    where
431        I: IntoIterator<Item = (&'a DataType, &'b mut ArrowField)>,
432    {
433        if self.skip_attach_pl_metadata {
434            return;
435        }
436
437        inner(&mut iter.into_iter());
438
439        #[inline(never)]
440        fn inner(iter: &mut dyn Iterator<Item = (&DataType, &mut ArrowField)>) {
441            for (pl_dtype, arrow_field) in iter {
442                match pl_dtype {
443                    #[cfg(feature = "dtype-categorical")]
444                    DataType::Categorical(..) | DataType::Enum(..) => {
445                        if !matches!(arrow_field.dtype(), ArrowDataType::Dictionary(..)) {
446                            // IPC sink can hit here when it exports only the keys of the categorical.
447                            // In this case we do not want to attach categorical metadata.
448                            continue;
449                        }
450                    },
451                    _ => {},
452                }
453
454                let mut pl_md = pl_dtype.to_arrow_field_metadata();
455
456                if arrow_field.metadata.is_none() {
457                    arrow_field.metadata = pl_md.take().map(|x| x.into());
458                }
459
460                // Insert polars categorical and enum metadata.
461                if let Some(pl_md) = pl_md
462                    && let Some(md) = arrow_field.metadata.as_mut()
463                {
464                    for (k, v) in pl_md {
465                        if !md.contains_key(&k) {
466                            Arc::make_mut(md).insert(k, v);
467                        }
468                    }
469                }
470            }
471        }
472    }
473}
474
475fn to_owned_dtype(field: Cow<ArrowField>) -> ArrowDataType {
476    match field {
477        Cow::Borrowed(f) => f.dtype().clone(),
478        Cow::Owned(f) => f.dtype,
479    }
480}