polars_core/chunked_array/
from.rs

1use arrow::compute::concatenate::concatenate_unchecked;
2
3use super::*;
4
5#[allow(clippy::all)]
6fn from_chunks_list_dtype(chunks: &mut Vec<ArrayRef>, dtype: DataType) -> DataType {
7    // ensure we don't get List<null>
8    let dtype = if let Some(arr) = chunks.get(0) {
9        DataType::from_arrow_dtype(arr.dtype())
10    } else {
11        dtype
12    };
13
14    match dtype {
15        #[cfg(feature = "dtype-categorical")]
16        // arrow dictionaries are not nested as dictionaries, but only by their keys, so we must
17        // change the list-value array to the keys and store the dictionary values in the datatype.
18        // if a global string cache is set, we also must modify the keys.
19        DataType::List(inner)
20            if matches!(
21                *inner,
22                DataType::Categorical(None, _) | DataType::Enum(None, _)
23            ) =>
24        {
25            let array = concatenate_unchecked(chunks).unwrap();
26            let list_arr = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
27            let values_arr = list_arr.values();
28            let cat = unsafe {
29                Series::_try_from_arrow_unchecked(
30                    PlSmallStr::EMPTY,
31                    vec![values_arr.clone()],
32                    values_arr.dtype(),
33                )
34                .unwrap()
35            };
36
37            // we nest only the physical representation
38            // the mapping is still in our rev-map
39            let arrow_dtype = ListArray::<i64>::default_datatype(ArrowDataType::UInt32);
40            let new_array = ListArray::new(
41                arrow_dtype,
42                list_arr.offsets().clone(),
43                cat.array_ref(0).clone(),
44                list_arr.validity().cloned(),
45            );
46            chunks.clear();
47            chunks.push(Box::new(new_array));
48            DataType::List(Box::new(cat.dtype().clone()))
49        },
50        #[cfg(all(feature = "dtype-array", feature = "dtype-categorical"))]
51        DataType::Array(inner, width)
52            if matches!(
53                *inner,
54                DataType::Categorical(None, _) | DataType::Enum(None, _)
55            ) =>
56        {
57            let array = concatenate_unchecked(chunks).unwrap();
58            let list_arr = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
59            let values_arr = list_arr.values();
60            let cat = unsafe {
61                Series::_try_from_arrow_unchecked(
62                    PlSmallStr::EMPTY,
63                    vec![values_arr.clone()],
64                    values_arr.dtype(),
65                )
66                .unwrap()
67            };
68
69            // we nest only the physical representation
70            // the mapping is still in our rev-map
71            let arrow_dtype = FixedSizeListArray::default_datatype(ArrowDataType::UInt32, width);
72            let new_array = FixedSizeListArray::new(
73                arrow_dtype,
74                values_arr.len(),
75                cat.array_ref(0).clone(),
76                list_arr.validity().cloned(),
77            );
78            chunks.clear();
79            chunks.push(Box::new(new_array));
80            DataType::Array(Box::new(cat.dtype().clone()), width)
81        },
82        _ => dtype,
83    }
84}
85
86impl<T, A> From<A> for ChunkedArray<T>
87where
88    T: PolarsDataType<Array = A>,
89    A: Array,
90{
91    fn from(arr: A) -> Self {
92        Self::with_chunk(PlSmallStr::EMPTY, arr)
93    }
94}
95
96impl<T> ChunkedArray<T>
97where
98    T: PolarsDataType,
99{
100    pub fn with_chunk<A>(name: PlSmallStr, arr: A) -> Self
101    where
102        A: Array,
103        T: PolarsDataType<Array = A>,
104    {
105        unsafe { Self::from_chunks(name, vec![Box::new(arr)]) }
106    }
107
108    pub fn with_chunk_like<A>(ca: &Self, arr: A) -> Self
109    where
110        A: Array,
111        T: PolarsDataType<Array = A>,
112    {
113        Self::from_chunk_iter_like(ca, std::iter::once(arr))
114    }
115
116    pub fn from_chunk_iter<I>(name: PlSmallStr, iter: I) -> Self
117    where
118        I: IntoIterator,
119        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
120        <I as IntoIterator>::Item: Array,
121    {
122        let chunks = iter
123            .into_iter()
124            .map(|x| Box::new(x) as Box<dyn Array>)
125            .collect();
126        unsafe { Self::from_chunks(name, chunks) }
127    }
128
129    pub fn from_chunk_iter_like<I>(ca: &Self, iter: I) -> Self
130    where
131        I: IntoIterator,
132        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
133        <I as IntoIterator>::Item: Array,
134    {
135        let chunks = iter
136            .into_iter()
137            .map(|x| Box::new(x) as Box<dyn Array>)
138            .collect();
139        unsafe {
140            Self::from_chunks_and_dtype_unchecked(ca.name().clone(), chunks, ca.dtype().clone())
141        }
142    }
143
144    pub fn try_from_chunk_iter<I, A, E>(name: PlSmallStr, iter: I) -> Result<Self, E>
145    where
146        I: IntoIterator<Item = Result<A, E>>,
147        T: PolarsDataType<Array = A>,
148        A: Array,
149    {
150        let chunks: Result<_, _> = iter
151            .into_iter()
152            .map(|x| Ok(Box::new(x?) as Box<dyn Array>))
153            .collect();
154        unsafe { Ok(Self::from_chunks(name, chunks?)) }
155    }
156
157    pub(crate) fn from_chunk_iter_and_field<I>(field: Arc<Field>, chunks: I) -> Self
158    where
159        I: IntoIterator,
160        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
161        <I as IntoIterator>::Item: Array,
162    {
163        assert_eq!(
164            std::mem::discriminant(&T::get_dtype()),
165            std::mem::discriminant(&field.dtype)
166        );
167
168        let mut length = 0;
169        let mut null_count = 0;
170        let chunks = chunks
171            .into_iter()
172            .map(|x| {
173                length += x.len();
174                null_count += x.null_count();
175                Box::new(x) as Box<dyn Array>
176            })
177            .collect();
178
179        unsafe { ChunkedArray::new_with_dims(field, chunks, length, null_count) }
180    }
181
182    /// Create a new [`ChunkedArray`] from existing chunks.
183    ///
184    /// # Safety
185    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
186    pub unsafe fn from_chunks(name: PlSmallStr, mut chunks: Vec<ArrayRef>) -> Self {
187        let dtype = match T::get_dtype() {
188            dtype @ DataType::List(_) => from_chunks_list_dtype(&mut chunks, dtype),
189            #[cfg(feature = "dtype-array")]
190            dtype @ DataType::Array(_, _) => from_chunks_list_dtype(&mut chunks, dtype),
191            #[cfg(feature = "dtype-struct")]
192            dtype @ DataType::Struct(_) => from_chunks_list_dtype(&mut chunks, dtype),
193            dt => dt,
194        };
195        Self::from_chunks_and_dtype(name, chunks, dtype)
196    }
197
198    /// # Safety
199    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
200    pub unsafe fn with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
201        ChunkedArray::new_with_compute_len(self.field.clone(), chunks)
202    }
203
204    /// Create a new [`ChunkedArray`] from existing chunks.
205    ///
206    /// # Safety
207    ///
208    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
209    pub unsafe fn from_chunks_and_dtype(
210        name: PlSmallStr,
211        chunks: Vec<ArrayRef>,
212        dtype: DataType,
213    ) -> Self {
214        // assertions in debug mode
215        // that check if the data types in the arrays are as expected
216        #[cfg(debug_assertions)]
217        {
218            if !chunks.is_empty() && !chunks[0].is_empty() && dtype.is_primitive() {
219                assert_eq!(chunks[0].dtype(), &dtype.to_arrow(CompatLevel::newest()))
220            }
221        }
222
223        Self::from_chunks_and_dtype_unchecked(name, chunks, dtype)
224    }
225
226    /// Create a new [`ChunkedArray`] from existing chunks.
227    ///
228    /// # Safety
229    ///
230    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
231    pub(crate) unsafe fn from_chunks_and_dtype_unchecked(
232        name: PlSmallStr,
233        chunks: Vec<ArrayRef>,
234        dtype: DataType,
235    ) -> Self {
236        let field = Arc::new(Field::new(name, dtype));
237        ChunkedArray::new_with_compute_len(field, chunks)
238    }
239
240    pub fn full_null_like(ca: &Self, length: usize) -> Self {
241        let chunks = std::iter::once(T::Array::full_null(
242            length,
243            ca.dtype().to_arrow(CompatLevel::newest()),
244        ));
245        Self::from_chunk_iter_like(ca, chunks)
246    }
247}
248
249impl<T> ChunkedArray<T>
250where
251    T: PolarsNumericType,
252{
253    /// Create a new ChunkedArray by taking ownership of the Vec. This operation is zero copy.
254    pub fn from_vec(name: PlSmallStr, v: Vec<T::Native>) -> Self {
255        Self::with_chunk(name, to_primitive::<T>(v, None))
256    }
257
258    /// Create a new ChunkedArray from a Vec and a validity mask.
259    pub fn from_vec_validity(
260        name: PlSmallStr,
261        values: Vec<T::Native>,
262        buffer: Option<Bitmap>,
263    ) -> Self {
264        let arr = to_array::<T>(values, buffer);
265        ChunkedArray::new_with_compute_len(Arc::new(Field::new(name, T::get_dtype())), vec![arr])
266    }
267
268    /// Create a temporary [`ChunkedArray`] from a slice.
269    ///
270    /// # Safety
271    /// The lifetime will be bound to the lifetime of the slice.
272    /// This will not be checked by the borrowchecker.
273    pub unsafe fn mmap_slice(name: PlSmallStr, values: &[T::Native]) -> Self {
274        Self::with_chunk(name, arrow::ffi::mmap::slice(values))
275    }
276}
277
278impl BooleanChunked {
279    /// Create a temporary [`ChunkedArray`] from a slice.
280    ///
281    /// # Safety
282    /// The lifetime will be bound to the lifetime of the slice.
283    /// This will not be checked by the borrowchecker.
284    pub unsafe fn mmap_slice(name: PlSmallStr, values: &[u8], offset: usize, len: usize) -> Self {
285        let arr = arrow::ffi::mmap::bitmap(values, offset, len).unwrap();
286        Self::with_chunk(name, arr)
287    }
288
289    pub fn from_bitmap(name: PlSmallStr, bitmap: Bitmap) -> Self {
290        Self::with_chunk(
291            name,
292            BooleanArray::new(ArrowDataType::Boolean, bitmap, None),
293        )
294    }
295}
296
297impl<'a, T> From<&'a ChunkedArray<T>> for Vec<Option<T::Physical<'a>>>
298where
299    T: PolarsDataType,
300{
301    fn from(ca: &'a ChunkedArray<T>) -> Self {
302        let mut out = Vec::with_capacity(ca.len());
303        for arr in ca.downcast_iter() {
304            out.extend(arr.iter())
305        }
306        out
307    }
308}
309impl From<StringChunked> for Vec<Option<String>> {
310    fn from(ca: StringChunked) -> Self {
311        ca.iter().map(|opt| opt.map(|s| s.to_string())).collect()
312    }
313}
314
315impl From<BooleanChunked> for Vec<Option<bool>> {
316    fn from(ca: BooleanChunked) -> Self {
317        let mut out = Vec::with_capacity(ca.len());
318        for arr in ca.downcast_iter() {
319            out.extend(arr.iter())
320        }
321        out
322    }
323}