Skip to main content

polars_core/chunked_array/
mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use arrow::compute::utils::combine_validities_and;
9use polars_compute::filter::filter_with_bitmap;
10
11use crate::prelude::{ChunkTakeUnchecked, *};
12
13pub mod ops;
14#[macro_use]
15pub mod arithmetic;
16pub mod builder;
17pub mod cast;
18pub mod collect;
19pub mod comparison;
20pub mod flags;
21pub mod float;
22pub mod iterator;
23#[cfg(feature = "ndarray")]
24pub(crate) mod ndarray;
25
26pub mod arg_min_max;
27#[cfg(feature = "dtype-array")]
28pub(crate) mod array;
29mod binary;
30mod binary_offset;
31mod bitwise;
32#[cfg(feature = "object")]
33mod drop;
34mod from;
35mod from_iterator;
36pub mod from_iterator_par;
37pub(crate) mod list;
38pub(crate) mod logical;
39#[cfg(feature = "object")]
40pub mod object;
41#[cfg(feature = "random")]
42mod random;
43#[cfg(feature = "dtype-struct")]
44mod struct_;
45#[cfg(any(
46    feature = "temporal",
47    feature = "dtype-datetime",
48    feature = "dtype-date"
49))]
50pub mod temporal;
51mod to_vec;
52mod trusted_len;
53pub(crate) use arg_min_max::*;
54use arrow::legacy::prelude::*;
55#[cfg(feature = "dtype-struct")]
56pub use struct_::StructChunked;
57
58use self::flags::{StatisticsFlags, StatisticsFlagsIM};
59use crate::series::IsSorted;
60use crate::utils::{first_non_null, first_null, last_non_null};
61
62pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
63
64/// # ChunkedArray
65///
66/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
67/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
68/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
69///
70/// ```rust
71/// # use polars_core::prelude::*;
72/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
73///     ca.apply_values(|v| v.cos())
74/// }
75/// ```
76///
77/// ## Conversion between Series and ChunkedArrays
78/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
79///
80/// ```rust
81/// # use polars_core::prelude::*;
82/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
83///     series.i32()
84/// }
85///
86/// fn to_series(ca: Int32Chunked) -> Series {
87///     ca.into_series()
88/// }
89/// ```
90///
91/// # Iterators
92///
93/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
94/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
95/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
96///
97/// ```rust
98/// # use polars_core::prelude::*;
99///
100/// fn iter_forward(ca: &Float32Chunked) {
101///     ca.iter()
102///         .for_each(|opt_v| println!("{:?}", opt_v))
103/// }
104///
105/// fn iter_backward(ca: &Float32Chunked) {
106///     ca.iter()
107///         .rev()
108///         .for_each(|opt_v| println!("{:?}", opt_v))
109/// }
110/// ```
111///
112/// # Memory layout
113///
114/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
115/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
116///
117/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
118/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
119///
120/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
121/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
122/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
123///
124/// If you want to have predictable performance
125/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
126/// multiple append operations.
127///
128/// See also [`ChunkedArray::extend`] for appends within a chunk.
129///
130/// # Invariants
131/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
132/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
133///   chunks.
134/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
135///   logical type given by the datatype.
136///
137/// [`List`]: crate::datatypes::DataType::List
138pub struct ChunkedArray<T: PolarsDataType> {
139    pub(crate) field: Arc<Field>,
140    pub(crate) chunks: Vec<ArrayRef>,
141
142    pub(crate) flags: StatisticsFlagsIM,
143
144    length: usize,
145    null_count: usize,
146    _pd: std::marker::PhantomData<T>,
147}
148
149impl<T: PolarsDataType> ChunkedArray<T> {
150    fn should_rechunk(&self) -> bool {
151        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
152    }
153
154    fn optional_rechunk(mut self) -> Self {
155        // Rechunk if we have many small chunks.
156        if self.should_rechunk() {
157            self.rechunk_mut()
158        }
159        self
160    }
161
162    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
163        self
164    }
165
166    /// Series to [`ChunkedArray<T>`]
167    pub fn unpack_series_matching_type<'a>(
168        &self,
169        series: &'a Series,
170    ) -> PolarsResult<&'a ChunkedArray<T>> {
171        polars_ensure!(
172            self.dtype() == series.dtype(),
173            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
174            series.dtype(),
175            self.dtype(),
176        );
177
178        // SAFETY: dtype will be correct.
179        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
180    }
181
182    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
183    ///
184    /// If you want to explicitly the `length` and `null_count`, look at
185    /// [`ChunkedArray::new_with_dims`]
186    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
187        unsafe {
188            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
189            chunked_arr.compute_len();
190            chunked_arr
191        }
192    }
193
194    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
195    /// # Safety
196    /// The length and null_count must be correct.
197    pub unsafe fn new_with_dims(
198        field: Arc<Field>,
199        chunks: Vec<ArrayRef>,
200        length: usize,
201        null_count: usize,
202    ) -> Self {
203        Self {
204            field,
205            chunks,
206            flags: StatisticsFlagsIM::empty(),
207
208            _pd: Default::default(),
209            length,
210            null_count,
211        }
212    }
213
214    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
215        self.get_flags().is_sorted_ascending()
216    }
217
218    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
219        self.get_flags().is_sorted_descending()
220    }
221
222    /// Whether `self` is sorted in any direction.
223    pub(crate) fn is_sorted_any(&self) -> bool {
224        self.get_flags().is_sorted_any()
225    }
226
227    pub fn unset_fast_explode_list(&mut self) {
228        self.set_fast_explode_list(false)
229    }
230
231    pub fn set_fast_explode_list(&mut self, value: bool) {
232        let mut flags = self.flags.get_mut();
233        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
234        self.flags.set_mut(flags);
235    }
236
237    pub fn get_fast_explode_list(&self) -> bool {
238        self.get_flags().can_fast_explode_list()
239    }
240
241    pub fn get_flags(&self) -> StatisticsFlags {
242        self.flags.get()
243    }
244
245    /// Set flags for the [`ChunkedArray`]
246    pub fn set_flags(&mut self, flags: StatisticsFlags) {
247        self.flags = StatisticsFlagsIM::new(flags);
248    }
249
250    pub fn is_sorted_flag(&self) -> IsSorted {
251        self.get_flags().is_sorted()
252    }
253
254    pub fn retain_flags_from<U: PolarsDataType>(
255        &mut self,
256        from: &ChunkedArray<U>,
257        retain_flags: StatisticsFlags,
258    ) {
259        let flags = from.flags.get();
260        // Try to avoid write contention.
261        if !flags.is_empty() {
262            self.set_flags(flags & retain_flags)
263        }
264    }
265
266    /// Set the 'sorted' bit meta info.
267    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
268        let mut flags = self.flags.get_mut();
269        flags.set_sorted(sorted);
270        self.flags.set_mut(flags);
271    }
272
273    /// Set the 'sorted' bit meta info.
274    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
275        let mut out = self.clone();
276        out.set_sorted_flag(sorted);
277        out
278    }
279
280    pub fn first_null(&self) -> Option<usize> {
281        if self.null_count() == 0 {
282            None
283        }
284        // We now know there is at least 1 non-null item in the array, and self.len() > 0
285        else if self.null_count() == self.len() {
286            Some(0)
287        } else if self.is_sorted_any() {
288            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
289                // nulls are all at the start
290                0
291            } else {
292                // nulls are all at the end
293                self.null_count()
294            };
295
296            debug_assert!(
297                // If we are lucky this catches something.
298                unsafe { self.get_unchecked(out) }.is_some(),
299                "incorrect sorted flag"
300            );
301
302            Some(out)
303        } else {
304            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
305        }
306    }
307
308    /// Get the index of the first non null value in this [`ChunkedArray`].
309    pub fn first_non_null(&self) -> Option<usize> {
310        if self.null_count() == self.len() {
311            None
312        }
313        // We now know there is at least 1 non-null item in the array, and self.len() > 0
314        else if self.null_count() == 0 {
315            Some(0)
316        } else if self.is_sorted_any() {
317            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
318                // nulls are all at the start
319                self.null_count()
320            } else {
321                // nulls are all at the end
322                0
323            };
324
325            debug_assert!(
326                // If we are lucky this catches something.
327                unsafe { self.get_unchecked(out) }.is_some(),
328                "incorrect sorted flag"
329            );
330
331            Some(out)
332        } else {
333            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
334        }
335    }
336
337    /// Get the index of the last non null value in this [`ChunkedArray`].
338    pub fn last_non_null(&self) -> Option<usize> {
339        if self.null_count() == self.len() {
340            None
341        }
342        // We now know there is at least 1 non-null item in the array, and self.len() > 0
343        else if self.null_count() == 0 {
344            Some(self.len() - 1)
345        } else if self.is_sorted_any() {
346            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
347                // nulls are all at the start
348                self.len() - 1
349            } else {
350                // nulls are all at the end
351                self.len() - self.null_count() - 1
352            };
353
354            debug_assert!(
355                // If we are lucky this catches something.
356                unsafe { self.get_unchecked(out) }.is_some(),
357                "incorrect sorted flag"
358            );
359
360            Some(out)
361        } else {
362            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
363        }
364    }
365
366    pub fn drop_nulls(&self) -> Self {
367        if self.null_count() == 0 {
368            self.clone()
369        } else {
370            let chunks = self
371                .downcast_iter()
372                .map(|arr| {
373                    if arr.null_count() == 0 {
374                        arr.to_boxed()
375                    } else {
376                        filter_with_bitmap(arr, arr.validity().unwrap())
377                    }
378                })
379                .collect();
380            unsafe {
381                Self::new_with_dims(
382                    self.field.clone(),
383                    chunks,
384                    self.len() - self.null_count(),
385                    0,
386                )
387            }
388        }
389    }
390
391    /// Get the buffer of bits representing null values
392    #[inline]
393    #[allow(clippy::type_complexity)]
394    pub fn iter_validities(
395        &self,
396    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
397        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
398            arr.validity()
399        }
400        self.chunks.iter().map(to_validity)
401    }
402
403    #[inline]
404    /// Return if any the chunks in this [`ChunkedArray`] have nulls.
405    pub fn has_nulls(&self) -> bool {
406        self.null_count > 0
407    }
408
409    /// Shrink the capacity of this array to fit its length.
410    pub fn shrink_to_fit(&mut self) {
411        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
412    }
413
414    pub fn clear(&self) -> Self {
415        // SAFETY: we keep the correct dtype
416        let mut ca = unsafe {
417            self.copy_with_chunks(vec![new_empty_array(
418                self.chunks.first().unwrap().dtype().clone(),
419            )])
420        };
421
422        use StatisticsFlags as F;
423        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
424        ca
425    }
426
427    /// Unpack a [`Series`] to the same physical type.
428    ///
429    /// # Safety
430    ///
431    /// This is unsafe as the dtype may be incorrect and
432    /// is assumed to be correct in other safe code.
433    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
434        &self,
435        series: &'a Series,
436    ) -> &'a ChunkedArray<T> {
437        let series_trait = &**series;
438        if self.dtype() == series.dtype() {
439            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
440        } else {
441            use DataType::*;
442            match (self.dtype(), series.dtype()) {
443                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
444                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
445                },
446                _ => panic!(
447                    "cannot unpack series {:?} into matching type {:?}",
448                    series,
449                    self.dtype()
450                ),
451            }
452        }
453    }
454
455    /// Returns an iterator over the lengths of the chunks of the array.
456    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
457        self.chunks.iter().map(|chunk| chunk.len())
458    }
459
460    /// A reference to the chunks
461    #[inline]
462    pub fn chunks(&self) -> &Vec<ArrayRef> {
463        &self.chunks
464    }
465
466    /// A mutable reference to the chunks
467    ///
468    /// # Safety
469    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
470    /// And the `null_count` remains correct.
471    #[inline]
472    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
473        &mut self.chunks
474    }
475
476    /// Returns true if contains a single chunk and has no null values
477    pub fn is_optimal_aligned(&self) -> bool {
478        self.chunks.len() == 1 && self.null_count() == 0
479    }
480
481    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
482    ///
483    /// # Safety
484    /// The caller must ensure the dtypes of the chunks are correct
485    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
486        Self::new_with_compute_len(self.field.clone(), chunks)
487    }
488
489    /// Get data type of [`ChunkedArray`].
490    pub fn dtype(&self) -> &DataType {
491        self.field.dtype()
492    }
493
494    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
495        self.field = Arc::new(Field::new(self.name().clone(), dtype))
496    }
497
498    /// Name of the [`ChunkedArray`].
499    pub fn name(&self) -> &PlSmallStr {
500        self.field.name()
501    }
502
503    /// Get a reference to the field.
504    pub fn ref_field(&self) -> &Field {
505        &self.field
506    }
507
508    /// Rename this [`ChunkedArray`].
509    pub fn rename(&mut self, name: PlSmallStr) {
510        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
511    }
512
513    /// Return this [`ChunkedArray`] with a new name.
514    pub fn with_name(mut self, name: PlSmallStr) -> Self {
515        self.rename(name);
516        self
517    }
518}
519
520impl<T> ChunkedArray<T>
521where
522    T: PolarsDataType,
523{
524    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
525    /// indicates a NULL value.
526    ///
527    /// # Panics
528    /// This function will panic if `idx` is out of bounds.
529    #[inline]
530    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
531        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
532        assert!(
533            chunk_idx < self.chunks().len(),
534            "index: {} out of bounds for len: {}",
535            idx,
536            self.len()
537        );
538        unsafe {
539            let arr = self.downcast_get_unchecked(chunk_idx);
540            assert!(
541                arr_idx < arr.len(),
542                "index: {} out of bounds for len: {}",
543                idx,
544                self.len()
545            );
546            arr.get_unchecked(arr_idx)
547        }
548    }
549
550    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
551    /// indicates a NULL value.
552    ///
553    /// # Safety
554    /// It is the callers responsibility that the `idx < self.len()`.
555    #[inline]
556    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
557        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
558
559        unsafe {
560            // SAFETY: up to the caller to make sure the index is valid.
561            self.downcast_get_unchecked(chunk_idx)
562                .get_unchecked(arr_idx)
563        }
564    }
565
566    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
567    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
568    ///
569    /// # Safety
570    /// It is the callers responsibility that the `idx < self.len()`.
571    #[inline]
572    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
573        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
574
575        unsafe {
576            // SAFETY: up to the caller to make sure the index is valid.
577            self.downcast_get_unchecked(chunk_idx)
578                .value_unchecked(arr_idx)
579        }
580    }
581
582    /// # Panics
583    /// Panics if the [`ChunkedArray`] is empty.
584    #[inline]
585    pub fn first(&self) -> Option<T::Physical<'_>> {
586        self.iter().next().unwrap()
587    }
588
589    /// # Panics
590    /// Panics if the [`ChunkedArray`] is empty.
591    #[inline]
592    pub fn last(&self) -> Option<T::Physical<'_>> {
593        let arr = self
594            .downcast_iter()
595            .rev()
596            .find(|arr| !arr.is_empty())
597            .unwrap();
598        unsafe { arr.get_unchecked(arr.len() - 1) }
599    }
600
601    pub fn set_validity(&mut self, validity: Option<Bitmap>) {
602        assert!(
603            !self.dtype().is_struct(),
604            "set_outer_validity should be used for struct types"
605        );
606        if let Some(v) = &validity {
607            assert_eq!(self.len(), v.len());
608        }
609        let mut i = 0;
610        for chunk in unsafe { self.chunks_mut() } {
611            *chunk =
612                chunk.with_validity(validity.as_ref().map(|v| v.clone().sliced(i, chunk.len())));
613            i += chunk.len();
614        }
615        self.null_count = validity.map(|v| v.unset_bits()).unwrap_or(0);
616        self.set_fast_explode_list(false);
617    }
618
619    pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {
620        self.set_validity(validity);
621        self
622    }
623}
624
625impl<T> ChunkedArray<T>
626where
627    T: PolarsDataType,
628    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
629{
630    /// Deposit values into nulls with a certain validity mask.
631    pub fn deposit(&self, validity: &Bitmap) -> Self {
632        let set_bits = validity.set_bits();
633
634        assert_eq!(self.len(), set_bits);
635
636        if set_bits == validity.len() {
637            return self.clone();
638        }
639
640        if set_bits == 0 {
641            return Self::full_null_like(self, validity.len());
642        }
643
644        let mut null_mask = validity.clone();
645
646        let mut gather_idxs = Vec::with_capacity(validity.len());
647        let leading_nulls = null_mask.take_leading_zeros();
648        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));
649
650        let mut i = 0 as IdxSize;
651        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
652            i += IdxSize::from(v);
653            i
654        }));
655
656        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
657        ca.set_validity(combine_validities_and(
658            Some(validity),
659            ca.rechunk_validity().as_ref(),
660        ));
661        ca
662    }
663}
664
665impl ListChunked {
666    #[inline]
667    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
668        unsafe {
669            Some(Series::from_chunks_and_dtype_unchecked(
670                self.name().clone(),
671                vec![self.get(idx)?],
672                &self.inner_dtype().to_physical(),
673            ))
674        }
675    }
676
677    pub fn has_empty_lists(&self) -> bool {
678        for arr in self.downcast_iter() {
679            if arr.is_empty() {
680                continue;
681            }
682
683            if match arr.validity() {
684                None => arr.offsets().lengths().any(|l| l == 0),
685                Some(validity) => arr
686                    .offsets()
687                    .lengths()
688                    .enumerate()
689                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
690            } {
691                return true;
692            }
693        }
694
695        false
696    }
697
698    pub fn has_masked_out_values(&self) -> bool {
699        for arr in self.downcast_iter() {
700            if arr.is_empty() {
701                continue;
702            }
703
704            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
705                return true;
706            }
707
708            let Some(validity) = arr.validity() else {
709                continue;
710            };
711            if validity.set_bits() == 0 {
712                continue;
713            }
714
715            // @Performance: false_idx_iter
716            for i in (!validity).true_idx_iter() {
717                if arr.offsets().length_at(i) > 0 {
718                    return true;
719                }
720            }
721        }
722
723        false
724    }
725}
726
727#[cfg(feature = "dtype-array")]
728impl ArrayChunked {
729    #[inline]
730    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
731        unsafe {
732            Some(Series::from_chunks_and_dtype_unchecked(
733                self.name().clone(),
734                vec![self.get(idx)?],
735                &self.inner_dtype().to_physical(),
736            ))
737        }
738    }
739
740    pub fn from_aligned_values(
741        name: PlSmallStr,
742        inner_dtype: &DataType,
743        width: usize,
744        chunks: Vec<ArrayRef>,
745        length: usize,
746    ) -> Self {
747        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
748        let arrow_dtype = dtype.to_arrow(CompatLevel::newest()).to_storage_recursive();
749        let field = Arc::new(Field::new(name, dtype));
750        if width == 0 {
751            use arrow::array::builder::{ArrayBuilder, make_builder};
752            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
753            return ArrayChunked::new_with_compute_len(
754                field,
755                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
756            );
757        }
758        let mut total_len = 0;
759        let chunks = chunks
760            .iter()
761            .map(|chunk| {
762                debug_assert_eq!(chunk.len() % width, 0);
763                let chunk_len = chunk.len() / width;
764                total_len += chunk_len;
765                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk.clone(), None)
766                    .into_boxed()
767            })
768            .collect();
769        debug_assert_eq!(total_len, length);
770
771        unsafe { Self::new_with_dims(field, chunks, length, 0) }
772    }
773
774    /// Turn the ArrayChunked into the ListChunked with the same items.
775    ///
776    /// This will always zero copy the values into the ListChunked.
777    pub fn to_list(&self) -> ListChunked {
778        let inner_dtype = self.inner_dtype();
779        let chunks = self
780            .downcast_iter()
781            .map(|chunk| {
782                use arrow::offset::OffsetsBuffer;
783
784                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
785                let dtype = inner_dtype.clone().to_large_list(true);
786
787                let offsets = (0..=chunk.len())
788                    .map(|i| (i * self.width()) as i64)
789                    .collect::<Vec<i64>>();
790
791                // SAFETY: We created our offsets in ascending manner.
792                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
793
794                ListArray::<i64>::new(
795                    dtype,
796                    offsets,
797                    chunk.values().clone(),
798                    chunk.validity().cloned(),
799                )
800                .into_boxed()
801            })
802            .collect();
803
804        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
805        let mut ca = unsafe {
806            ListChunked::new_with_dims(
807                Arc::new(Field::new(
808                    self.name().clone(),
809                    DataType::List(Box::new(inner_dtype.clone())),
810                )),
811                chunks,
812                self.len(),
813                self.null_count(),
814            )
815        };
816        ca.set_fast_explode_list(!self.has_nulls());
817        ca
818    }
819}
820
821impl<T> ChunkedArray<T>
822where
823    T: PolarsDataType,
824{
825    /// Should be used to match the chunk_id of another [`ChunkedArray`].
826    /// # Panics
827    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
828    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
829    where
830        I: Iterator<Item = usize>,
831    {
832        debug_assert!(self.chunks.len() == 1);
833        // Takes a ChunkedArray containing a single chunk.
834        let slice = |ca: &Self| {
835            let array = &ca.chunks[0];
836
837            let mut offset = 0;
838            let chunks = chunk_id
839                .map(|len| {
840                    // SAFETY: within bounds.
841                    debug_assert!((offset + len) <= array.len());
842                    let out = unsafe { array.sliced_unchecked(offset, len) };
843                    offset += len;
844                    out
845                })
846                .collect();
847
848            debug_assert_eq!(offset, array.len());
849
850            // SAFETY: We just slice the original chunks, their type will not change.
851            unsafe {
852                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
853            }
854        };
855
856        if self.chunks.len() != 1 {
857            let out = self.rechunk();
858            slice(&out)
859        } else {
860            slice(self)
861        }
862    }
863}
864
865impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
866    fn as_ref_dtype(&self) -> &DataType {
867        self.dtype()
868    }
869}
870
871pub(crate) trait AsSinglePtr: AsRefDataType {
872    /// Rechunk and return a ptr to the start of the array
873    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
874        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
875    }
876}
877
878impl<T> AsSinglePtr for ChunkedArray<T>
879where
880    T: PolarsNumericType,
881{
882    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
883        self.rechunk_mut();
884        let a = self.data_views().next().unwrap();
885        let ptr = a.as_ptr();
886        Ok(ptr as usize)
887    }
888}
889
890impl AsSinglePtr for BooleanChunked {}
891impl AsSinglePtr for ListChunked {}
892#[cfg(feature = "dtype-array")]
893impl AsSinglePtr for ArrayChunked {}
894impl AsSinglePtr for StringChunked {}
895impl AsSinglePtr for BinaryChunked {}
896#[cfg(feature = "object")]
897impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
898
899pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
900    SingleNoNull(&'a T::Array),
901    Single(&'a T::Array),
902    MultiNoNull(&'a ChunkedArray<T>),
903    Multi(&'a ChunkedArray<T>),
904}
905
906impl<T> ChunkedArray<T>
907where
908    T: PolarsDataType,
909{
910    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
911        if self.chunks.len() == 1 {
912            let arr = self.downcast_iter().next().unwrap();
913            return if arr.null_count() == 0 {
914                ChunkedArrayLayout::SingleNoNull(arr)
915            } else {
916                ChunkedArrayLayout::Single(arr)
917            };
918        }
919
920        if self.downcast_iter().all(|a| a.null_count() == 0) {
921            ChunkedArrayLayout::MultiNoNull(self)
922        } else {
923            ChunkedArrayLayout::Multi(self)
924        }
925    }
926}
927
928impl<T> ChunkedArray<T>
929where
930    T: PolarsNumericType,
931{
932    /// Returns the values of the array as a contiguous slice.
933    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
934        polars_ensure!(
935            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
936            ComputeError: "chunked array is not contiguous"
937        );
938        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
939    }
940
941    /// Returns the values of the array as a contiguous mutable slice.
942    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
943        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
944            // SAFETY, we will not swap the PrimitiveArray.
945            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
946            arr.get_mut_values()
947        } else {
948            None
949        }
950    }
951
952    /// Get slices of the underlying arrow data.
953    /// NOTE: null values should be taken into account by the user of these slices as they are handled
954    /// separately
955    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
956        self.downcast_iter().map(|arr| arr.values().as_slice())
957    }
958
959    #[allow(clippy::wrong_self_convention)]
960    pub fn into_no_null_iter(
961        &self,
962    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
963    {
964        // .copied was significantly slower in benchmark, next call did not inline?
965        #[allow(clippy::map_clone)]
966        // we know the iterators len
967        unsafe {
968            self.data_views()
969                .flatten()
970                .map(|v| *v)
971                .trust_my_length(self.len())
972        }
973    }
974}
975
976impl<T: PolarsDataType> Clone for ChunkedArray<T> {
977    fn clone(&self) -> Self {
978        ChunkedArray {
979            field: self.field.clone(),
980            chunks: self.chunks.clone(),
981            flags: self.flags.clone(),
982
983            _pd: Default::default(),
984            length: self.length,
985            null_count: self.null_count,
986        }
987    }
988}
989
990impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
991    fn as_ref(&self) -> &ChunkedArray<T> {
992        self
993    }
994}
995
996impl ValueSize for ListChunked {
997    fn get_values_size(&self) -> usize {
998        self.chunks
999            .iter()
1000            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1001    }
1002}
1003
1004#[cfg(feature = "dtype-array")]
1005impl ValueSize for ArrayChunked {
1006    fn get_values_size(&self) -> usize {
1007        self.chunks
1008            .iter()
1009            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1010    }
1011}
1012impl ValueSize for StringChunked {
1013    fn get_values_size(&self) -> usize {
1014        self.chunks
1015            .iter()
1016            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1017    }
1018}
1019
1020impl ValueSize for BinaryOffsetChunked {
1021    fn get_values_size(&self) -> usize {
1022        self.chunks
1023            .iter()
1024            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1025    }
1026}
1027
1028pub(crate) fn to_primitive<T: PolarsNumericType>(
1029    values: Vec<T::Native>,
1030    validity: Option<Bitmap>,
1031) -> PrimitiveArray<T::Native> {
1032    PrimitiveArray::new(
1033        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1034        values.into(),
1035        validity,
1036    )
1037}
1038
1039pub(crate) fn to_array<T: PolarsNumericType>(
1040    values: Vec<T::Native>,
1041    validity: Option<Bitmap>,
1042) -> ArrayRef {
1043    Box::new(to_primitive::<T>(values, validity))
1044}
1045
1046impl<T: PolarsDataType> Default for ChunkedArray<T> {
1047    fn default() -> Self {
1048        let dtype = T::get_static_dtype();
1049        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1050        ChunkedArray {
1051            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1052            // Invariant: always has 1 chunk.
1053            chunks: vec![new_empty_array(arrow_dtype)],
1054            flags: StatisticsFlagsIM::empty(),
1055
1056            _pd: Default::default(),
1057            length: 0,
1058            null_count: 0,
1059        }
1060    }
1061}
1062
1063#[cfg(test)]
1064pub(crate) mod test {
1065    use crate::prelude::*;
1066
1067    pub(crate) fn get_chunked_array() -> Int32Chunked {
1068        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
1069    }
1070
1071    #[test]
1072    fn test_sort() {
1073        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
1074        let b = a
1075            .sort(false)
1076            .iter()
1077            .map(|opt| opt.unwrap())
1078            .collect::<Vec<_>>();
1079        assert_eq!(b, [1, 2, 3, 9]);
1080        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
1081        let a = a.sort(false);
1082        let b = a.iter().collect::<Vec<_>>();
1083        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
1084        assert!(a.is_sorted_ascending_flag());
1085    }
1086
1087    #[test]
1088    fn arithmetic() {
1089        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
1090        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);
1091
1092        // Not really asserting anything here but still making sure the code is exercised
1093        // This (and more) is properly tested from the integration test suite and Python bindings.
1094        println!("{:?}", a + b);
1095        println!("{:?}", a - b);
1096        println!("{:?}", a * b);
1097        println!("{:?}", a / b);
1098    }
1099
1100    #[test]
1101    fn iter() {
1102        let s1 = get_chunked_array();
1103        // sum
1104        assert_eq!(s1.iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
1105    }
1106
1107    #[test]
1108    fn limit() {
1109        let a = get_chunked_array();
1110        let b = a.limit(2);
1111        println!("{b:?}");
1112        assert_eq!(b.len(), 2)
1113    }
1114
1115    #[test]
1116    fn filter() {
1117        let a = get_chunked_array();
1118        let b = a
1119            .filter(&BooleanChunked::new(
1120                PlSmallStr::from_static("filter"),
1121                &[true, false, false],
1122            ))
1123            .unwrap();
1124        assert_eq!(b.len(), 1);
1125        assert_eq!(b.iter().next(), Some(Some(1)));
1126    }
1127
1128    #[test]
1129    fn aggregates() {
1130        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
1131        assert_eq!(a.max(), Some(100));
1132        assert_eq!(a.min(), Some(1));
1133        assert_eq!(a.sum(), Some(120))
1134    }
1135
1136    #[test]
1137    fn take() {
1138        let a = get_chunked_array();
1139        let new = a.take(&[0 as IdxSize, 1]).unwrap();
1140        assert_eq!(new.len(), 2)
1141    }
1142
1143    #[test]
1144    fn cast() {
1145        let a = get_chunked_array();
1146        let b = a.cast(&DataType::Int64).unwrap();
1147        assert_eq!(b.dtype(), &DataType::Int64)
1148    }
1149
1150    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
1151    where
1152        T: PolarsNumericType,
1153    {
1154        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
1155    }
1156
1157    #[test]
1158    fn slice() {
1159        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
1160        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
1161        first.append(&second).unwrap();
1162        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
1163        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
1164        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
1165        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
1166        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
1167        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
1168        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);
1169
1170        assert_eq!(first.slice(-7, 2).len(), 1);
1171        assert_eq!(first.slice(-3, 4).len(), 3);
1172        assert_eq!(first.slice(3, 4).len(), 3);
1173        assert_eq!(first.slice(10, 4).len(), 0);
1174    }
1175
1176    #[test]
1177    fn sorting() {
1178        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
1179        let sorted = s.sort(false);
1180        assert_slice_equal(&sorted, &[2, 4, 9]);
1181        let sorted = s.sort(true);
1182        assert_slice_equal(&sorted, &[9, 4, 2]);
1183
1184        let s: StringChunked = ["b", "a", "z"].iter().collect();
1185        let sorted = s.sort(false);
1186        assert_eq!(
1187            sorted.iter().collect::<Vec<_>>(),
1188            &[Some("a"), Some("b"), Some("z")]
1189        );
1190        let sorted = s.sort(true);
1191        assert_eq!(
1192            sorted.iter().collect::<Vec<_>>(),
1193            &[Some("z"), Some("b"), Some("a")]
1194        );
1195        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
1196        let sorted = s.sort(false);
1197        assert_eq!(
1198            sorted.iter().collect::<Vec<_>>(),
1199            &[None, Some("b"), Some("z")]
1200        );
1201    }
1202
1203    #[test]
1204    fn reverse() {
1205        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
1206        // path with continuous slice
1207        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
1208        // path with options
1209        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
1210        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
1211        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
1212        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);
1213
1214        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
1215        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);
1216
1217        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
1218        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
1219    }
1220
1221    #[test]
1222    #[cfg(feature = "dtype-categorical")]
1223    fn test_iter_categorical() {
1224        let ca = StringChunked::new(
1225            PlSmallStr::EMPTY,
1226            &[Some("foo"), None, Some("bar"), Some("ham")],
1227        );
1228        let cats = Categories::new(
1229            PlSmallStr::EMPTY,
1230            PlSmallStr::EMPTY,
1231            CategoricalPhysical::U32,
1232        );
1233        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
1234        let ca = ca.cat32().unwrap();
1235        let v: Vec<_> = ca.physical().iter().collect();
1236        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
1237    }
1238
1239    #[test]
1240    #[ignore]
1241    fn test_shrink_to_fit() {
1242        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
1243        builder.append_value("foo");
1244        let mut arr = builder.finish();
1245        let before = arr
1246            .chunks()
1247            .iter()
1248            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1249            .sum::<usize>();
1250        arr.shrink_to_fit();
1251        let after = arr
1252            .chunks()
1253            .iter()
1254            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1255            .sum::<usize>();
1256        assert!(before > after);
1257    }
1258}