polars_core/chunked_array/mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use polars_compute::filter::filter_with_bitmap;
9
10use crate::prelude::{ChunkTakeUnchecked, *};
11
12pub mod ops;
13#[macro_use]
14pub mod arithmetic;
15pub mod builder;
16pub mod cast;
17pub mod collect;
18pub mod comparison;
19pub mod flags;
20pub mod float;
21pub mod iterator;
22#[cfg(feature = "ndarray")]
23pub(crate) mod ndarray;
24
25pub mod arg_min_max;
26#[cfg(feature = "dtype-array")]
27pub(crate) mod array;
28mod binary;
29mod binary_offset;
30mod bitwise;
31#[cfg(feature = "object")]
32mod drop;
33mod from;
34mod from_iterator;
35pub mod from_iterator_par;
36pub(crate) mod list;
37pub(crate) mod logical;
38#[cfg(feature = "object")]
39pub mod object;
40#[cfg(feature = "random")]
41mod random;
42#[cfg(feature = "dtype-struct")]
43mod struct_;
44#[cfg(any(
45    feature = "temporal",
46    feature = "dtype-datetime",
47    feature = "dtype-date"
48))]
49pub mod temporal;
50mod to_vec;
51mod trusted_len;
52pub(crate) use arg_min_max::*;
53use arrow::legacy::prelude::*;
54#[cfg(feature = "dtype-struct")]
55pub use struct_::StructChunked;
56
57use self::flags::{StatisticsFlags, StatisticsFlagsIM};
58use crate::series::IsSorted;
59use crate::utils::{first_non_null, first_null, last_non_null};
60
61pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
62
63/// # ChunkedArray
64///
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
/// us to apply closures to the data and collect the results into a [`ChunkedArray`] of the same type `T`.
/// Below we use `apply_values` to apply the cosine function to the values of a [`ChunkedArray`].
68///
69/// ```rust
70/// # use polars_core::prelude::*;
71/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
72///     ca.apply_values(|v| v.cos())
73/// }
74/// ```
75///
76/// ## Conversion between Series and ChunkedArrays
77/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
78///
79/// ```rust
80/// # use polars_core::prelude::*;
81/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
82///     series.i32()
83/// }
84///
85/// fn to_series(ca: Int32Chunked) -> Series {
86///     ca.into_series()
87/// }
88/// ```
89///
90/// # Iterators
91///
92/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
93/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
94/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
95///
96/// ```rust
97/// # use polars_core::prelude::*;
98///
99/// fn iter_forward(ca: &Float32Chunked) {
100///     ca.iter()
101///         .for_each(|opt_v| println!("{:?}", opt_v))
102/// }
103///
104/// fn iter_backward(ca: &Float32Chunked) {
105///     ca.iter()
106///         .rev()
107///         .for_each(|opt_v| println!("{:?}", opt_v))
108/// }
109/// ```
110///
111/// # Memory layout
112///
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as the backend for the memory layout.
/// Arrow's memory is immutable, which makes it possible to create multiple zero-copy (sub)-views of a single array.
///
/// To be able to append data, Polars appends new chunks of memory, hence the [`ChunkedArray<T>`] data structure.
/// Appends are cheap because they do not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
118///
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access, because there is an extra indirection
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
/// When multiplying two [`ChunkedArray`]s with different chunk sizes, for instance, they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD).
///
/// If you want predictable performance
/// (no unexpected re-allocation of memory), it is advised to call [`ChunkedArray::rechunk`] after
/// multiple append operations.
126///
127/// See also [`ChunkedArray::extend`] for appends within a chunk.
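///
/// For example, a minimal sketch of a cheap append (`append` and `chunks` are defined on
/// [`ChunkedArray`] in this module):
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn cheap_append(a: &mut Int32Chunked, b: &Int32Chunked) -> PolarsResult<()> {
///     let n_chunks_before = a.chunks().len();
///     a.append(b)?; // cheap: `b`'s chunks are added, no reallocation of existing data
///     assert!(a.chunks().len() > n_chunks_before);
///     // For predictable random-access performance afterwards, rechunk,
///     // e.g. via `ChunkedArray::rechunk`.
///     Ok(())
/// }
/// ```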
128///
129/// # Invariants
130/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
131/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
132///   chunks.
133/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
134///   logical type given by the datatype.
135///
136/// [`List`]: crate::datatypes::DataType::List
137pub struct ChunkedArray<T: PolarsDataType> {
138    pub(crate) field: Arc<Field>,
139    pub(crate) chunks: Vec<ArrayRef>,
140
141    pub(crate) flags: StatisticsFlagsIM,
142
143    length: usize,
144    null_count: usize,
145    _pd: std::marker::PhantomData<T>,
146}
147
148impl<T: PolarsDataType> ChunkedArray<T> {
149    fn should_rechunk(&self) -> bool {
150        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
151    }
152
153    fn optional_rechunk(mut self) -> Self {
154        // Rechunk if we have many small chunks.
155        if self.should_rechunk() {
156            self.rechunk_mut()
157        }
158        self
159    }
160
161    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
162        self
163    }
164
    /// Unpack a [`Series`] into a [`ChunkedArray<T>`]; errors if the [`Series`] dtype does not match `self`'s dtype.
166    pub fn unpack_series_matching_type<'a>(
167        &self,
168        series: &'a Series,
169    ) -> PolarsResult<&'a ChunkedArray<T>> {
170        polars_ensure!(
171            self.dtype() == series.dtype(),
172            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
173            series.dtype(),
174            self.dtype(),
175        );
176
177        // SAFETY: dtype will be correct.
178        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
179    }
180
181    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
182    ///
    /// If you want to set the `length` and `null_count` explicitly, look at
    /// [`ChunkedArray::new_with_dims`].
185    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
186        unsafe {
187            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
188            chunked_arr.compute_len();
189            chunked_arr
190        }
191    }
192
193    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
194    /// # Safety
195    /// The length and null_count must be correct.
196    pub unsafe fn new_with_dims(
197        field: Arc<Field>,
198        chunks: Vec<ArrayRef>,
199        length: usize,
200        null_count: usize,
201    ) -> Self {
202        Self {
203            field,
204            chunks,
205            flags: StatisticsFlagsIM::empty(),
206
207            _pd: Default::default(),
208            length,
209            null_count,
210        }
211    }
212
213    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
214        self.get_flags().is_sorted_ascending()
215    }
216
217    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
218        self.get_flags().is_sorted_descending()
219    }
220
221    /// Whether `self` is sorted in any direction.
222    pub(crate) fn is_sorted_any(&self) -> bool {
223        self.get_flags().is_sorted_any()
224    }
225
226    pub fn unset_fast_explode_list(&mut self) {
227        self.set_fast_explode_list(false)
228    }
229
230    pub fn set_fast_explode_list(&mut self, value: bool) {
231        let mut flags = self.flags.get_mut();
232        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
233        self.flags.set_mut(flags);
234    }
235
236    pub fn get_fast_explode_list(&self) -> bool {
237        self.get_flags().can_fast_explode_list()
238    }
239
240    pub fn get_flags(&self) -> StatisticsFlags {
241        self.flags.get()
242    }
243
244    /// Set flags for the [`ChunkedArray`]
245    pub fn set_flags(&mut self, flags: StatisticsFlags) {
246        self.flags = StatisticsFlagsIM::new(flags);
247    }
248
249    pub fn is_sorted_flag(&self) -> IsSorted {
250        self.get_flags().is_sorted()
251    }
252
253    pub fn retain_flags_from<U: PolarsDataType>(
254        &mut self,
255        from: &ChunkedArray<U>,
256        retain_flags: StatisticsFlags,
257    ) {
258        let flags = from.flags.get();
259        // Try to avoid write contention.
260        if !flags.is_empty() {
261            self.set_flags(flags & retain_flags)
262        }
263    }
264
265    /// Set the 'sorted' bit meta info.
266    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
267        let mut flags = self.flags.get_mut();
268        flags.set_sorted(sorted);
269        self.flags.set_mut(flags);
270    }
271
    /// Return a clone of this [`ChunkedArray`] with the 'sorted' bit meta info set.
273    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
274        let mut out = self.clone();
275        out.set_sorted_flag(sorted);
276        out
277    }
278
    /// Get the index of the first null value in this [`ChunkedArray`].
    pub fn first_null(&self) -> Option<usize> {
        if self.null_count() == 0 {
            None
        }
        // We now know there is at least 1 null item in the array, and self.len() > 0
        else if self.null_count() == self.len() {
            Some(0)
        } else if self.is_sorted_any() {
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                0
            } else {
                // nulls are all at the end
                self.len() - self.null_count()
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_none(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
        }
    }
306
307    /// Get the index of the first non null value in this [`ChunkedArray`].
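    ///
    /// For example (the column name `"a"` is arbitrary):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[None, Some(1), None, Some(2)]);
    /// assert_eq!(ca.first_non_null(), Some(1));
    /// assert_eq!(ca.last_non_null(), Some(3));
    /// ```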
308    pub fn first_non_null(&self) -> Option<usize> {
309        if self.null_count() == self.len() {
310            None
311        }
312        // We now know there is at least 1 non-null item in the array, and self.len() > 0
313        else if self.null_count() == 0 {
314            Some(0)
315        } else if self.is_sorted_any() {
316            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
317                // nulls are all at the start
318                self.null_count()
319            } else {
320                // nulls are all at the end
321                0
322            };
323
324            debug_assert!(
325                // If we are lucky this catches something.
326                unsafe { self.get_unchecked(out) }.is_some(),
327                "incorrect sorted flag"
328            );
329
330            Some(out)
331        } else {
332            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
333        }
334    }
335
336    /// Get the index of the last non null value in this [`ChunkedArray`].
337    pub fn last_non_null(&self) -> Option<usize> {
338        if self.null_count() == self.len() {
339            None
340        }
341        // We now know there is at least 1 non-null item in the array, and self.len() > 0
342        else if self.null_count() == 0 {
343            Some(self.len() - 1)
344        } else if self.is_sorted_any() {
345            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
346                // nulls are all at the start
347                self.len() - 1
348            } else {
349                // nulls are all at the end
350                self.len() - self.null_count() - 1
351            };
352
353            debug_assert!(
354                // If we are lucky this catches something.
355                unsafe { self.get_unchecked(out) }.is_some(),
356                "incorrect sorted flag"
357            );
358
359            Some(out)
360        } else {
361            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
362        }
363    }
364
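    /// Return a new [`ChunkedArray`] with all null values removed.
    ///
    /// A short example:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[Some(1), None, Some(3)]);
    /// assert_eq!(Vec::from(&ca.drop_nulls()), &[Some(1), Some(3)]);
    /// ```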
365    pub fn drop_nulls(&self) -> Self {
366        if self.null_count() == 0 {
367            self.clone()
368        } else {
369            let chunks = self
370                .downcast_iter()
371                .map(|arr| {
372                    if arr.null_count() == 0 {
373                        arr.to_boxed()
374                    } else {
375                        filter_with_bitmap(arr, arr.validity().unwrap())
376                    }
377                })
378                .collect();
379            unsafe {
380                Self::new_with_dims(
381                    self.field.clone(),
382                    chunks,
383                    self.len() - self.null_count(),
384                    0,
385                )
386            }
387        }
388    }
389
    /// Get the buffers of bits representing the null values, one per chunk.
391    #[inline]
392    #[allow(clippy::type_complexity)]
393    pub fn iter_validities(
394        &self,
395    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
396        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
397            arr.validity()
398        }
399        self.chunks.iter().map(to_validity)
400    }
401
402    #[inline]
    /// Return whether any of the chunks in this [`ChunkedArray`] have null values.
404    pub fn has_nulls(&self) -> bool {
405        self.null_count > 0
406    }
407
408    /// Shrink the capacity of this array to fit its length.
409    pub fn shrink_to_fit(&mut self) {
410        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
411    }
412
413    pub fn clear(&self) -> Self {
414        // SAFETY: we keep the correct dtype
415        let mut ca = unsafe {
416            self.copy_with_chunks(vec![new_empty_array(
417                self.chunks.first().unwrap().dtype().clone(),
418            )])
419        };
420
421        use StatisticsFlags as F;
422        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
423        ca
424    }
425
426    /// Unpack a [`Series`] to the same physical type.
427    ///
428    /// # Safety
429    ///
430    /// This is unsafe as the dtype may be incorrect and
431    /// is assumed to be correct in other safe code.
432    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
433        &self,
434        series: &'a Series,
435    ) -> &'a ChunkedArray<T> {
436        let series_trait = &**series;
437        if self.dtype() == series.dtype() {
438            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
439        } else {
440            use DataType::*;
441            match (self.dtype(), series.dtype()) {
442                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
443                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
444                },
445                _ => panic!(
446                    "cannot unpack series {:?} into matching type {:?}",
447                    series,
448                    self.dtype()
449                ),
450            }
451        }
452    }
453
454    /// Returns an iterator over the lengths of the chunks of the array.
455    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
456        self.chunks.iter().map(|chunk| chunk.len())
457    }
458
459    /// A reference to the chunks
460    #[inline]
461    pub fn chunks(&self) -> &Vec<ArrayRef> {
462        &self.chunks
463    }
464
465    /// A mutable reference to the chunks
466    ///
467    /// # Safety
    /// The caller must ensure not to change the [`DataType`] or `length` of any of the chunks,
    /// and that the `null_count` remains correct.
470    #[inline]
471    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
472        &mut self.chunks
473    }
474
    /// Returns true if this [`ChunkedArray`] contains a single chunk and has no null values.
476    pub fn is_optimal_aligned(&self) -> bool {
477        self.chunks.len() == 1 && self.null_count() == 0
478    }
479
480    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
481    ///
482    /// # Safety
483    /// The caller must ensure the dtypes of the chunks are correct
484    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
485        Self::new_with_compute_len(self.field.clone(), chunks)
486    }
487
488    /// Get data type of [`ChunkedArray`].
489    pub fn dtype(&self) -> &DataType {
490        self.field.dtype()
491    }
492
493    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
494        self.field = Arc::new(Field::new(self.name().clone(), dtype))
495    }
496
497    /// Name of the [`ChunkedArray`].
498    pub fn name(&self) -> &PlSmallStr {
499        self.field.name()
500    }
501
502    /// Get a reference to the field.
503    pub fn ref_field(&self) -> &Field {
504        &self.field
505    }
506
507    /// Rename this [`ChunkedArray`].
508    pub fn rename(&mut self, name: PlSmallStr) {
509        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
510    }
511
512    /// Return this [`ChunkedArray`] with a new name.
513    pub fn with_name(mut self, name: PlSmallStr) -> Self {
514        self.rename(name);
515        self
516    }
517}
518
519impl<T> ChunkedArray<T>
520where
521    T: PolarsDataType,
522{
    /// Get a single value from this [`ChunkedArray`]. If the return value is `None` this
524    /// indicates a NULL value.
525    ///
526    /// # Panics
527    /// This function will panic if `idx` is out of bounds.
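    ///
    /// For example:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[Some(1), None]);
    /// assert_eq!(ca.get(0), Some(1));
    /// assert_eq!(ca.get(1), None); // a NULL value
    /// ```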
528    #[inline]
529    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
530        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
531        assert!(
532            chunk_idx < self.chunks().len(),
533            "index: {} out of bounds for len: {}",
534            idx,
535            self.len()
536        );
537        unsafe {
538            let arr = self.downcast_get_unchecked(chunk_idx);
539            assert!(
540                arr_idx < arr.len(),
541                "index: {} out of bounds for len: {}",
542                idx,
543                self.len()
544            );
545            arr.get_unchecked(arr_idx)
546        }
547    }
548
    /// Get a single value from this [`ChunkedArray`]. If the return value is `None` this
550    /// indicates a NULL value.
551    ///
552    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
554    #[inline]
555    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
556        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
557
558        unsafe {
559            // SAFETY: up to the caller to make sure the index is valid.
560            self.downcast_get_unchecked(chunk_idx)
561                .get_unchecked(arr_idx)
562        }
563    }
564
565    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
    /// value could be garbage if it was masked out by NULL. Note that the value is always initialized.
567    ///
568    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
570    #[inline]
571    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
572        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
573
574        unsafe {
575            // SAFETY: up to the caller to make sure the index is valid.
576            self.downcast_get_unchecked(chunk_idx)
577                .value_unchecked(arr_idx)
578        }
579    }
580
581    /// # Panics
582    /// Panics if the [`ChunkedArray`] is empty.
583    #[inline]
584    pub fn first(&self) -> Option<T::Physical<'_>> {
585        self.iter().next().unwrap()
586    }
587
588    /// # Panics
589    /// Panics if the [`ChunkedArray`] is empty.
590    #[inline]
591    pub fn last(&self) -> Option<T::Physical<'_>> {
592        let arr = self
593            .downcast_iter()
594            .rev()
595            .find(|arr| !arr.is_empty())
596            .unwrap();
597        unsafe { arr.get_unchecked(arr.len() - 1) }
598    }
599
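
    /// Replace the validity (null bitmap) of this [`ChunkedArray`], applying it chunk by chunk.
    ///
    /// # Panics
    /// Panics if `validity` does not have the same length as this [`ChunkedArray`].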
600    pub fn set_validity(&mut self, validity: &Bitmap) {
601        assert_eq!(self.len(), validity.len());
602        let mut i = 0;
603        for chunk in unsafe { self.chunks_mut() } {
604            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
605            i += chunk.len();
606        }
607        self.null_count = validity.unset_bits();
608        self.set_fast_explode_list(false);
609    }
610}
611
612impl<T> ChunkedArray<T>
613where
614    T: PolarsDataType,
615    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
616{
    /// Deposit the values of `self` into the positions where `validity` is set; positions where
    /// the mask is unset become null. `self` must contain no nulls and its length must equal the
    /// number of set bits in `validity` (a concrete example is sketched in the `test_deposit` unit
    /// test at the bottom of this module).
618    pub fn deposit(&self, validity: &Bitmap) -> Self {
619        let set_bits = validity.set_bits();
620
621        assert_eq!(self.null_count(), 0);
622        assert_eq!(self.len(), set_bits);
623
624        if set_bits == validity.len() {
625            return self.clone();
626        }
627
628        if set_bits == 0 {
629            return Self::full_null_like(self, validity.len());
630        }
631
632        let mut null_mask = validity.clone();
633
634        let mut gather_idxs = Vec::with_capacity(validity.len());
635        let leading_nulls = null_mask.take_leading_zeros();
636        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));
637
638        let mut i = 0 as IdxSize;
639        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
640            i += IdxSize::from(v);
641            i
642        }));
643
644        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
645        ca.set_validity(validity);
646        ca
647    }
648}
649
650impl ListChunked {
651    #[inline]
652    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
653        unsafe {
654            Some(Series::from_chunks_and_dtype_unchecked(
655                self.name().clone(),
656                vec![self.get(idx)?],
657                &self.inner_dtype().to_physical(),
658            ))
659        }
660    }
661
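    /// Returns whether this [`ListChunked`] contains any non-null list of length 0.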
662    pub fn has_empty_lists(&self) -> bool {
663        for arr in self.downcast_iter() {
664            if arr.is_empty() {
665                continue;
666            }
667
668            if match arr.validity() {
669                None => arr.offsets().lengths().any(|l| l == 0),
670                Some(validity) => arr
671                    .offsets()
672                    .lengths()
673                    .enumerate()
674                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
675            } {
676                return true;
677            }
678        }
679
680        false
681    }
682
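    /// Returns whether the backing values buffer contains values that are not part of any
    /// non-null list, e.g. values outside the offsets range or values behind a null entry.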
683    pub fn has_masked_out_values(&self) -> bool {
684        for arr in self.downcast_iter() {
685            if arr.is_empty() {
686                continue;
687            }
688
689            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
690                return true;
691            }
692
693            let Some(validity) = arr.validity() else {
694                continue;
695            };
696            if validity.set_bits() == 0 {
697                continue;
698            }
699
700            // @Performance: false_idx_iter
701            for i in (!validity).true_idx_iter() {
702                if arr.offsets().length_at(i) > 0 {
703                    return true;
704                }
705            }
706        }
707
708        false
709    }
710}
711
712#[cfg(feature = "dtype-array")]
713impl ArrayChunked {
714    #[inline]
715    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
716        unsafe {
717            Some(Series::from_chunks_and_dtype_unchecked(
718                self.name().clone(),
719                vec![self.get(idx)?],
720                &self.inner_dtype().to_physical(),
721            ))
722        }
723    }
724
725    pub fn from_aligned_values(
726        name: PlSmallStr,
727        inner_dtype: &DataType,
728        width: usize,
729        chunks: Vec<ArrayRef>,
730        length: usize,
731    ) -> Self {
732        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
733        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
734        let field = Arc::new(Field::new(name, dtype));
735        if width == 0 {
736            use arrow::array::builder::{ArrayBuilder, make_builder};
737            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
738            return ArrayChunked::new_with_compute_len(
739                field,
740                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
741            );
742        }
743        let mut total_len = 0;
744        let chunks = chunks
745            .into_iter()
746            .map(|chunk| {
747                debug_assert_eq!(chunk.len() % width, 0);
748                let chunk_len = chunk.len() / width;
749                total_len += chunk_len;
750                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
751            })
752            .collect();
753        debug_assert_eq!(total_len, length);
754
755        unsafe { Self::new_with_dims(field, chunks, length, 0) }
756    }
757
    /// Turn this [`ArrayChunked`] into a [`ListChunked`] with the same items.
    ///
    /// The values are always zero-copied into the [`ListChunked`].
761    pub fn to_list(&self) -> ListChunked {
762        let inner_dtype = self.inner_dtype();
763        let chunks = self
764            .downcast_iter()
765            .map(|chunk| {
766                use arrow::offset::OffsetsBuffer;
767
768                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
769                let dtype = inner_dtype.clone().to_large_list(true);
770
771                let offsets = (0..=chunk.len())
772                    .map(|i| (i * self.width()) as i64)
773                    .collect::<Vec<i64>>();
774
775                // SAFETY: We created our offsets in ascending manner.
776                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
777
778                ListArray::<i64>::new(
779                    dtype,
780                    offsets,
781                    chunk.values().clone(),
782                    chunk.validity().cloned(),
783                )
784                .into_boxed()
785            })
786            .collect();
787
788        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
789        let mut ca = unsafe {
790            ListChunked::new_with_dims(
791                Arc::new(Field::new(
792                    self.name().clone(),
793                    DataType::List(Box::new(inner_dtype.clone())),
794                )),
795                chunks,
796                self.len(),
797                self.null_count(),
798            )
799        };
800        ca.set_fast_explode_list(!self.has_nulls());
801        ca
802    }
803}
804
805impl<T> ChunkedArray<T>
806where
807    T: PolarsDataType,
808{
    /// Slice this [`ChunkedArray`] so that its chunk lengths match the given `chunk_id`
    /// (for example, the chunk lengths of another [`ChunkedArray`]).
    /// # Panics
    /// It is the caller's responsibility to ensure that this [`ChunkedArray`] has a single chunk.
812    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
813    where
814        I: Iterator<Item = usize>,
815    {
816        debug_assert!(self.chunks.len() == 1);
817        // Takes a ChunkedArray containing a single chunk.
818        let slice = |ca: &Self| {
819            let array = &ca.chunks[0];
820
821            let mut offset = 0;
822            let chunks = chunk_id
823                .map(|len| {
824                    // SAFETY: within bounds.
825                    debug_assert!((offset + len) <= array.len());
826                    let out = unsafe { array.sliced_unchecked(offset, len) };
827                    offset += len;
828                    out
829                })
830                .collect();
831
832            debug_assert_eq!(offset, array.len());
833
834            // SAFETY: We just slice the original chunks, their type will not change.
835            unsafe {
836                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
837            }
838        };
839
840        if self.chunks.len() != 1 {
841            let out = self.rechunk();
842            slice(&out)
843        } else {
844            slice(self)
845        }
846    }
847}
848
849impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
850    fn as_ref_dtype(&self) -> &DataType {
851        self.dtype()
852    }
853}
854
855pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a pointer to the start of the array.
857    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
858        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
859    }
860}
861
862impl<T> AsSinglePtr for ChunkedArray<T>
863where
864    T: PolarsNumericType,
865{
866    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
867        self.rechunk_mut();
868        let a = self.data_views().next().unwrap();
869        let ptr = a.as_ptr();
870        Ok(ptr as usize)
871    }
872}
873
874impl AsSinglePtr for BooleanChunked {}
875impl AsSinglePtr for ListChunked {}
876#[cfg(feature = "dtype-array")]
877impl AsSinglePtr for ArrayChunked {}
878impl AsSinglePtr for StringChunked {}
879impl AsSinglePtr for BinaryChunked {}
880#[cfg(feature = "object")]
881impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
882
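/// The chunk/null layout of a [`ChunkedArray`]: whether it consists of a single chunk and whether
/// it contains any null values. Returned by [`ChunkedArray::layout`].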
883pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
884    SingleNoNull(&'a T::Array),
885    Single(&'a T::Array),
886    MultiNoNull(&'a ChunkedArray<T>),
887    Multi(&'a ChunkedArray<T>),
888}
889
890impl<T> ChunkedArray<T>
891where
892    T: PolarsDataType,
893{
894    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
895        if self.chunks.len() == 1 {
896            let arr = self.downcast_iter().next().unwrap();
897            return if arr.null_count() == 0 {
898                ChunkedArrayLayout::SingleNoNull(arr)
899            } else {
900                ChunkedArrayLayout::Single(arr)
901            };
902        }
903
904        if self.downcast_iter().all(|a| a.null_count() == 0) {
905            ChunkedArrayLayout::MultiNoNull(self)
906        } else {
907            ChunkedArrayLayout::Multi(self)
908        }
909    }
910}
911
912impl<T> ChunkedArray<T>
913where
914    T: PolarsNumericType,
915{
916    /// Returns the values of the array as a contiguous slice.
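    ///
    /// Errors if the array consists of multiple chunks or contains null values.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// assert_eq!(ca.cont_slice().unwrap(), &[1, 2, 3]);
    /// ```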
917    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
918        polars_ensure!(
919            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
920            ComputeError: "chunked array is not contiguous"
921        );
922        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
923    }
924
925    /// Returns the values of the array as a contiguous mutable slice.
926    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
927        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY: we will not swap the PrimitiveArray.
929            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
930            arr.get_mut_values()
931        } else {
932            None
933        }
934    }
935
936    /// Get slices of the underlying arrow data.
937    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately.
939    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
940        self.downcast_iter().map(|arr| arr.values().as_slice())
941    }
942
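    /// Iterate over the physical values without any null checks.
    ///
    /// The validity is ignored, so this should only be used when the array contains no null
    /// values (or when arbitrary values for masked-out slots are acceptable).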
943    #[allow(clippy::wrong_self_convention)]
944    pub fn into_no_null_iter(
945        &self,
946    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
947    {
948        // .copied was significantly slower in benchmark, next call did not inline?
949        #[allow(clippy::map_clone)]
        // we know the iterator's len
951        unsafe {
952            self.data_views()
953                .flatten()
954                .map(|v| *v)
955                .trust_my_length(self.len())
956        }
957    }
958}
959
960impl<T: PolarsDataType> Clone for ChunkedArray<T> {
961    fn clone(&self) -> Self {
962        ChunkedArray {
963            field: self.field.clone(),
964            chunks: self.chunks.clone(),
965            flags: self.flags.clone(),
966
967            _pd: Default::default(),
968            length: self.length,
969            null_count: self.null_count,
970        }
971    }
972}
973
974impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
975    fn as_ref(&self) -> &ChunkedArray<T> {
976        self
977    }
978}
979
980impl ValueSize for ListChunked {
981    fn get_values_size(&self) -> usize {
982        self.chunks
983            .iter()
984            .fold(0usize, |acc, arr| acc + arr.get_values_size())
985    }
986}
987
988#[cfg(feature = "dtype-array")]
989impl ValueSize for ArrayChunked {
990    fn get_values_size(&self) -> usize {
991        self.chunks
992            .iter()
993            .fold(0usize, |acc, arr| acc + arr.get_values_size())
994    }
995}
996impl ValueSize for StringChunked {
997    fn get_values_size(&self) -> usize {
998        self.chunks
999            .iter()
1000            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1001    }
1002}
1003
1004impl ValueSize for BinaryOffsetChunked {
1005    fn get_values_size(&self) -> usize {
1006        self.chunks
1007            .iter()
1008            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1009    }
1010}
1011
1012pub(crate) fn to_primitive<T: PolarsNumericType>(
1013    values: Vec<T::Native>,
1014    validity: Option<Bitmap>,
1015) -> PrimitiveArray<T::Native> {
1016    PrimitiveArray::new(
1017        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1018        values.into(),
1019        validity,
1020    )
1021}
1022
1023pub(crate) fn to_array<T: PolarsNumericType>(
1024    values: Vec<T::Native>,
1025    validity: Option<Bitmap>,
1026) -> ArrayRef {
1027    Box::new(to_primitive::<T>(values, validity))
1028}
1029
1030impl<T: PolarsDataType> Default for ChunkedArray<T> {
1031    fn default() -> Self {
1032        let dtype = T::get_static_dtype();
1033        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1034        ChunkedArray {
1035            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1036            // Invariant: always has 1 chunk.
1037            chunks: vec![new_empty_array(arrow_dtype)],
1038            flags: StatisticsFlagsIM::empty(),
1039
1040            _pd: Default::default(),
1041            length: 0,
1042            null_count: 0,
1043        }
1044    }
1045}
1046
1047#[cfg(test)]
1048pub(crate) mod test {
1049    use crate::prelude::*;
1050
1051    pub(crate) fn get_chunked_array() -> Int32Chunked {
1052        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
1053    }
1054
1055    #[test]
1056    fn test_sort() {
1057        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
1058        let b = a
1059            .sort(false)
1060            .into_iter()
1061            .map(|opt| opt.unwrap())
1062            .collect::<Vec<_>>();
1063        assert_eq!(b, [1, 2, 3, 9]);
1064        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
1065        let a = a.sort(false);
1066        let b = a.into_iter().collect::<Vec<_>>();
1067        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
1068        assert!(a.is_sorted_ascending_flag());
1069    }
1070
1071    #[test]
1072    fn arithmetic() {
1073        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
1074        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);
1075
1076        // Not really asserting anything here but still making sure the code is exercised
1077        // This (and more) is properly tested from the integration test suite and Python bindings.
1078        println!("{:?}", a + b);
1079        println!("{:?}", a - b);
1080        println!("{:?}", a * b);
1081        println!("{:?}", a / b);
1082    }
1083
1084    #[test]
1085    fn iter() {
1086        let s1 = get_chunked_array();
1087        // sum
1088        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
1089    }
1090
1091    #[test]
1092    fn limit() {
1093        let a = get_chunked_array();
1094        let b = a.limit(2);
1095        println!("{b:?}");
1096        assert_eq!(b.len(), 2)
1097    }
1098
1099    #[test]
1100    fn filter() {
1101        let a = get_chunked_array();
1102        let b = a
1103            .filter(&BooleanChunked::new(
1104                PlSmallStr::from_static("filter"),
1105                &[true, false, false],
1106            ))
1107            .unwrap();
1108        assert_eq!(b.len(), 1);
1109        assert_eq!(b.into_iter().next(), Some(Some(1)));
1110    }
1111
1112    #[test]
1113    fn aggregates() {
1114        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
1115        assert_eq!(a.max(), Some(100));
1116        assert_eq!(a.min(), Some(1));
1117        assert_eq!(a.sum(), Some(120))
1118    }
1119
1120    #[test]
1121    fn take() {
1122        let a = get_chunked_array();
1123        let new = a.take(&[0 as IdxSize, 1]).unwrap();
1124        assert_eq!(new.len(), 2)
1125    }
1126
1127    #[test]
1128    fn cast() {
1129        let a = get_chunked_array();
1130        let b = a.cast(&DataType::Int64).unwrap();
1131        assert_eq!(b.dtype(), &DataType::Int64)
1132    }
1133
1134    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
1135    where
1136        T: PolarsNumericType,
1137    {
1138        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
1139    }
1140
1141    #[test]
1142    fn slice() {
1143        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
1144        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
1145        first.append(&second).unwrap();
1146        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
1147        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
1148        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
1149        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
1150        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
1151        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
1152        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);
1153
1154        assert_eq!(first.slice(-7, 2).len(), 1);
1155        assert_eq!(first.slice(-3, 4).len(), 3);
1156        assert_eq!(first.slice(3, 4).len(), 3);
1157        assert_eq!(first.slice(10, 4).len(), 0);
1158    }
1159
1160    #[test]
1161    fn sorting() {
1162        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
1163        let sorted = s.sort(false);
1164        assert_slice_equal(&sorted, &[2, 4, 9]);
1165        let sorted = s.sort(true);
1166        assert_slice_equal(&sorted, &[9, 4, 2]);
1167
1168        let s: StringChunked = ["b", "a", "z"].iter().collect();
1169        let sorted = s.sort(false);
1170        assert_eq!(
1171            sorted.into_iter().collect::<Vec<_>>(),
1172            &[Some("a"), Some("b"), Some("z")]
1173        );
1174        let sorted = s.sort(true);
1175        assert_eq!(
1176            sorted.into_iter().collect::<Vec<_>>(),
1177            &[Some("z"), Some("b"), Some("a")]
1178        );
1179        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
1180        let sorted = s.sort(false);
1181        assert_eq!(
1182            sorted.into_iter().collect::<Vec<_>>(),
1183            &[None, Some("b"), Some("z")]
1184        );
1185    }
1186
1187    #[test]
1188    fn reverse() {
1189        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with a contiguous slice
1191        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
1192        // path with options
1193        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
1194        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
1195        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
1196        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);
1197
1198        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
1199        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);
1200
1201        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
1202        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
1203    }
1204
1205    #[test]
1206    #[cfg(feature = "dtype-categorical")]
1207    fn test_iter_categorical() {
1208        let ca = StringChunked::new(
1209            PlSmallStr::EMPTY,
1210            &[Some("foo"), None, Some("bar"), Some("ham")],
1211        );
1212        let cats = Categories::new(
1213            PlSmallStr::EMPTY,
1214            PlSmallStr::EMPTY,
1215            CategoricalPhysical::U32,
1216        );
1217        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
1218        let ca = ca.cat32().unwrap();
1219        let v: Vec<_> = ca.physical().into_iter().collect();
1220        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
1221    }
1222
1223    #[test]
1224    #[ignore]
1225    fn test_shrink_to_fit() {
1226        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
1227        builder.append_value("foo");
1228        let mut arr = builder.finish();
1229        let before = arr
1230            .chunks()
1231            .iter()
1232            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1233            .sum::<usize>();
1234        arr.shrink_to_fit();
1235        let after = arr
1236            .chunks()
1237            .iter()
1238            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1239            .sum::<usize>();
1240        assert!(before > after);
1241    }
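
    #[test]
    fn test_deposit() {
        use arrow::bitmap::Bitmap;

        // A small sketch of `ChunkedArray::deposit`: the values of a null-free array are spread
        // over the set bits of the validity mask, and the unset bits become null.
        let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
        let validity = Bitmap::from_iter([true, false, true, false, true]);
        let out = ca.deposit(&validity);
        assert_eq!(out.len(), validity.len());
        assert_eq!(Vec::from(&out), &[Some(1), None, Some(2), None, Some(3)]);
    }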
1242}