Skip to main content

polars_core/chunked_array/
mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use arrow::compute::utils::combine_validities_and;
9use polars_compute::filter::filter_with_bitmap;
10
11use crate::prelude::{ChunkTakeUnchecked, *};
12
13pub mod ops;
14#[macro_use]
15pub mod arithmetic;
16pub mod builder;
17pub mod cast;
18pub mod collect;
19pub mod comparison;
20pub mod flags;
21pub mod float;
22pub mod iterator;
23#[cfg(feature = "ndarray")]
24pub(crate) mod ndarray;
25
26pub mod arg_min_max;
27#[cfg(feature = "dtype-array")]
28pub(crate) mod array;
29mod binary;
30mod binary_offset;
31mod bitwise;
32#[cfg(feature = "object")]
33mod drop;
34mod from;
35mod from_iterator;
36pub mod from_iterator_par;
37pub(crate) mod list;
38pub(crate) mod logical;
39#[cfg(feature = "object")]
40pub mod object;
41#[cfg(feature = "random")]
42mod random;
43#[cfg(feature = "dtype-struct")]
44mod struct_;
45#[cfg(any(
46    feature = "temporal",
47    feature = "dtype-datetime",
48    feature = "dtype-date"
49))]
50pub mod temporal;
51mod to_vec;
52mod trusted_len;
53pub(crate) use arg_min_max::*;
54#[cfg(feature = "dtype-struct")]
55pub use struct_::StructChunked;
56
57use self::flags::{StatisticsFlags, StatisticsFlagsIM};
58use crate::series::IsSorted;
59use crate::utils::{first_non_null, first_null, last_non_null};
60
61pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
62
63/// # ChunkedArray
64///
65/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
66/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
67/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
68///
69/// ```rust
70/// # use polars_core::prelude::*;
71/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
72///     ca.apply_values(|v| v.cos())
73/// }
74/// ```
75///
76/// ## Conversion between Series and ChunkedArrays
77/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
78///
79/// ```rust
80/// # use polars_core::prelude::*;
81/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
82///     series.i32()
83/// }
84///
85/// fn to_series(ca: Int32Chunked) -> Series {
86///     ca.into_series()
87/// }
88/// ```
89///
90/// # Iterators
91///
92/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
93/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
94/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
95///
96/// ```rust
97/// # use polars_core::prelude::*;
98///
99/// fn iter_forward(ca: &Float32Chunked) {
100///     ca.iter()
101///         .for_each(|opt_v| println!("{:?}", opt_v))
102/// }
103///
104/// fn iter_backward(ca: &Float32Chunked) {
105///     ca.iter()
106///         .rev()
107///         .for_each(|opt_v| println!("{:?}", opt_v))
108/// }
109/// ```
110///
111/// # Memory layout
112///
113/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
114/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
115///
116/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
117/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
118///
119/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
120/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
121/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
122///
123/// If you want to have predictable performance
124/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
125/// multiple append operations.
126///
127/// See also [`ChunkedArray::extend`] for appends within a chunk.
128///
129/// # Invariants
130/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
131/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
132///   chunks.
133/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
134///   logical type given by the datatype.
135///
136/// [`List`]: crate::datatypes::DataType::List
137pub struct ChunkedArray<T: PolarsDataType> {
138    pub(crate) field: Arc<Field>,
139    pub(crate) chunks: Vec<ArrayRef>,
140
141    pub(crate) flags: StatisticsFlagsIM,
142
143    length: usize,
144    null_count: usize,
145    _pd: std::marker::PhantomData<T>,
146}
147
148impl<T: PolarsDataType> ChunkedArray<T> {
149    fn should_rechunk(&self) -> bool {
150        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
151    }
152
153    fn optional_rechunk(mut self) -> Self {
154        // Rechunk if we have many small chunks.
155        if self.should_rechunk() {
156            self.rechunk_mut()
157        }
158        self
159    }
160
161    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
162        self
163    }
164
165    /// Series to [`ChunkedArray<T>`]
166    pub fn unpack_series_matching_type<'a>(
167        &self,
168        series: &'a Series,
169    ) -> PolarsResult<&'a ChunkedArray<T>> {
170        polars_ensure!(
171            self.dtype() == series.dtype(),
172            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
173            series.dtype(),
174            self.dtype(),
175        );
176
177        // SAFETY: dtype will be correct.
178        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
179    }
180
181    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
182    ///
183    /// If you want to explicitly the `length` and `null_count`, look at
184    /// [`ChunkedArray::new_with_dims`]
185    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
186        unsafe {
187            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
188            chunked_arr.compute_len();
189            chunked_arr
190        }
191    }
192
193    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
194    /// # Safety
195    /// The length and null_count must be correct.
196    pub unsafe fn new_with_dims(
197        field: Arc<Field>,
198        chunks: Vec<ArrayRef>,
199        length: usize,
200        null_count: usize,
201    ) -> Self {
202        Self {
203            field,
204            chunks,
205            flags: StatisticsFlagsIM::empty(),
206
207            _pd: Default::default(),
208            length,
209            null_count,
210        }
211    }
212
213    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
214        self.get_flags().is_sorted_ascending()
215    }
216
217    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
218        self.get_flags().is_sorted_descending()
219    }
220
221    /// Whether `self` is sorted in any direction.
222    pub(crate) fn is_sorted_any(&self) -> bool {
223        self.get_flags().is_sorted_any()
224    }
225
226    pub fn unset_fast_explode_list(&mut self) {
227        self.set_fast_explode_list(false)
228    }
229
230    pub fn set_fast_explode_list(&mut self, value: bool) {
231        let mut flags = self.flags.get_mut();
232        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
233        self.flags.set_mut(flags);
234    }
235
236    pub fn get_fast_explode_list(&self) -> bool {
237        self.get_flags().can_fast_explode_list()
238    }
239
240    pub fn get_flags(&self) -> StatisticsFlags {
241        self.flags.get()
242    }
243
244    /// Set flags for the [`ChunkedArray`]
245    pub fn set_flags(&mut self, flags: StatisticsFlags) {
246        self.flags = StatisticsFlagsIM::new(flags);
247    }
248
249    pub fn is_sorted_flag(&self) -> IsSorted {
250        self.get_flags().is_sorted()
251    }
252
253    pub fn retain_flags_from<U: PolarsDataType>(
254        &mut self,
255        from: &ChunkedArray<U>,
256        retain_flags: StatisticsFlags,
257    ) {
258        let flags = from.flags.get();
259        // Try to avoid write contention.
260        if !flags.is_empty() {
261            self.set_flags(flags & retain_flags)
262        }
263    }
264
265    /// Set the 'sorted' bit meta info.
266    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
267        let mut flags = self.flags.get_mut();
268        flags.set_sorted(sorted);
269        self.flags.set_mut(flags);
270    }
271
272    /// Set the 'sorted' bit meta info.
273    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
274        let mut out = self.clone();
275        out.set_sorted_flag(sorted);
276        out
277    }
278
279    pub fn first_null(&self) -> Option<usize> {
280        if self.null_count() == 0 {
281            None
282        }
283        // We now know there is at least 1 non-null item in the array, and self.len() > 0
284        else if self.null_count() == self.len() {
285            Some(0)
286        } else if self.is_sorted_any() {
287            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
288                // nulls are all at the start
289                0
290            } else {
291                // nulls are all at the end
292                self.null_count()
293            };
294
295            debug_assert!(
296                // If we are lucky this catches something.
297                unsafe { self.get_unchecked(out) }.is_some(),
298                "incorrect sorted flag"
299            );
300
301            Some(out)
302        } else {
303            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
304        }
305    }
306
307    /// Get the index of the first non null value in this [`ChunkedArray`].
308    pub fn first_non_null(&self) -> Option<usize> {
309        if self.null_count() == self.len() {
310            None
311        }
312        // We now know there is at least 1 non-null item in the array, and self.len() > 0
313        else if self.null_count() == 0 {
314            Some(0)
315        } else if self.is_sorted_any() {
316            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
317                // nulls are all at the start
318                self.null_count()
319            } else {
320                // nulls are all at the end
321                0
322            };
323
324            debug_assert!(
325                // If we are lucky this catches something.
326                unsafe { self.get_unchecked(out) }.is_some(),
327                "incorrect sorted flag"
328            );
329
330            Some(out)
331        } else {
332            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
333        }
334    }
335
336    /// Get the index of the last non null value in this [`ChunkedArray`].
337    pub fn last_non_null(&self) -> Option<usize> {
338        if self.null_count() == self.len() {
339            None
340        }
341        // We now know there is at least 1 non-null item in the array, and self.len() > 0
342        else if self.null_count() == 0 {
343            Some(self.len() - 1)
344        } else if self.is_sorted_any() {
345            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
346                // nulls are all at the start
347                self.len() - 1
348            } else {
349                // nulls are all at the end
350                self.len() - self.null_count() - 1
351            };
352
353            debug_assert!(
354                // If we are lucky this catches something.
355                unsafe { self.get_unchecked(out) }.is_some(),
356                "incorrect sorted flag"
357            );
358
359            Some(out)
360        } else {
361            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
362        }
363    }
364
365    pub fn drop_nulls(&self) -> Self {
366        if self.null_count() == 0 {
367            self.clone()
368        } else {
369            let chunks = self
370                .downcast_iter()
371                .map(|arr| {
372                    if arr.null_count() == 0 {
373                        arr.to_boxed()
374                    } else {
375                        filter_with_bitmap(arr, arr.validity().unwrap())
376                    }
377                })
378                .collect();
379            unsafe {
380                Self::new_with_dims(
381                    self.field.clone(),
382                    chunks,
383                    self.len() - self.null_count(),
384                    0,
385                )
386            }
387        }
388    }
389
390    /// Get the buffer of bits representing null values
391    #[inline]
392    #[allow(clippy::type_complexity)]
393    pub fn iter_validities(
394        &self,
395    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
396        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
397            arr.validity()
398        }
399        self.chunks.iter().map(to_validity)
400    }
401
402    #[inline]
403    /// Return if any the chunks in this [`ChunkedArray`] have nulls.
404    pub fn has_nulls(&self) -> bool {
405        self.null_count > 0
406    }
407
408    /// Shrink the capacity of this array to fit its length.
409    pub fn shrink_to_fit(&mut self) {
410        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
411    }
412
413    pub fn clear(&self) -> Self {
414        // SAFETY: we keep the correct dtype
415        let mut ca = unsafe {
416            self.copy_with_chunks(vec![new_empty_array(
417                self.chunks.first().unwrap().dtype().clone(),
418            )])
419        };
420
421        use StatisticsFlags as F;
422        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
423        ca
424    }
425
426    /// Unpack a [`Series`] to the same physical type.
427    ///
428    /// # Safety
429    ///
430    /// This is unsafe as the dtype may be incorrect and
431    /// is assumed to be correct in other safe code.
432    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
433        &self,
434        series: &'a Series,
435    ) -> &'a ChunkedArray<T> {
436        let series_trait = &**series;
437        if self.dtype() == series.dtype() {
438            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
439        } else {
440            use DataType::*;
441            match (self.dtype(), series.dtype()) {
442                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
443                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
444                },
445                _ => panic!(
446                    "cannot unpack series {:?} into matching type {:?}",
447                    series,
448                    self.dtype()
449                ),
450            }
451        }
452    }
453
454    /// Returns an iterator over the lengths of the chunks of the array.
455    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
456        self.chunks.iter().map(|chunk| chunk.len())
457    }
458
459    /// A reference to the chunks
460    #[inline]
461    pub fn chunks(&self) -> &Vec<ArrayRef> {
462        &self.chunks
463    }
464
465    /// A mutable reference to the chunks
466    ///
467    /// # Safety
468    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
469    /// And the `null_count` remains correct.
470    #[inline]
471    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
472        &mut self.chunks
473    }
474
475    /// Returns true if contains a single chunk and has no null values
476    pub fn is_optimal_aligned(&self) -> bool {
477        self.chunks.len() == 1 && self.null_count() == 0
478    }
479
480    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
481    ///
482    /// # Safety
483    /// The caller must ensure the dtypes of the chunks are correct
484    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
485        Self::new_with_compute_len(self.field.clone(), chunks)
486    }
487
488    /// Get data type of [`ChunkedArray`].
489    pub fn dtype(&self) -> &DataType {
490        self.field.dtype()
491    }
492
493    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
494        self.field = Arc::new(Field::new(self.name().clone(), dtype))
495    }
496
497    /// Name of the [`ChunkedArray`].
498    pub fn name(&self) -> &PlSmallStr {
499        self.field.name()
500    }
501
502    /// Get a reference to the field.
503    pub fn ref_field(&self) -> &Field {
504        &self.field
505    }
506
507    /// Rename this [`ChunkedArray`].
508    pub fn rename(&mut self, name: PlSmallStr) {
509        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
510    }
511
512    /// Return this [`ChunkedArray`] with a new name.
513    pub fn with_name(mut self, name: PlSmallStr) -> Self {
514        self.rename(name);
515        self
516    }
517}
518
519impl<T> ChunkedArray<T>
520where
521    T: PolarsDataType,
522{
523    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
524    /// indicates a NULL value.
525    ///
526    /// # Panics
527    /// This function will panic if `idx` is out of bounds.
528    #[inline]
529    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
530        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
531        assert!(
532            chunk_idx < self.chunks().len(),
533            "index: {} out of bounds for len: {}",
534            idx,
535            self.len()
536        );
537        unsafe {
538            let arr = self.downcast_get_unchecked(chunk_idx);
539            assert!(
540                arr_idx < arr.len(),
541                "index: {} out of bounds for len: {}",
542                idx,
543                self.len()
544            );
545            arr.get_unchecked(arr_idx)
546        }
547    }
548
549    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
550    /// indicates a NULL value.
551    ///
552    /// # Safety
553    /// It is the callers responsibility that the `idx < self.len()`.
554    #[inline]
555    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
556        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
557
558        unsafe {
559            // SAFETY: up to the caller to make sure the index is valid.
560            self.downcast_get_unchecked(chunk_idx)
561                .get_unchecked(arr_idx)
562        }
563    }
564
565    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
566    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
567    ///
568    /// # Safety
569    /// It is the callers responsibility that the `idx < self.len()`.
570    #[inline]
571    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
572        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
573
574        unsafe {
575            // SAFETY: up to the caller to make sure the index is valid.
576            self.downcast_get_unchecked(chunk_idx)
577                .value_unchecked(arr_idx)
578        }
579    }
580
581    /// # Panics
582    /// Panics if the [`ChunkedArray`] is empty.
583    #[inline]
584    pub fn first(&self) -> Option<T::Physical<'_>> {
585        self.iter().next().unwrap()
586    }
587
588    /// # Panics
589    /// Panics if the [`ChunkedArray`] is empty.
590    #[inline]
591    pub fn last(&self) -> Option<T::Physical<'_>> {
592        let arr = self
593            .downcast_iter()
594            .rev()
595            .find(|arr| !arr.is_empty())
596            .unwrap();
597        unsafe { arr.get_unchecked(arr.len() - 1) }
598    }
599
600    pub fn set_validity(&mut self, validity: Option<Bitmap>) {
601        assert!(
602            !self.dtype().is_struct(),
603            "set_outer_validity should be used for struct types"
604        );
605        if let Some(v) = &validity {
606            assert_eq!(self.len(), v.len());
607        }
608        let mut i = 0;
609        for chunk in unsafe { self.chunks_mut() } {
610            *chunk =
611                chunk.with_validity(validity.as_ref().map(|v| v.clone().sliced(i, chunk.len())));
612            i += chunk.len();
613        }
614        self.null_count = validity.map(|v| v.unset_bits()).unwrap_or(0);
615        self.set_fast_explode_list(false);
616    }
617
618    pub fn with_validity(mut self, validity: Option<Bitmap>) -> Self {
619        self.set_validity(validity);
620        self
621    }
622}
623
624impl<T> ChunkedArray<T>
625where
626    T: PolarsDataType,
627    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
628{
629    /// Deposit values into nulls with a certain validity mask.
630    pub fn deposit(&self, validity: &Bitmap) -> Self {
631        let set_bits = validity.set_bits();
632
633        assert_eq!(self.len(), set_bits);
634
635        if set_bits == validity.len() {
636            return self.clone();
637        }
638
639        if set_bits == 0 {
640            return Self::full_null_like(self, validity.len());
641        }
642
643        let mut null_mask = validity.clone();
644
645        let mut gather_idxs = Vec::with_capacity(validity.len());
646        let leading_nulls = null_mask.take_leading_zeros();
647        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));
648
649        let mut i = 0 as IdxSize;
650        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
651            i += IdxSize::from(v);
652            i
653        }));
654
655        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
656        ca.set_validity(combine_validities_and(
657            Some(validity),
658            ca.rechunk_validity().as_ref(),
659        ));
660        ca
661    }
662}
663
664impl ListChunked {
665    #[inline]
666    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
667        unsafe {
668            Some(Series::from_chunks_and_dtype_unchecked(
669                self.name().clone(),
670                vec![self.get(idx)?],
671                &self.inner_dtype().to_physical(),
672            ))
673        }
674    }
675
676    pub fn has_empty_lists(&self) -> bool {
677        for arr in self.downcast_iter() {
678            if arr.is_empty() {
679                continue;
680            }
681
682            if match arr.validity() {
683                None => arr.offsets().lengths().any(|l| l == 0),
684                Some(validity) => arr
685                    .offsets()
686                    .lengths()
687                    .enumerate()
688                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
689            } {
690                return true;
691            }
692        }
693
694        false
695    }
696
697    pub fn has_masked_out_values(&self) -> bool {
698        for arr in self.downcast_iter() {
699            if arr.is_empty() {
700                continue;
701            }
702
703            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
704                return true;
705            }
706
707            let Some(validity) = arr.validity() else {
708                continue;
709            };
710            if validity.set_bits() == 0 {
711                continue;
712            }
713
714            // @Performance: false_idx_iter
715            for i in (!validity).true_idx_iter() {
716                if arr.offsets().length_at(i) > 0 {
717                    return true;
718                }
719            }
720        }
721
722        false
723    }
724}
725
726#[cfg(feature = "dtype-array")]
727impl ArrayChunked {
728    #[inline]
729    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
730        unsafe {
731            Some(Series::from_chunks_and_dtype_unchecked(
732                self.name().clone(),
733                vec![self.get(idx)?],
734                &self.inner_dtype().to_physical(),
735            ))
736        }
737    }
738
739    pub fn from_aligned_values(
740        name: PlSmallStr,
741        inner_dtype: &DataType,
742        width: usize,
743        chunks: Vec<ArrayRef>,
744        length: usize,
745    ) -> Self {
746        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
747        let arrow_dtype = dtype.to_arrow(CompatLevel::newest()).to_storage_recursive();
748        let field = Arc::new(Field::new(name, dtype));
749        if width == 0 {
750            use arrow::array::builder::{ArrayBuilder, make_builder};
751            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
752            return ArrayChunked::new_with_compute_len(
753                field,
754                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
755            );
756        }
757        let mut total_len = 0;
758        let chunks = chunks
759            .iter()
760            .map(|chunk| {
761                debug_assert_eq!(chunk.len() % width, 0);
762                let chunk_len = chunk.len() / width;
763                total_len += chunk_len;
764                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk.clone(), None)
765                    .into_boxed()
766            })
767            .collect();
768        debug_assert_eq!(total_len, length);
769
770        unsafe { Self::new_with_dims(field, chunks, length, 0) }
771    }
772
773    /// Turn the ArrayChunked into the ListChunked with the same items.
774    ///
775    /// This will always zero copy the values into the ListChunked.
776    pub fn to_list(&self) -> ListChunked {
777        let inner_dtype = self.inner_dtype();
778        let chunks = self
779            .downcast_iter()
780            .map(|chunk| {
781                use arrow::offset::OffsetsBuffer;
782
783                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
784                let dtype = inner_dtype.clone().to_large_list(true);
785
786                let offsets = (0..=chunk.len())
787                    .map(|i| (i * self.width()) as i64)
788                    .collect::<Vec<i64>>();
789
790                // SAFETY: We created our offsets in ascending manner.
791                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
792
793                ListArray::<i64>::new(
794                    dtype,
795                    offsets,
796                    chunk.values().clone(),
797                    chunk.validity().cloned(),
798                )
799                .into_boxed()
800            })
801            .collect();
802
803        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
804        let mut ca = unsafe {
805            ListChunked::new_with_dims(
806                Arc::new(Field::new(
807                    self.name().clone(),
808                    DataType::List(Box::new(inner_dtype.clone())),
809                )),
810                chunks,
811                self.len(),
812                self.null_count(),
813            )
814        };
815        ca.set_fast_explode_list(!self.has_nulls());
816        ca
817    }
818}
819
820impl<T> ChunkedArray<T>
821where
822    T: PolarsDataType,
823{
824    /// Should be used to match the chunk_id of another [`ChunkedArray`].
825    /// # Panics
826    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
827    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
828    where
829        I: Iterator<Item = usize>,
830    {
831        debug_assert!(self.chunks.len() == 1);
832        // Takes a ChunkedArray containing a single chunk.
833        let slice = |ca: &Self| {
834            let array = &ca.chunks[0];
835
836            let mut offset = 0;
837            let chunks = chunk_id
838                .map(|len| {
839                    // SAFETY: within bounds.
840                    debug_assert!((offset + len) <= array.len());
841                    let out = unsafe { array.sliced_unchecked(offset, len) };
842                    offset += len;
843                    out
844                })
845                .collect();
846
847            debug_assert_eq!(offset, array.len());
848
849            // SAFETY: We just slice the original chunks, their type will not change.
850            unsafe {
851                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
852            }
853        };
854
855        if self.chunks.len() != 1 {
856            let out = self.rechunk();
857            slice(&out)
858        } else {
859            slice(self)
860        }
861    }
862}
863
864impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
865    fn as_ref_dtype(&self) -> &DataType {
866        self.dtype()
867    }
868}
869
870pub(crate) trait AsSinglePtr: AsRefDataType {
871    /// Rechunk and return a ptr to the start of the array
872    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
873        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
874    }
875}
876
877impl<T> AsSinglePtr for ChunkedArray<T>
878where
879    T: PolarsNumericType,
880{
881    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
882        self.rechunk_mut();
883        let a = self.data_views().next().unwrap();
884        let ptr = a.as_ptr();
885        Ok(ptr as usize)
886    }
887}
888
889impl AsSinglePtr for BooleanChunked {}
890impl AsSinglePtr for ListChunked {}
891#[cfg(feature = "dtype-array")]
892impl AsSinglePtr for ArrayChunked {}
893impl AsSinglePtr for StringChunked {}
894impl AsSinglePtr for BinaryChunked {}
895#[cfg(feature = "object")]
896impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
897
898pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
899    SingleNoNull(&'a T::Array),
900    Single(&'a T::Array),
901    MultiNoNull(&'a ChunkedArray<T>),
902    Multi(&'a ChunkedArray<T>),
903}
904
905impl<T> ChunkedArray<T>
906where
907    T: PolarsDataType,
908{
909    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
910        if self.chunks.len() == 1 {
911            let arr = self.downcast_iter().next().unwrap();
912            return if arr.null_count() == 0 {
913                ChunkedArrayLayout::SingleNoNull(arr)
914            } else {
915                ChunkedArrayLayout::Single(arr)
916            };
917        }
918
919        if self.downcast_iter().all(|a| a.null_count() == 0) {
920            ChunkedArrayLayout::MultiNoNull(self)
921        } else {
922            ChunkedArrayLayout::Multi(self)
923        }
924    }
925}
926
927impl<T> ChunkedArray<T>
928where
929    T: PolarsNumericType,
930{
931    /// Returns the values of the array as a contiguous slice.
932    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
933        polars_ensure!(
934            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
935            ComputeError: "chunked array is not contiguous"
936        );
937        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
938    }
939
940    /// Returns the values of the array as a contiguous mutable slice.
941    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
942        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
943            // SAFETY, we will not swap the PrimitiveArray.
944            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
945            arr.get_mut_values()
946        } else {
947            None
948        }
949    }
950
951    /// Get slices of the underlying arrow data.
952    /// NOTE: null values should be taken into account by the user of these slices as they are handled
953    /// separately
954    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
955        self.downcast_iter().map(|arr| arr.values().as_slice())
956    }
957
958    #[allow(clippy::wrong_self_convention)]
959    pub fn into_no_null_iter(
960        &self,
961    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
962    {
963        // .copied was significantly slower in benchmark, next call did not inline?
964        #[allow(clippy::map_clone)]
965        // we know the iterators len
966        unsafe {
967            self.data_views()
968                .flatten()
969                .map(|v| *v)
970                .trust_my_length(self.len())
971        }
972    }
973}
974
975impl<T: PolarsDataType> Clone for ChunkedArray<T> {
976    fn clone(&self) -> Self {
977        ChunkedArray {
978            field: self.field.clone(),
979            chunks: self.chunks.clone(),
980            flags: self.flags.clone(),
981
982            _pd: Default::default(),
983            length: self.length,
984            null_count: self.null_count,
985        }
986    }
987}
988
989impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
990    fn as_ref(&self) -> &ChunkedArray<T> {
991        self
992    }
993}
994
995impl ValueSize for ListChunked {
996    fn get_values_size(&self) -> usize {
997        self.chunks
998            .iter()
999            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1000    }
1001}
1002
1003#[cfg(feature = "dtype-array")]
1004impl ValueSize for ArrayChunked {
1005    fn get_values_size(&self) -> usize {
1006        self.chunks
1007            .iter()
1008            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1009    }
1010}
1011impl ValueSize for StringChunked {
1012    fn get_values_size(&self) -> usize {
1013        self.chunks
1014            .iter()
1015            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1016    }
1017}
1018
1019impl ValueSize for BinaryOffsetChunked {
1020    fn get_values_size(&self) -> usize {
1021        self.chunks
1022            .iter()
1023            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1024    }
1025}
1026
1027pub(crate) fn to_primitive<T: PolarsNumericType>(
1028    values: Vec<T::Native>,
1029    validity: Option<Bitmap>,
1030) -> PrimitiveArray<T::Native> {
1031    PrimitiveArray::new(
1032        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1033        values.into(),
1034        validity,
1035    )
1036}
1037
1038pub(crate) fn to_array<T: PolarsNumericType>(
1039    values: Vec<T::Native>,
1040    validity: Option<Bitmap>,
1041) -> ArrayRef {
1042    Box::new(to_primitive::<T>(values, validity))
1043}
1044
1045impl<T: PolarsDataType> Default for ChunkedArray<T> {
1046    fn default() -> Self {
1047        let dtype = T::get_static_dtype();
1048        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1049        ChunkedArray {
1050            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1051            // Invariant: always has 1 chunk.
1052            chunks: vec![new_empty_array(arrow_dtype)],
1053            flags: StatisticsFlagsIM::empty(),
1054
1055            _pd: Default::default(),
1056            length: 0,
1057            null_count: 0,
1058        }
1059    }
1060}
1061
1062#[cfg(test)]
1063pub(crate) mod test {
1064    use crate::prelude::*;
1065
1066    pub(crate) fn get_chunked_array() -> Int32Chunked {
1067        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
1068    }
1069
1070    #[test]
1071    fn test_sort() {
1072        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
1073        let b = a
1074            .sort(false)
1075            .iter()
1076            .map(|opt| opt.unwrap())
1077            .collect::<Vec<_>>();
1078        assert_eq!(b, [1, 2, 3, 9]);
1079        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
1080        let a = a.sort(false);
1081        let b = a.iter().collect::<Vec<_>>();
1082        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
1083        assert!(a.is_sorted_ascending_flag());
1084    }
1085
1086    #[test]
1087    fn arithmetic() {
1088        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
1089        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);
1090
1091        // Not really asserting anything here but still making sure the code is exercised
1092        // This (and more) is properly tested from the integration test suite and Python bindings.
1093        println!("{:?}", a + b);
1094        println!("{:?}", a - b);
1095        println!("{:?}", a * b);
1096        println!("{:?}", a / b);
1097    }
1098
1099    #[test]
1100    fn iter() {
1101        let s1 = get_chunked_array();
1102        // sum
1103        assert_eq!(s1.iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
1104    }
1105
1106    #[test]
1107    fn limit() {
1108        let a = get_chunked_array();
1109        let b = a.limit(2);
1110        println!("{b:?}");
1111        assert_eq!(b.len(), 2)
1112    }
1113
1114    #[test]
1115    fn filter() {
1116        let a = get_chunked_array();
1117        let b = a
1118            .filter(&BooleanChunked::new(
1119                PlSmallStr::from_static("filter"),
1120                &[true, false, false],
1121            ))
1122            .unwrap();
1123        assert_eq!(b.len(), 1);
1124        assert_eq!(b.iter().next(), Some(Some(1)));
1125    }
1126
1127    #[test]
1128    fn aggregates() {
1129        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
1130        assert_eq!(a.max(), Some(100));
1131        assert_eq!(a.min(), Some(1));
1132        assert_eq!(a.sum(), Some(120))
1133    }
1134
1135    #[test]
1136    fn take() {
1137        let a = get_chunked_array();
1138        let new = a.take(&[0 as IdxSize, 1]).unwrap();
1139        assert_eq!(new.len(), 2)
1140    }
1141
1142    #[test]
1143    fn cast() {
1144        let a = get_chunked_array();
1145        let b = a.cast(&DataType::Int64).unwrap();
1146        assert_eq!(b.dtype(), &DataType::Int64)
1147    }
1148
1149    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
1150    where
1151        T: PolarsNumericType,
1152    {
1153        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
1154    }
1155
1156    #[test]
1157    fn slice() {
1158        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
1159        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
1160        first.append(&second).unwrap();
1161        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
1162        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
1163        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
1164        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
1165        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
1166        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
1167        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);
1168
1169        assert_eq!(first.slice(-7, 2).len(), 1);
1170        assert_eq!(first.slice(-3, 4).len(), 3);
1171        assert_eq!(first.slice(3, 4).len(), 3);
1172        assert_eq!(first.slice(10, 4).len(), 0);
1173    }
1174
1175    #[test]
1176    fn sorting() {
1177        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
1178        let sorted = s.sort(false);
1179        assert_slice_equal(&sorted, &[2, 4, 9]);
1180        let sorted = s.sort(true);
1181        assert_slice_equal(&sorted, &[9, 4, 2]);
1182
1183        let s: StringChunked = ["b", "a", "z"].iter().collect();
1184        let sorted = s.sort(false);
1185        assert_eq!(
1186            sorted.iter().collect::<Vec<_>>(),
1187            &[Some("a"), Some("b"), Some("z")]
1188        );
1189        let sorted = s.sort(true);
1190        assert_eq!(
1191            sorted.iter().collect::<Vec<_>>(),
1192            &[Some("z"), Some("b"), Some("a")]
1193        );
1194        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
1195        let sorted = s.sort(false);
1196        assert_eq!(
1197            sorted.iter().collect::<Vec<_>>(),
1198            &[None, Some("b"), Some("z")]
1199        );
1200    }
1201
1202    #[test]
1203    fn reverse() {
1204        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
1205        // path with continuous slice
1206        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
1207        // path with options
1208        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
1209        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
1210        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
1211        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);
1212
1213        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
1214        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);
1215
1216        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
1217        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
1218    }
1219
1220    #[test]
1221    #[cfg(feature = "dtype-categorical")]
1222    fn test_iter_categorical() {
1223        let ca = StringChunked::new(
1224            PlSmallStr::EMPTY,
1225            &[Some("foo"), None, Some("bar"), Some("ham")],
1226        );
1227        let cats = Categories::new(
1228            PlSmallStr::EMPTY,
1229            PlSmallStr::EMPTY,
1230            CategoricalPhysical::U32,
1231        );
1232        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
1233        let ca = ca.cat32().unwrap();
1234        let v: Vec<_> = ca.physical().iter().collect();
1235        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
1236    }
1237
1238    #[test]
1239    #[ignore]
1240    fn test_shrink_to_fit() {
1241        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
1242        builder.append_value("foo");
1243        let mut arr = builder.finish();
1244        let before = arr
1245            .chunks()
1246            .iter()
1247            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1248            .sum::<usize>();
1249        arr.shrink_to_fit();
1250        let after = arr
1251            .chunks()
1252            .iter()
1253            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1254            .sum::<usize>();
1255        assert!(before > after);
1256    }
1257}