polars_core/chunked_array/
mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use polars_compute::filter::filter_with_bitmap;
9
10use crate::prelude::{ChunkTakeUnchecked, *};
11
12pub mod ops;
13#[macro_use]
14pub mod arithmetic;
15pub mod builder;
16pub mod cast;
17pub mod collect;
18pub mod comparison;
19pub mod flags;
20pub mod float;
21pub mod iterator;
22#[cfg(feature = "ndarray")]
23pub(crate) mod ndarray;
24
25pub mod arg_min_max;
26#[cfg(feature = "dtype-array")]
27pub(crate) mod array;
28mod binary;
29mod binary_offset;
30mod bitwise;
31#[cfg(feature = "object")]
32mod drop;
33mod from;
34mod from_iterator;
35pub mod from_iterator_par;
36pub(crate) mod list;
37pub(crate) mod logical;
38#[cfg(feature = "object")]
39pub mod object;
40#[cfg(feature = "random")]
41mod random;
42#[cfg(feature = "dtype-struct")]
43mod struct_;
44#[cfg(any(
45    feature = "temporal",
46    feature = "dtype-datetime",
47    feature = "dtype-date"
48))]
49pub mod temporal;
50mod to_vec;
51mod trusted_len;
52pub(crate) use arg_min_max::*;
53use arrow::legacy::prelude::*;
54#[cfg(feature = "dtype-struct")]
55pub use struct_::StructChunked;
56
57use self::flags::{StatisticsFlags, StatisticsFlagsIM};
58use crate::series::IsSorted;
59use crate::utils::{first_non_null, first_null, last_non_null};
60
61#[cfg(not(feature = "dtype-categorical"))]
62pub struct RevMapping {}
63
64pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
65
66/// # ChunkedArray
67///
68/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
69/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
70/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
71///
72/// ```rust
73/// # use polars_core::prelude::*;
74/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
75///     ca.apply_values(|v| v.cos())
76/// }
77/// ```
78///
79/// ## Conversion between Series and ChunkedArrays
80/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
81///
82/// ```rust
83/// # use polars_core::prelude::*;
84/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
85///     series.i32()
86/// }
87///
88/// fn to_series(ca: Int32Chunked) -> Series {
89///     ca.into_series()
90/// }
91/// ```
92///
93/// # Iterators
94///
95/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
96/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
97/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
98///
99/// ```rust
100/// # use polars_core::prelude::*;
101///
102/// fn iter_forward(ca: &Float32Chunked) {
103///     ca.iter()
104///         .for_each(|opt_v| println!("{:?}", opt_v))
105/// }
106///
107/// fn iter_backward(ca: &Float32Chunked) {
108///     ca.iter()
109///         .rev()
110///         .for_each(|opt_v| println!("{:?}", opt_v))
111/// }
112/// ```
113///
114/// # Memory layout
115///
116/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
117/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
118///
119/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
120/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
121///
122/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
123/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
124/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
125///
126/// If you want to have predictable performance
127/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
128/// multiple append operations.
129///
130/// See also [`ChunkedArray::extend`] for appends within a chunk.
131///
132/// # Invariants
133/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
134/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
135///   chunks.
136/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
137///   logical type given by the datatype.
138///
139/// [`List`]: crate::datatypes::DataType::List
140pub struct ChunkedArray<T: PolarsDataType> {
141    pub(crate) field: Arc<Field>,
142    pub(crate) chunks: Vec<ArrayRef>,
143
144    pub(crate) flags: StatisticsFlagsIM,
145
146    length: usize,
147    null_count: usize,
148    _pd: std::marker::PhantomData<T>,
149}
150
151impl<T: PolarsDataType> ChunkedArray<T> {
152    fn should_rechunk(&self) -> bool {
153        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
154    }
155
156    fn optional_rechunk(mut self) -> Self {
157        // Rechunk if we have many small chunks.
158        if self.should_rechunk() {
159            self.rechunk_mut()
160        }
161        self
162    }
163
164    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
165        self
166    }
167
168    /// Series to [`ChunkedArray<T>`]
169    pub fn unpack_series_matching_type<'a>(
170        &self,
171        series: &'a Series,
172    ) -> PolarsResult<&'a ChunkedArray<T>> {
173        polars_ensure!(
174            self.dtype() == series.dtype(),
175            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
176            series.dtype(),
177            self.dtype(),
178        );
179
180        // SAFETY: dtype will be correct.
181        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
182    }
183
184    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
185    ///
186    /// If you want to explicitly the `length` and `null_count`, look at
187    /// [`ChunkedArray::new_with_dims`]
188    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
189        unsafe {
190            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
191            chunked_arr.compute_len();
192            chunked_arr
193        }
194    }
195
196    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
197    /// # Safety
198    /// The length and null_count must be correct.
199    pub unsafe fn new_with_dims(
200        field: Arc<Field>,
201        chunks: Vec<ArrayRef>,
202        length: usize,
203        null_count: usize,
204    ) -> Self {
205        Self {
206            field,
207            chunks,
208            flags: StatisticsFlagsIM::empty(),
209
210            _pd: Default::default(),
211            length,
212            null_count,
213        }
214    }
215
216    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
217        self.get_flags().is_sorted_ascending()
218    }
219
220    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
221        self.get_flags().is_sorted_descending()
222    }
223
224    /// Whether `self` is sorted in any direction.
225    pub(crate) fn is_sorted_any(&self) -> bool {
226        self.get_flags().is_sorted_any()
227    }
228
229    pub fn unset_fast_explode_list(&mut self) {
230        self.set_fast_explode_list(false)
231    }
232
233    pub fn set_fast_explode_list(&mut self, value: bool) {
234        let mut flags = self.flags.get_mut();
235        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
236        self.flags.set_mut(flags);
237    }
238
239    pub fn get_fast_explode_list(&self) -> bool {
240        self.get_flags().can_fast_explode_list()
241    }
242
243    pub fn get_flags(&self) -> StatisticsFlags {
244        self.flags.get()
245    }
246
247    /// Set flags for the [`ChunkedArray`]
248    pub fn set_flags(&mut self, flags: StatisticsFlags) {
249        self.flags = StatisticsFlagsIM::new(flags);
250    }
251
252    pub fn is_sorted_flag(&self) -> IsSorted {
253        self.get_flags().is_sorted()
254    }
255
256    pub fn retain_flags_from<U: PolarsDataType>(
257        &mut self,
258        from: &ChunkedArray<U>,
259        retain_flags: StatisticsFlags,
260    ) {
261        let flags = from.flags.get();
262        // Try to avoid write contention.
263        if !flags.is_empty() {
264            self.set_flags(flags & retain_flags)
265        }
266    }
267
268    /// Set the 'sorted' bit meta info.
269    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
270        let mut flags = self.flags.get_mut();
271        flags.set_sorted(sorted);
272        self.flags.set_mut(flags);
273    }
274
275    /// Set the 'sorted' bit meta info.
276    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
277        let mut out = self.clone();
278        out.set_sorted_flag(sorted);
279        out
280    }
281
282    pub fn first_null(&self) -> Option<usize> {
283        if self.null_count() == 0 {
284            None
285        }
286        // We now know there is at least 1 non-null item in the array, and self.len() > 0
287        else if self.null_count() == self.len() {
288            Some(0)
289        } else if self.is_sorted_any() {
290            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
291                // nulls are all at the start
292                0
293            } else {
294                // nulls are all at the end
295                self.null_count()
296            };
297
298            debug_assert!(
299                // If we are lucky this catches something.
300                unsafe { self.get_unchecked(out) }.is_some(),
301                "incorrect sorted flag"
302            );
303
304            Some(out)
305        } else {
306            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
307        }
308    }
309
310    /// Get the index of the first non null value in this [`ChunkedArray`].
311    pub fn first_non_null(&self) -> Option<usize> {
312        if self.null_count() == self.len() {
313            None
314        }
315        // We now know there is at least 1 non-null item in the array, and self.len() > 0
316        else if self.null_count() == 0 {
317            Some(0)
318        } else if self.is_sorted_any() {
319            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
320                // nulls are all at the start
321                self.null_count()
322            } else {
323                // nulls are all at the end
324                0
325            };
326
327            debug_assert!(
328                // If we are lucky this catches something.
329                unsafe { self.get_unchecked(out) }.is_some(),
330                "incorrect sorted flag"
331            );
332
333            Some(out)
334        } else {
335            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
336        }
337    }
338
339    /// Get the index of the last non null value in this [`ChunkedArray`].
340    pub fn last_non_null(&self) -> Option<usize> {
341        if self.null_count() == self.len() {
342            None
343        }
344        // We now know there is at least 1 non-null item in the array, and self.len() > 0
345        else if self.null_count() == 0 {
346            Some(self.len() - 1)
347        } else if self.is_sorted_any() {
348            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
349                // nulls are all at the start
350                self.len() - 1
351            } else {
352                // nulls are all at the end
353                self.len() - self.null_count() - 1
354            };
355
356            debug_assert!(
357                // If we are lucky this catches something.
358                unsafe { self.get_unchecked(out) }.is_some(),
359                "incorrect sorted flag"
360            );
361
362            Some(out)
363        } else {
364            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
365        }
366    }
367
368    pub fn drop_nulls(&self) -> Self {
369        if self.null_count() == 0 {
370            self.clone()
371        } else {
372            let chunks = self
373                .downcast_iter()
374                .map(|arr| {
375                    if arr.null_count() == 0 {
376                        arr.to_boxed()
377                    } else {
378                        filter_with_bitmap(arr, arr.validity().unwrap())
379                    }
380                })
381                .collect();
382            unsafe {
383                Self::new_with_dims(
384                    self.field.clone(),
385                    chunks,
386                    self.len() - self.null_count(),
387                    0,
388                )
389            }
390        }
391    }
392
393    /// Get the buffer of bits representing null values
394    #[inline]
395    #[allow(clippy::type_complexity)]
396    pub fn iter_validities(
397        &self,
398    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
399        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
400            arr.validity()
401        }
402        self.chunks.iter().map(to_validity)
403    }
404
405    #[inline]
406    /// Return if any the chunks in this [`ChunkedArray`] have nulls.
407    pub fn has_nulls(&self) -> bool {
408        self.null_count > 0
409    }
410
411    /// Shrink the capacity of this array to fit its length.
412    pub fn shrink_to_fit(&mut self) {
413        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
414    }
415
416    pub fn clear(&self) -> Self {
417        // SAFETY: we keep the correct dtype
418        let mut ca = unsafe {
419            self.copy_with_chunks(vec![new_empty_array(
420                self.chunks.first().unwrap().dtype().clone(),
421            )])
422        };
423
424        use StatisticsFlags as F;
425        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
426        ca
427    }
428
429    /// Unpack a [`Series`] to the same physical type.
430    ///
431    /// # Safety
432    ///
433    /// This is unsafe as the dtype may be incorrect and
434    /// is assumed to be correct in other safe code.
435    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
436        &self,
437        series: &'a Series,
438    ) -> &'a ChunkedArray<T> {
439        let series_trait = &**series;
440        if self.dtype() == series.dtype() {
441            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
442        } else {
443            use DataType::*;
444            match (self.dtype(), series.dtype()) {
445                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
446                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
447                },
448                _ => panic!(
449                    "cannot unpack series {:?} into matching type {:?}",
450                    series,
451                    self.dtype()
452                ),
453            }
454        }
455    }
456
457    /// Returns an iterator over the lengths of the chunks of the array.
458    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
459        self.chunks.iter().map(|chunk| chunk.len())
460    }
461
462    /// A reference to the chunks
463    #[inline]
464    pub fn chunks(&self) -> &Vec<ArrayRef> {
465        &self.chunks
466    }
467
468    /// A mutable reference to the chunks
469    ///
470    /// # Safety
471    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
472    /// And the `null_count` remains correct.
473    #[inline]
474    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
475        &mut self.chunks
476    }
477
478    /// Returns true if contains a single chunk and has no null values
479    pub fn is_optimal_aligned(&self) -> bool {
480        self.chunks.len() == 1 && self.null_count() == 0
481    }
482
483    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
484    ///
485    /// # Safety
486    /// The caller must ensure the dtypes of the chunks are correct
487    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
488        Self::new_with_compute_len(self.field.clone(), chunks)
489    }
490
491    /// Get data type of [`ChunkedArray`].
492    pub fn dtype(&self) -> &DataType {
493        self.field.dtype()
494    }
495
496    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
497        self.field = Arc::new(Field::new(self.name().clone(), dtype))
498    }
499
500    /// Name of the [`ChunkedArray`].
501    pub fn name(&self) -> &PlSmallStr {
502        self.field.name()
503    }
504
505    /// Get a reference to the field.
506    pub fn ref_field(&self) -> &Field {
507        &self.field
508    }
509
510    /// Rename this [`ChunkedArray`].
511    pub fn rename(&mut self, name: PlSmallStr) {
512        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
513    }
514
515    /// Return this [`ChunkedArray`] with a new name.
516    pub fn with_name(mut self, name: PlSmallStr) -> Self {
517        self.rename(name);
518        self
519    }
520}
521
522impl<T> ChunkedArray<T>
523where
524    T: PolarsDataType,
525{
526    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
527    /// indicates a NULL value.
528    ///
529    /// # Panics
530    /// This function will panic if `idx` is out of bounds.
531    #[inline]
532    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
533        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
534        assert!(
535            chunk_idx < self.chunks().len(),
536            "index: {} out of bounds for len: {}",
537            idx,
538            self.len()
539        );
540        unsafe {
541            let arr = self.downcast_get_unchecked(chunk_idx);
542            assert!(
543                arr_idx < arr.len(),
544                "index: {} out of bounds for len: {}",
545                idx,
546                self.len()
547            );
548            arr.get_unchecked(arr_idx)
549        }
550    }
551
552    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
553    /// indicates a NULL value.
554    ///
555    /// # Safety
556    /// It is the callers responsibility that the `idx < self.len()`.
557    #[inline]
558    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
559        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
560
561        unsafe {
562            // SAFETY: up to the caller to make sure the index is valid.
563            self.downcast_get_unchecked(chunk_idx)
564                .get_unchecked(arr_idx)
565        }
566    }
567
568    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
569    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
570    ///
571    /// # Safety
572    /// It is the callers responsibility that the `idx < self.len()`.
573    #[inline]
574    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
575        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
576
577        unsafe {
578            // SAFETY: up to the caller to make sure the index is valid.
579            self.downcast_get_unchecked(chunk_idx)
580                .value_unchecked(arr_idx)
581        }
582    }
583
584    /// # Panics
585    /// Panics if the [`ChunkedArray`] is empty.
586    #[inline]
587    pub fn first(&self) -> Option<T::Physical<'_>> {
588        self.iter().next().unwrap()
589    }
590
591    /// # Panics
592    /// Panics if the [`ChunkedArray`] is empty.
593    #[inline]
594    pub fn last(&self) -> Option<T::Physical<'_>> {
595        let arr = self
596            .downcast_iter()
597            .rev()
598            .find(|arr| !arr.is_empty())
599            .unwrap();
600        unsafe { arr.get_unchecked(arr.len() - 1) }
601    }
602
603    pub fn set_validity(&mut self, validity: &Bitmap) {
604        assert_eq!(self.len(), validity.len());
605        let mut i = 0;
606        for chunk in unsafe { self.chunks_mut() } {
607            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
608            i += chunk.len();
609        }
610        self.null_count = validity.unset_bits();
611        self.set_fast_explode_list(false);
612    }
613}
614
615impl<T> ChunkedArray<T>
616where
617    T: PolarsDataType,
618    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
619{
620    /// Deposit values into nulls with a certain validity mask.
621    pub fn deposit(&self, validity: &Bitmap) -> Self {
622        let set_bits = validity.set_bits();
623
624        assert_eq!(self.null_count(), 0);
625        assert_eq!(self.len(), set_bits);
626
627        if set_bits == validity.len() {
628            return self.clone();
629        }
630
631        if set_bits == 0 {
632            return Self::full_null_like(self, validity.len());
633        }
634
635        let mut null_mask = validity.clone();
636
637        let mut gather_idxs = Vec::with_capacity(validity.len());
638        let leading_nulls = null_mask.take_leading_zeros();
639        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));
640
641        let mut i = 0 as IdxSize;
642        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
643            i += IdxSize::from(v);
644            i
645        }));
646
647        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
648        ca.set_validity(validity);
649        ca
650    }
651}
652
653impl ListChunked {
654    #[inline]
655    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
656        unsafe {
657            Some(Series::from_chunks_and_dtype_unchecked(
658                self.name().clone(),
659                vec![self.get(idx)?],
660                &self.inner_dtype().to_physical(),
661            ))
662        }
663    }
664
665    pub fn has_empty_lists(&self) -> bool {
666        for arr in self.downcast_iter() {
667            if arr.is_empty() {
668                continue;
669            }
670
671            if match arr.validity() {
672                None => arr.offsets().lengths().any(|l| l == 0),
673                Some(validity) => arr
674                    .offsets()
675                    .lengths()
676                    .enumerate()
677                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
678            } {
679                return true;
680            }
681        }
682
683        false
684    }
685
686    pub fn has_masked_out_values(&self) -> bool {
687        for arr in self.downcast_iter() {
688            if arr.is_empty() {
689                continue;
690            }
691
692            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
693                return true;
694            }
695
696            let Some(validity) = arr.validity() else {
697                continue;
698            };
699            if validity.set_bits() == 0 {
700                continue;
701            }
702
703            // @Performance: false_idx_iter
704            for i in (!validity).true_idx_iter() {
705                if arr.offsets().length_at(i) > 0 {
706                    return true;
707                }
708            }
709        }
710
711        false
712    }
713}
714
715#[cfg(feature = "dtype-array")]
716impl ArrayChunked {
717    #[inline]
718    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
719        unsafe {
720            Some(Series::from_chunks_and_dtype_unchecked(
721                self.name().clone(),
722                vec![self.get(idx)?],
723                &self.inner_dtype().to_physical(),
724            ))
725        }
726    }
727
728    pub fn from_aligned_values(
729        name: PlSmallStr,
730        inner_dtype: &DataType,
731        width: usize,
732        chunks: Vec<ArrayRef>,
733        length: usize,
734    ) -> Self {
735        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
736        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
737        let field = Arc::new(Field::new(name, dtype));
738        if width == 0 {
739            use arrow::array::builder::{ArrayBuilder, make_builder};
740            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
741            return ArrayChunked::new_with_compute_len(
742                field,
743                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
744            );
745        }
746        let mut total_len = 0;
747        let chunks = chunks
748            .into_iter()
749            .map(|chunk| {
750                debug_assert_eq!(chunk.len() % width, 0);
751                let chunk_len = chunk.len() / width;
752                total_len += chunk_len;
753                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
754            })
755            .collect();
756        debug_assert_eq!(total_len, length);
757
758        unsafe { Self::new_with_dims(field, chunks, length, 0) }
759    }
760
761    /// Turn the ArrayChunked into the ListChunked with the same items.
762    ///
763    /// This will always zero copy the values into the ListChunked.
764    pub fn to_list(&self) -> ListChunked {
765        let inner_dtype = self.inner_dtype();
766        let chunks = self
767            .downcast_iter()
768            .map(|chunk| {
769                use arrow::offset::OffsetsBuffer;
770
771                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
772                let dtype = inner_dtype.clone().to_large_list(true);
773
774                let offsets = (0..=chunk.len())
775                    .map(|i| (i * self.width()) as i64)
776                    .collect::<Vec<i64>>();
777
778                // SAFETY: We created our offsets in ascending manner.
779                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
780
781                ListArray::<i64>::new(
782                    dtype,
783                    offsets,
784                    chunk.values().clone(),
785                    chunk.validity().cloned(),
786                )
787                .into_boxed()
788            })
789            .collect();
790
791        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
792        let mut ca = unsafe {
793            ListChunked::new_with_dims(
794                Arc::new(Field::new(
795                    self.name().clone(),
796                    DataType::List(Box::new(inner_dtype.clone())),
797                )),
798                chunks,
799                self.len(),
800                self.null_count(),
801            )
802        };
803        ca.set_fast_explode_list(!self.has_nulls());
804        ca
805    }
806}
807
808impl<T> ChunkedArray<T>
809where
810    T: PolarsDataType,
811{
812    /// Should be used to match the chunk_id of another [`ChunkedArray`].
813    /// # Panics
814    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
815    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
816    where
817        I: Iterator<Item = usize>,
818    {
819        debug_assert!(self.chunks.len() == 1);
820        // Takes a ChunkedArray containing a single chunk.
821        let slice = |ca: &Self| {
822            let array = &ca.chunks[0];
823
824            let mut offset = 0;
825            let chunks = chunk_id
826                .map(|len| {
827                    // SAFETY: within bounds.
828                    debug_assert!((offset + len) <= array.len());
829                    let out = unsafe { array.sliced_unchecked(offset, len) };
830                    offset += len;
831                    out
832                })
833                .collect();
834
835            debug_assert_eq!(offset, array.len());
836
837            // SAFETY: We just slice the original chunks, their type will not change.
838            unsafe {
839                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
840            }
841        };
842
843        if self.chunks.len() != 1 {
844            let out = self.rechunk();
845            slice(&out)
846        } else {
847            slice(self)
848        }
849    }
850}
851
852impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
853    fn as_ref_dtype(&self) -> &DataType {
854        self.dtype()
855    }
856}
857
858pub(crate) trait AsSinglePtr: AsRefDataType {
859    /// Rechunk and return a ptr to the start of the array
860    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
861        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
862    }
863}
864
865impl<T> AsSinglePtr for ChunkedArray<T>
866where
867    T: PolarsNumericType,
868{
869    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
870        self.rechunk_mut();
871        let a = self.data_views().next().unwrap();
872        let ptr = a.as_ptr();
873        Ok(ptr as usize)
874    }
875}
876
877impl AsSinglePtr for BooleanChunked {}
878impl AsSinglePtr for ListChunked {}
879#[cfg(feature = "dtype-array")]
880impl AsSinglePtr for ArrayChunked {}
881impl AsSinglePtr for StringChunked {}
882impl AsSinglePtr for BinaryChunked {}
883#[cfg(feature = "object")]
884impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
885
886pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
887    SingleNoNull(&'a T::Array),
888    Single(&'a T::Array),
889    MultiNoNull(&'a ChunkedArray<T>),
890    Multi(&'a ChunkedArray<T>),
891}
892
893impl<T> ChunkedArray<T>
894where
895    T: PolarsDataType,
896{
897    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
898        if self.chunks.len() == 1 {
899            let arr = self.downcast_iter().next().unwrap();
900            return if arr.null_count() == 0 {
901                ChunkedArrayLayout::SingleNoNull(arr)
902            } else {
903                ChunkedArrayLayout::Single(arr)
904            };
905        }
906
907        if self.downcast_iter().all(|a| a.null_count() == 0) {
908            ChunkedArrayLayout::MultiNoNull(self)
909        } else {
910            ChunkedArrayLayout::Multi(self)
911        }
912    }
913}
914
915impl<T> ChunkedArray<T>
916where
917    T: PolarsNumericType,
918{
919    /// Returns the values of the array as a contiguous slice.
920    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
921        polars_ensure!(
922            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
923            ComputeError: "chunked array is not contiguous"
924        );
925        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
926    }
927
928    /// Returns the values of the array as a contiguous mutable slice.
929    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
930        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
931            // SAFETY, we will not swap the PrimitiveArray.
932            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
933            arr.get_mut_values()
934        } else {
935            None
936        }
937    }
938
939    /// Get slices of the underlying arrow data.
940    /// NOTE: null values should be taken into account by the user of these slices as they are handled
941    /// separately
942    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
943        self.downcast_iter().map(|arr| arr.values().as_slice())
944    }
945
946    #[allow(clippy::wrong_self_convention)]
947    pub fn into_no_null_iter(
948        &self,
949    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
950    {
951        // .copied was significantly slower in benchmark, next call did not inline?
952        #[allow(clippy::map_clone)]
953        // we know the iterators len
954        unsafe {
955            self.data_views()
956                .flatten()
957                .map(|v| *v)
958                .trust_my_length(self.len())
959        }
960    }
961}
962
963impl<T: PolarsDataType> Clone for ChunkedArray<T> {
964    fn clone(&self) -> Self {
965        ChunkedArray {
966            field: self.field.clone(),
967            chunks: self.chunks.clone(),
968            flags: self.flags.clone(),
969
970            _pd: Default::default(),
971            length: self.length,
972            null_count: self.null_count,
973        }
974    }
975}
976
977impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
978    fn as_ref(&self) -> &ChunkedArray<T> {
979        self
980    }
981}
982
983impl ValueSize for ListChunked {
984    fn get_values_size(&self) -> usize {
985        self.chunks
986            .iter()
987            .fold(0usize, |acc, arr| acc + arr.get_values_size())
988    }
989}
990
991#[cfg(feature = "dtype-array")]
992impl ValueSize for ArrayChunked {
993    fn get_values_size(&self) -> usize {
994        self.chunks
995            .iter()
996            .fold(0usize, |acc, arr| acc + arr.get_values_size())
997    }
998}
999impl ValueSize for StringChunked {
1000    fn get_values_size(&self) -> usize {
1001        self.chunks
1002            .iter()
1003            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1004    }
1005}
1006
1007impl ValueSize for BinaryOffsetChunked {
1008    fn get_values_size(&self) -> usize {
1009        self.chunks
1010            .iter()
1011            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1012    }
1013}
1014
1015pub(crate) fn to_primitive<T: PolarsNumericType>(
1016    values: Vec<T::Native>,
1017    validity: Option<Bitmap>,
1018) -> PrimitiveArray<T::Native> {
1019    PrimitiveArray::new(
1020        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1021        values.into(),
1022        validity,
1023    )
1024}
1025
1026pub(crate) fn to_array<T: PolarsNumericType>(
1027    values: Vec<T::Native>,
1028    validity: Option<Bitmap>,
1029) -> ArrayRef {
1030    Box::new(to_primitive::<T>(values, validity))
1031}
1032
1033impl<T: PolarsDataType> Default for ChunkedArray<T> {
1034    fn default() -> Self {
1035        let dtype = T::get_static_dtype();
1036        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1037        ChunkedArray {
1038            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1039            // Invariant: always has 1 chunk.
1040            chunks: vec![new_empty_array(arrow_dtype)],
1041            flags: StatisticsFlagsIM::empty(),
1042
1043            _pd: Default::default(),
1044            length: 0,
1045            null_count: 0,
1046        }
1047    }
1048}
1049
1050#[cfg(test)]
1051pub(crate) mod test {
1052    use crate::prelude::*;
1053
1054    pub(crate) fn get_chunked_array() -> Int32Chunked {
1055        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
1056    }
1057
1058    #[test]
1059    fn test_sort() {
1060        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
1061        let b = a
1062            .sort(false)
1063            .into_iter()
1064            .map(|opt| opt.unwrap())
1065            .collect::<Vec<_>>();
1066        assert_eq!(b, [1, 2, 3, 9]);
1067        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
1068        let a = a.sort(false);
1069        let b = a.into_iter().collect::<Vec<_>>();
1070        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
1071        assert!(a.is_sorted_ascending_flag());
1072    }
1073
1074    #[test]
1075    fn arithmetic() {
1076        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
1077        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);
1078
1079        // Not really asserting anything here but still making sure the code is exercised
1080        // This (and more) is properly tested from the integration test suite and Python bindings.
1081        println!("{:?}", a + b);
1082        println!("{:?}", a - b);
1083        println!("{:?}", a * b);
1084        println!("{:?}", a / b);
1085    }
1086
1087    #[test]
1088    fn iter() {
1089        let s1 = get_chunked_array();
1090        // sum
1091        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
1092    }
1093
1094    #[test]
1095    fn limit() {
1096        let a = get_chunked_array();
1097        let b = a.limit(2);
1098        println!("{b:?}");
1099        assert_eq!(b.len(), 2)
1100    }
1101
1102    #[test]
1103    fn filter() {
1104        let a = get_chunked_array();
1105        let b = a
1106            .filter(&BooleanChunked::new(
1107                PlSmallStr::from_static("filter"),
1108                &[true, false, false],
1109            ))
1110            .unwrap();
1111        assert_eq!(b.len(), 1);
1112        assert_eq!(b.into_iter().next(), Some(Some(1)));
1113    }
1114
1115    #[test]
1116    fn aggregates() {
1117        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
1118        assert_eq!(a.max(), Some(100));
1119        assert_eq!(a.min(), Some(1));
1120        assert_eq!(a.sum(), Some(120))
1121    }
1122
1123    #[test]
1124    fn take() {
1125        let a = get_chunked_array();
1126        let new = a.take(&[0 as IdxSize, 1]).unwrap();
1127        assert_eq!(new.len(), 2)
1128    }
1129
1130    #[test]
1131    fn cast() {
1132        let a = get_chunked_array();
1133        let b = a.cast(&DataType::Int64).unwrap();
1134        assert_eq!(b.dtype(), &DataType::Int64)
1135    }
1136
1137    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
1138    where
1139        T: PolarsNumericType,
1140    {
1141        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
1142    }
1143
1144    #[test]
1145    fn slice() {
1146        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
1147        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
1148        first.append(&second).unwrap();
1149        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
1150        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
1151        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
1152        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
1153        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
1154        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
1155        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);
1156
1157        assert_eq!(first.slice(-7, 2).len(), 1);
1158        assert_eq!(first.slice(-3, 4).len(), 3);
1159        assert_eq!(first.slice(3, 4).len(), 3);
1160        assert_eq!(first.slice(10, 4).len(), 0);
1161    }
1162
1163    #[test]
1164    fn sorting() {
1165        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
1166        let sorted = s.sort(false);
1167        assert_slice_equal(&sorted, &[2, 4, 9]);
1168        let sorted = s.sort(true);
1169        assert_slice_equal(&sorted, &[9, 4, 2]);
1170
1171        let s: StringChunked = ["b", "a", "z"].iter().collect();
1172        let sorted = s.sort(false);
1173        assert_eq!(
1174            sorted.into_iter().collect::<Vec<_>>(),
1175            &[Some("a"), Some("b"), Some("z")]
1176        );
1177        let sorted = s.sort(true);
1178        assert_eq!(
1179            sorted.into_iter().collect::<Vec<_>>(),
1180            &[Some("z"), Some("b"), Some("a")]
1181        );
1182        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
1183        let sorted = s.sort(false);
1184        assert_eq!(
1185            sorted.into_iter().collect::<Vec<_>>(),
1186            &[None, Some("b"), Some("z")]
1187        );
1188    }
1189
1190    #[test]
1191    fn reverse() {
1192        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
1193        // path with continuous slice
1194        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
1195        // path with options
1196        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
1197        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
1198        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
1199        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);
1200
1201        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
1202        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);
1203
1204        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
1205        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
1206    }
1207
1208    #[test]
1209    #[cfg(feature = "dtype-categorical")]
1210    fn test_iter_categorical() {
1211        let ca = StringChunked::new(
1212            PlSmallStr::EMPTY,
1213            &[Some("foo"), None, Some("bar"), Some("ham")],
1214        );
1215        let cats = Categories::new(
1216            PlSmallStr::EMPTY,
1217            PlSmallStr::EMPTY,
1218            CategoricalPhysical::U32,
1219        );
1220        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
1221        let ca = ca.cat32().unwrap();
1222        let v: Vec<_> = ca.physical().into_iter().collect();
1223        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
1224    }
1225
1226    #[test]
1227    #[ignore]
1228    fn test_shrink_to_fit() {
1229        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
1230        builder.append_value("foo");
1231        let mut arr = builder.finish();
1232        let before = arr
1233            .chunks()
1234            .iter()
1235            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1236            .sum::<usize>();
1237        arr.shrink_to_fit();
1238        let after = arr
1239            .chunks()
1240            .iter()
1241            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
1242            .sum::<usize>();
1243        assert!(before > after);
1244    }
1245}