// polars_core/chunked_array/mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use polars_compute::filter::filter_with_bitmap;
9
10use crate::prelude::{ChunkTakeUnchecked, *};
11
12pub mod ops;
13#[macro_use]
14pub mod arithmetic;
15pub mod builder;
16pub mod cast;
17pub mod collect;
18pub mod comparison;
19pub mod flags;
20pub mod float;
21pub mod iterator;
22#[cfg(feature = "ndarray")]
23pub(crate) mod ndarray;
24
25#[cfg(feature = "dtype-array")]
26pub(crate) mod array;
27mod binary;
28mod binary_offset;
29mod bitwise;
30#[cfg(feature = "object")]
31mod drop;
32mod from;
33mod from_iterator;
34pub mod from_iterator_par;
35pub(crate) mod list;
36pub(crate) mod logical;
37#[cfg(feature = "object")]
38pub mod object;
39#[cfg(feature = "random")]
40mod random;
41#[cfg(feature = "dtype-struct")]
42mod struct_;
43#[cfg(any(
44    feature = "temporal",
45    feature = "dtype-datetime",
46    feature = "dtype-date"
47))]
48pub mod temporal;
49mod to_vec;
50mod trusted_len;
51
52use arrow::legacy::prelude::*;
53#[cfg(feature = "dtype-struct")]
54pub use struct_::StructChunked;
55
56use self::flags::{StatisticsFlags, StatisticsFlagsIM};
57use crate::series::IsSorted;
58use crate::utils::{first_non_null, first_null, last_non_null};
59
/// Stub so code can name `RevMapping` even when the "dtype-categorical"
/// feature (which provides the real implementation) is disabled.
#[cfg(not(feature = "dtype-categorical"))]
pub struct RevMapping {}
62
63pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
64
/// # ChunkedArray
///
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
/// Below we use `apply` to apply the cosine function to the values of a [`ChunkedArray`].
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
///     ca.apply_values(|v| v.cos())
/// }
/// ```
///
/// ## Conversion between Series and ChunkedArrays
/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
///     series.i32()
/// }
///
/// fn to_series(ca: Int32Chunked) -> Series {
///     ca.into_series()
/// }
/// ```
///
/// # Iterators
///
/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
///
/// ```rust
/// # use polars_core::prelude::*;
///
/// fn iter_forward(ca: &Float32Chunked) {
///     ca.iter()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
///
/// fn iter_backward(ca: &Float32Chunked) {
///     ca.iter()
///         .rev()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
/// ```
///
/// # Memory layout
///
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
/// Arrow's memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
///
/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
///
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
///
/// If you want to have predictable performance
/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
/// multiple append operations.
///
/// See also [`ChunkedArray::extend`] for appends within a chunk.
///
/// # Invariants
/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
///   chunks.
/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
///   logical type given by the datatype.
///
/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    /// Name and (logical) datatype of this column.
    pub(crate) field: Arc<Field>,
    /// The Arrow arrays backing the data; invariant: never empty.
    pub(crate) chunks: Vec<ArrayRef>,

    /// Cached statistics (sortedness, fast-explode) — metadata only.
    pub(crate) flags: StatisticsFlagsIM,

    // Cached totals over all chunks; must stay in sync with `chunks`.
    length: usize,
    null_count: usize,
    _pd: std::marker::PhantomData<T>,
}
149
150impl<T: PolarsDataType> ChunkedArray<T> {
151    fn should_rechunk(&self) -> bool {
152        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
153    }
154
155    fn optional_rechunk(mut self) -> Self {
156        // Rechunk if we have many small chunks.
157        if self.should_rechunk() {
158            self.rechunk_mut()
159        }
160        self
161    }
162
163    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
164        self
165    }
166
167    /// Series to [`ChunkedArray<T>`]
168    pub fn unpack_series_matching_type<'a>(
169        &self,
170        series: &'a Series,
171    ) -> PolarsResult<&'a ChunkedArray<T>> {
172        polars_ensure!(
173            self.dtype() == series.dtype(),
174            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
175            series.dtype(),
176            self.dtype(),
177        );
178
179        // SAFETY: dtype will be correct.
180        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
181    }
182
183    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
184    ///
185    /// If you want to explicitly the `length` and `null_count`, look at
186    /// [`ChunkedArray::new_with_dims`]
187    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
188        unsafe {
189            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
190            chunked_arr.compute_len();
191            chunked_arr
192        }
193    }
194
195    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
196    /// # Safety
197    /// The length and null_count must be correct.
198    pub unsafe fn new_with_dims(
199        field: Arc<Field>,
200        chunks: Vec<ArrayRef>,
201        length: usize,
202        null_count: usize,
203    ) -> Self {
204        Self {
205            field,
206            chunks,
207            flags: StatisticsFlagsIM::empty(),
208
209            _pd: Default::default(),
210            length,
211            null_count,
212        }
213    }
214
215    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
216        self.get_flags().is_sorted_ascending()
217    }
218
219    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
220        self.get_flags().is_sorted_descending()
221    }
222
223    /// Whether `self` is sorted in any direction.
224    pub(crate) fn is_sorted_any(&self) -> bool {
225        self.get_flags().is_sorted_any()
226    }
227
228    pub fn unset_fast_explode_list(&mut self) {
229        self.set_fast_explode_list(false)
230    }
231
232    pub fn set_fast_explode_list(&mut self, value: bool) {
233        let mut flags = self.flags.get_mut();
234        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
235        self.flags.set_mut(flags);
236    }
237
238    pub fn get_fast_explode_list(&self) -> bool {
239        self.get_flags().can_fast_explode_list()
240    }
241
242    pub fn get_flags(&self) -> StatisticsFlags {
243        self.flags.get()
244    }
245
246    /// Set flags for the [`ChunkedArray`]
247    pub fn set_flags(&mut self, flags: StatisticsFlags) {
248        self.flags = StatisticsFlagsIM::new(flags);
249    }
250
251    pub fn is_sorted_flag(&self) -> IsSorted {
252        self.get_flags().is_sorted()
253    }
254
255    pub fn retain_flags_from<U: PolarsDataType>(
256        &mut self,
257        from: &ChunkedArray<U>,
258        retain_flags: StatisticsFlags,
259    ) {
260        let flags = from.flags.get();
261        // Try to avoid write contention.
262        if !flags.is_empty() {
263            self.set_flags(flags & retain_flags)
264        }
265    }
266
267    /// Set the 'sorted' bit meta info.
268    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
269        let mut flags = self.flags.get_mut();
270        flags.set_sorted(sorted);
271        self.flags.set_mut(flags);
272    }
273
274    /// Set the 'sorted' bit meta info.
275    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
276        let mut out = self.clone();
277        out.set_sorted_flag(sorted);
278        out
279    }
280
281    pub fn first_null(&self) -> Option<usize> {
282        if self.null_count() == 0 {
283            None
284        }
285        // We now know there is at least 1 non-null item in the array, and self.len() > 0
286        else if self.null_count() == self.len() {
287            Some(0)
288        } else if self.is_sorted_any() {
289            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
290                // nulls are all at the start
291                0
292            } else {
293                // nulls are all at the end
294                self.null_count()
295            };
296
297            debug_assert!(
298                // If we are lucky this catches something.
299                unsafe { self.get_unchecked(out) }.is_some(),
300                "incorrect sorted flag"
301            );
302
303            Some(out)
304        } else {
305            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
306        }
307    }
308
309    /// Get the index of the first non null value in this [`ChunkedArray`].
310    pub fn first_non_null(&self) -> Option<usize> {
311        if self.null_count() == self.len() {
312            None
313        }
314        // We now know there is at least 1 non-null item in the array, and self.len() > 0
315        else if self.null_count() == 0 {
316            Some(0)
317        } else if self.is_sorted_any() {
318            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
319                // nulls are all at the start
320                self.null_count()
321            } else {
322                // nulls are all at the end
323                0
324            };
325
326            debug_assert!(
327                // If we are lucky this catches something.
328                unsafe { self.get_unchecked(out) }.is_some(),
329                "incorrect sorted flag"
330            );
331
332            Some(out)
333        } else {
334            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
335        }
336    }
337
338    /// Get the index of the last non null value in this [`ChunkedArray`].
339    pub fn last_non_null(&self) -> Option<usize> {
340        if self.null_count() == self.len() {
341            None
342        }
343        // We now know there is at least 1 non-null item in the array, and self.len() > 0
344        else if self.null_count() == 0 {
345            Some(self.len() - 1)
346        } else if self.is_sorted_any() {
347            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
348                // nulls are all at the start
349                self.len() - 1
350            } else {
351                // nulls are all at the end
352                self.len() - self.null_count() - 1
353            };
354
355            debug_assert!(
356                // If we are lucky this catches something.
357                unsafe { self.get_unchecked(out) }.is_some(),
358                "incorrect sorted flag"
359            );
360
361            Some(out)
362        } else {
363            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
364        }
365    }
366
367    pub fn drop_nulls(&self) -> Self {
368        if self.null_count() == 0 {
369            self.clone()
370        } else {
371            let chunks = self
372                .downcast_iter()
373                .map(|arr| {
374                    if arr.null_count() == 0 {
375                        arr.to_boxed()
376                    } else {
377                        filter_with_bitmap(arr, arr.validity().unwrap())
378                    }
379                })
380                .collect();
381            unsafe {
382                Self::new_with_dims(
383                    self.field.clone(),
384                    chunks,
385                    self.len() - self.null_count(),
386                    0,
387                )
388            }
389        }
390    }
391
392    /// Get the buffer of bits representing null values
393    #[inline]
394    #[allow(clippy::type_complexity)]
395    pub fn iter_validities(
396        &self,
397    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
398        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
399            arr.validity()
400        }
401        self.chunks.iter().map(to_validity)
402    }
403
404    #[inline]
405    /// Return if any the chunks in this [`ChunkedArray`] have nulls.
406    pub fn has_nulls(&self) -> bool {
407        self.null_count > 0
408    }
409
410    /// Shrink the capacity of this array to fit its length.
411    pub fn shrink_to_fit(&mut self) {
412        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
413    }
414
415    pub fn clear(&self) -> Self {
416        // SAFETY: we keep the correct dtype
417        let mut ca = unsafe {
418            self.copy_with_chunks(vec![new_empty_array(
419                self.chunks.first().unwrap().dtype().clone(),
420            )])
421        };
422
423        use StatisticsFlags as F;
424        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
425        ca
426    }
427
428    /// Unpack a [`Series`] to the same physical type.
429    ///
430    /// # Safety
431    ///
432    /// This is unsafe as the dtype may be incorrect and
433    /// is assumed to be correct in other safe code.
434    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
435        &self,
436        series: &'a Series,
437    ) -> &'a ChunkedArray<T> {
438        let series_trait = &**series;
439        if self.dtype() == series.dtype() {
440            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
441        } else {
442            use DataType::*;
443            match (self.dtype(), series.dtype()) {
444                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
445                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
446                },
447                _ => panic!(
448                    "cannot unpack series {:?} into matching type {:?}",
449                    series,
450                    self.dtype()
451                ),
452            }
453        }
454    }
455
456    /// Returns an iterator over the lengths of the chunks of the array.
457    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
458        self.chunks.iter().map(|chunk| chunk.len())
459    }
460
461    /// A reference to the chunks
462    #[inline]
463    pub fn chunks(&self) -> &Vec<ArrayRef> {
464        &self.chunks
465    }
466
467    /// A mutable reference to the chunks
468    ///
469    /// # Safety
470    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
471    /// And the `null_count` remains correct.
472    #[inline]
473    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
474        &mut self.chunks
475    }
476
477    /// Returns true if contains a single chunk and has no null values
478    pub fn is_optimal_aligned(&self) -> bool {
479        self.chunks.len() == 1 && self.null_count() == 0
480    }
481
482    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
483    ///
484    /// # Safety
485    /// The caller must ensure the dtypes of the chunks are correct
486    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
487        Self::new_with_compute_len(self.field.clone(), chunks)
488    }
489
490    /// Get data type of [`ChunkedArray`].
491    pub fn dtype(&self) -> &DataType {
492        self.field.dtype()
493    }
494
495    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
496        self.field = Arc::new(Field::new(self.name().clone(), dtype))
497    }
498
499    /// Name of the [`ChunkedArray`].
500    pub fn name(&self) -> &PlSmallStr {
501        self.field.name()
502    }
503
504    /// Get a reference to the field.
505    pub fn ref_field(&self) -> &Field {
506        &self.field
507    }
508
509    /// Rename this [`ChunkedArray`].
510    pub fn rename(&mut self, name: PlSmallStr) {
511        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
512    }
513
514    /// Return this [`ChunkedArray`] with a new name.
515    pub fn with_name(mut self, name: PlSmallStr) -> Self {
516        self.rename(name);
517        self
518    }
519}
520
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Panics
    /// This function will panic if `idx` is out of bounds.
    #[inline]
    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
        assert!(
            chunk_idx < self.chunks().len(),
            "index: {} out of bounds for len: {}",
            idx,
            self.len()
        );
        unsafe {
            // SAFETY: `chunk_idx` was bounds-checked by the assert above.
            let arr = self.downcast_get_unchecked(chunk_idx);
            assert!(
                arr_idx < arr.len(),
                "index: {} out of bounds for len: {}",
                idx,
                self.len()
            );
            // SAFETY: `arr_idx` was bounds-checked by the assert above.
            arr.get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .value_unchecked(arr_idx)
        }
    }

    /// First element of the array; `None` indicates a null value.
    // NOTE(review): reads index 0 of the first chunk unchecked — assumes the first
    // chunk is non-empty; confirm this cannot be called on an empty array.
    #[inline]
    pub fn first(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(0);
            arr.get_unchecked(0)
        }
    }

    /// Last element of the array; `None` indicates a null value.
    // NOTE(review): only inspects the final chunk; an empty trailing chunk yields
    // `None` even if earlier chunks hold data — confirm trailing chunks are non-empty.
    #[inline]
    pub fn last(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);
            arr.get_unchecked(arr.len().checked_sub(1)?)
        }
    }

    /// Replace the validity (null mask) of this array.
    ///
    /// Values are untouched; `null_count` is recomputed from the mask and the
    /// fast-explode flag is cleared since new nulls may have been introduced.
    ///
    /// # Panics
    /// Panics if `validity.len() != self.len()`.
    pub fn set_validity(&mut self, validity: &Bitmap) {
        assert_eq!(self.len(), validity.len());
        let mut i = 0;
        for chunk in unsafe { self.chunks_mut() } {
            // Each chunk receives the slice of the mask covering its rows.
            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
            i += chunk.len();
        }
        self.null_count = validity.unset_bits();
        self.set_fast_explode_list(false);
    }
}
610
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
{
    /// Deposit values into nulls with a certain validity mask.
    ///
    /// Spreads the (all non-null) values of `self` over `validity.len()` output
    /// positions: set bits receive the values of `self` in order, unset bits
    /// become null.
    ///
    /// # Panics
    /// Panics if `self` contains nulls or `self.len() != validity.set_bits()`.
    pub fn deposit(&self, validity: &Bitmap) -> Self {
        let set_bits = validity.set_bits();

        assert_eq!(self.null_count(), 0);
        assert_eq!(self.len(), set_bits);

        // All-true mask: every value keeps its position; nothing to do.
        if set_bits == validity.len() {
            return self.clone();
        }

        // All-false mask: the result is fully null.
        if set_bits == 0 {
            return Self::full_null_like(self, validity.len());
        }

        let mut null_mask = validity.clone();

        // Build one gather index per output position. Masked-out positions reuse
        // the index of the preceding set position; the duplicate value is hidden
        // by the validity mask afterwards.
        let mut gather_idxs = Vec::with_capacity(validity.len());
        let leading_nulls = null_mask.take_leading_zeros();
        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

        // After `take_leading_zeros` the mask starts with a set bit (covered by the
        // `+ 1` above), so from the second bit on we advance the source index once
        // per set bit.
        let mut i = 0 as IdxSize;
        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
            i += IdxSize::from(v);
            i
        }));

        // SAFETY: every gather index is < self.len() by construction.
        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
        ca.set_validity(validity);
        ca
    }
}
648
impl ListChunked {
    /// Get the list at `idx` as a [`Series`]; `None` when the element is null.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            // SAFETY: the chunk we pass matches the physical inner dtype.
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }

    /// Returns whether any valid (non-null) element is an empty list.
    pub fn has_empty_lists(&self) -> bool {
        for arr in self.downcast_iter() {
            if arr.is_empty() {
                continue;
            }

            if match arr.validity() {
                // No mask: any zero-length offset range is an empty list.
                None => arr.offsets().lengths().any(|l| l == 0),
                // With a mask: only count zero-length entries that are valid.
                Some(validity) => arr
                    .offsets()
                    .lengths()
                    .enumerate()
                    // SAFETY: `i` enumerates the offsets, which match the validity length.
                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
            } {
                return true;
            }
        }

        false
    }

    /// Returns whether the values buffer holds data no element refers to: values
    /// outside the offsets' total range, or values covered by a null entry.
    pub fn has_masked_out_values(&self) -> bool {
        for arr in self.downcast_iter() {
            if arr.is_empty() {
                continue;
            }

            // Values before the first offset or after the last are unreachable.
            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
                return true;
            }

            let Some(validity) = arr.validity() else {
                continue;
            };
            // NOTE(review): when every entry is null but the offsets span a
            // non-empty values buffer, this fast path skips detection — confirm
            // all-null list arrays always have empty offset ranges here.
            if validity.set_bits() == 0 {
                continue;
            }

            // @Performance: false_idx_iter
            // Null entries with a non-zero offset range hide values.
            for i in (!validity).true_idx_iter() {
                if arr.offsets().length_at(i) > 0 {
                    return true;
                }
            }
        }

        false
    }
}
710
#[cfg(feature = "dtype-array")]
impl ArrayChunked {
    /// Get the element at `idx` as a [`Series`]; `None` when the element is null.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            // SAFETY: the chunk we pass matches the physical inner dtype.
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }

    /// Build an [`ArrayChunked`] from chunks of flattened values.
    ///
    /// Each chunk must hold a multiple of `width` values; every consecutive run
    /// of `width` values becomes one fixed-size-list element. `length` is the
    /// total element count over all chunks (checked by debug assertions only).
    pub fn from_aligned_values(
        name: PlSmallStr,
        inner_dtype: &DataType,
        width: usize,
        chunks: Vec<ArrayRef>,
        length: usize,
    ) -> Self {
        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
        let field = Arc::new(Field::new(name, dtype));
        // Width 0: elements carry no values; build a single chunk with an empty
        // values array and the requested length.
        if width == 0 {
            use arrow::array::builder::{ArrayBuilder, make_builder};
            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
            return ArrayChunked::new_with_compute_len(
                field,
                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
            );
        }
        let mut total_len = 0;
        let chunks = chunks
            .into_iter()
            .map(|chunk| {
                debug_assert_eq!(chunk.len() % width, 0);
                let chunk_len = chunk.len() / width;
                total_len += chunk_len;
                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
            })
            .collect();
        debug_assert_eq!(total_len, length);

        // SAFETY: the arrays are built without a validity mask, so null_count 0 holds.
        unsafe { Self::new_with_dims(field, chunks, length, 0) }
    }

    /// Turn the ArrayChunked into the ListChunked with the same items.
    ///
    /// This will always zero copy the values into the ListChunked.
    pub fn to_list(&self) -> ListChunked {
        let inner_dtype = self.inner_dtype();
        let chunks = self
            .downcast_iter()
            .map(|chunk| {
                use arrow::offset::OffsetsBuffer;

                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
                let dtype = inner_dtype.clone().to_large_list(true);

                // Every element has the same width, so offsets are multiples of it.
                let offsets = (0..=chunk.len())
                    .map(|i| (i * self.width()) as i64)
                    .collect::<Vec<i64>>();

                // SAFETY: We created our offsets in ascending manner.
                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };

                ListArray::<i64>::new(
                    dtype,
                    offsets,
                    chunk.values().clone(),
                    chunk.validity().cloned(),
                )
                .into_boxed()
            })
            .collect();

        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
        let mut ca = unsafe {
            ListChunked::new_with_dims(
                Arc::new(Field::new(
                    self.name().clone(),
                    DataType::List(Box::new(inner_dtype.clone())),
                )),
                chunks,
                self.len(),
                self.null_count(),
            )
        };
        // Only safe to fast-explode when no element is masked out by a null.
        ca.set_fast_explode_list(!self.has_nulls());
        ca
    }
}
803
804impl<T> ChunkedArray<T>
805where
806    T: PolarsDataType,
807{
808    /// Should be used to match the chunk_id of another [`ChunkedArray`].
809    /// # Panics
810    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
811    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
812    where
813        I: Iterator<Item = usize>,
814    {
815        debug_assert!(self.chunks.len() == 1);
816        // Takes a ChunkedArray containing a single chunk.
817        let slice = |ca: &Self| {
818            let array = &ca.chunks[0];
819
820            let mut offset = 0;
821            let chunks = chunk_id
822                .map(|len| {
823                    // SAFETY: within bounds.
824                    debug_assert!((offset + len) <= array.len());
825                    let out = unsafe { array.sliced_unchecked(offset, len) };
826                    offset += len;
827                    out
828                })
829                .collect();
830
831            debug_assert_eq!(offset, array.len());
832
833            // SAFETY: We just slice the original chunks, their type will not change.
834            unsafe {
835                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
836            }
837        };
838
839        if self.chunks.len() != 1 {
840            let out = self.rechunk();
841            slice(&out)
842        } else {
843            slice(self)
844        }
845    }
846}
847
impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    // Forward to the field's (logical) dtype.
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}
853
/// Rechunk a column and expose the start of its data buffer as a raw address.
///
/// The default implementation errors with an "operation not supported" message;
/// only the numeric implementation below returns a real pointer.
pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}
860
impl<T> AsSinglePtr for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        // Force a single contiguous chunk, then expose its data pointer as usize.
        self.rechunk_mut();
        let a = self.data_views().next().unwrap();
        let ptr = a.as_ptr();
        Ok(ptr as usize)
    }
}
872
// These types keep the default (erroring) `as_single_ptr` implementation:
// their values are not stored as one flat numeric buffer.
impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
881
/// Snapshot of a [`ChunkedArray`]'s chunk/null layout, used to select fast paths.
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    /// Exactly one chunk, no null values.
    SingleNoNull(&'a T::Array),
    /// Exactly one chunk, contains nulls.
    Single(&'a T::Array),
    /// Multiple chunks, no null values.
    MultiNoNull(&'a ChunkedArray<T>),
    /// Multiple chunks, contains nulls.
    Multi(&'a ChunkedArray<T>),
}
888
889impl<T> ChunkedArray<T>
890where
891    T: PolarsDataType,
892{
893    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
894        if self.chunks.len() == 1 {
895            let arr = self.downcast_iter().next().unwrap();
896            return if arr.null_count() == 0 {
897                ChunkedArrayLayout::SingleNoNull(arr)
898            } else {
899                ChunkedArrayLayout::Single(arr)
900            };
901        }
902
903        if self.downcast_iter().all(|a| a.null_count() == 0) {
904            ChunkedArrayLayout::MultiNoNull(self)
905        } else {
906            ChunkedArrayLayout::Multi(self)
907        }
908    }
909}
910
impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
    ///
    /// # Errors
    /// Errors unless the array is exactly one chunk with no nulls.
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    /// `None` when the array is chunked or contains nulls
    /// (`get_mut_values` may also return `None` — presumably when the buffer
    /// cannot be mutated in place; confirm against its implementation).
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY, we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

    /// Iterate all physical values, ignoring the validity mask (slots masked out
    /// by nulls may yield garbage — see [`Self::data_views`]).
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // we know the iterators len
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}
958
959impl<T: PolarsDataType> Clone for ChunkedArray<T> {
960    fn clone(&self) -> Self {
961        ChunkedArray {
962            field: self.field.clone(),
963            chunks: self.chunks.clone(),
964            flags: self.flags.clone(),
965
966            _pd: Default::default(),
967            length: self.length,
968            null_count: self.null_count,
969        }
970    }
971}
972
// Identity `AsRef`, so generic code taking `impl AsRef<ChunkedArray<T>>` can
// accept a `ChunkedArray<T>` directly.
impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}
978
979impl ValueSize for ListChunked {
980    fn get_values_size(&self) -> usize {
981        self.chunks
982            .iter()
983            .fold(0usize, |acc, arr| acc + arr.get_values_size())
984    }
985}
986
#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    /// Sum of the value-buffer sizes of all chunks.
    fn get_values_size(&self) -> usize {
        // `map(..).sum()` is the idiomatic form of the manual add-fold
        // (clippy: unnecessary_fold).
        self.chunks.iter().map(|arr| arr.get_values_size()).sum()
    }
}
995impl ValueSize for StringChunked {
996    fn get_values_size(&self) -> usize {
997        self.chunks
998            .iter()
999            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1000    }
1001}
1002
1003impl ValueSize for BinaryOffsetChunked {
1004    fn get_values_size(&self) -> usize {
1005        self.chunks
1006            .iter()
1007            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1008    }
1009}
1010
1011pub(crate) fn to_primitive<T: PolarsNumericType>(
1012    values: Vec<T::Native>,
1013    validity: Option<Bitmap>,
1014) -> PrimitiveArray<T::Native> {
1015    PrimitiveArray::new(
1016        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1017        values.into(),
1018        validity,
1019    )
1020}
1021
1022pub(crate) fn to_array<T: PolarsNumericType>(
1023    values: Vec<T::Native>,
1024    validity: Option<Bitmap>,
1025) -> ArrayRef {
1026    Box::new(to_primitive::<T>(values, validity))
1027}
1028
1029impl<T: PolarsDataType> Default for ChunkedArray<T> {
1030    fn default() -> Self {
1031        let dtype = T::get_static_dtype();
1032        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1033        ChunkedArray {
1034            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1035            // Invariant: always has 1 chunk.
1036            chunks: vec![new_empty_array(arrow_dtype)],
1037            flags: StatisticsFlagsIM::empty(),
1038
1039            _pd: Default::default(),
1040            length: 0,
1041            null_count: 0,
1042        }
1043    }
1044}
1045
#[cfg(test)]
pub(crate) mod test {
    //! Smoke tests for basic ChunkedArray operations (sort, slice, filter,
    //! arithmetic, reverse, casting, aggregation).
    use crate::prelude::*;

    // Shared fixture: single-chunk Int32 column "a" = [1, 2, 3].
    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        // Sorting must also set the sorted-ascending statistics flag.
        assert!(a.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here but still making sure the code is exercised
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", a + b);
        println!("{:?}", a - b);
        println!("{:?}", a * b);
        println!("{:?}", a / b);
    }

    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // sum via the Option-yielding iterator
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
    }

    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2)
    }

    #[test]
    fn filter() {
        let a = get_chunked_array();
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120))
    }

    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2)
    }

    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64)
    }

    // Helper: assert a null-free numeric array equals the expected values.
    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        // Append creates a two-chunk array, exercising cross-chunk slicing.
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        // Negative offsets count from the end.
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        // Out-of-bounds offsets/lengths clamp rather than panic.
        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        // Nulls sort first in ascending order.
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with continuous slice
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // path with options
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        // Physical codes follow first-seen order: foo=0, bar=1, ham=2.
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}