// polars_core/chunked_array/mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::sync::Arc;
4
5use arrow::array::*;
6use arrow::bitmap::Bitmap;
7use arrow::compute::concatenate::concatenate_unchecked;
8use polars_compute::filter::filter_with_bitmap;
9
10use crate::prelude::{ChunkTakeUnchecked, *};
11
12pub mod ops;
13#[macro_use]
14pub mod arithmetic;
15pub mod builder;
16pub mod cast;
17pub mod collect;
18pub mod comparison;
19pub mod flags;
20pub mod float;
21pub mod iterator;
22#[cfg(feature = "ndarray")]
23pub(crate) mod ndarray;
24
25#[cfg(feature = "dtype-array")]
26pub(crate) mod array;
27mod binary;
28mod binary_offset;
29mod bitwise;
30#[cfg(feature = "object")]
31mod drop;
32mod from;
33mod from_iterator;
34pub mod from_iterator_par;
35pub(crate) mod list;
36pub(crate) mod logical;
37#[cfg(feature = "object")]
38pub mod object;
39#[cfg(feature = "random")]
40mod random;
41#[cfg(feature = "dtype-struct")]
42mod struct_;
43#[cfg(any(
44    feature = "temporal",
45    feature = "dtype-datetime",
46    feature = "dtype-date"
47))]
48pub mod temporal;
49mod to_vec;
50mod trusted_len;
51
52use arrow::legacy::prelude::*;
53#[cfg(feature = "dtype-struct")]
54pub use struct_::StructChunked;
55
56use self::flags::{StatisticsFlags, StatisticsFlagsIM};
57use crate::series::IsSorted;
58use crate::utils::{first_non_null, first_null, last_non_null};
59
/// Empty stand-in so code can name `RevMapping` when the
/// `dtype-categorical` feature (which supplies the real type) is disabled.
#[cfg(not(feature = "dtype-categorical"))]
pub struct RevMapping {}
62
63pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
64
65/// # ChunkedArray
66///
67/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
68/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
69/// Below we use an apply to use the cosine function to the values of a [`ChunkedArray`].
70///
71/// ```rust
72/// # use polars_core::prelude::*;
73/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
74///     ca.apply_values(|v| v.cos())
75/// }
76/// ```
77///
78/// ## Conversion between Series and ChunkedArrays
79/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
80///
81/// ```rust
82/// # use polars_core::prelude::*;
83/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
84///     series.i32()
85/// }
86///
87/// fn to_series(ca: Int32Chunked) -> Series {
88///     ca.into_series()
89/// }
90/// ```
91///
92/// # Iterators
93///
94/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
95/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
96/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
97///
98/// ```rust
99/// # use polars_core::prelude::*;
100///
101/// fn iter_forward(ca: &Float32Chunked) {
102///     ca.iter()
103///         .for_each(|opt_v| println!("{:?}", opt_v))
104/// }
105///
106/// fn iter_backward(ca: &Float32Chunked) {
107///     ca.iter()
108///         .rev()
109///         .for_each(|opt_v| println!("{:?}", opt_v))
110/// }
111/// ```
112///
113/// # Memory layout
114///
115/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
116/// Arrows memory is immutable which makes it possible to make multiple zero copy (sub)-views from a single array.
117///
118/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
119/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
120///
121/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
122/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
123/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
124///
125/// If you want to have predictable performance
126/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
127/// multiple append operations.
128///
129/// See also [`ChunkedArray::extend`] for appends within a chunk.
130///
131/// # Invariants
132/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
133/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
134///   chunks.
135/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
136///   logical type given by the datatype.
137///
138/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    /// Name and (logical) datatype of this column.
    pub(crate) field: Arc<Field>,
    /// The Arrow arrays backing this column; invariant: at least one chunk.
    pub(crate) chunks: Vec<ArrayRef>,

    /// Cached statistics (sortedness, fast-explode-list); see the `flags` module.
    pub(crate) flags: StatisticsFlagsIM,

    // Total number of elements summed over all chunks.
    length: usize,
    // Total number of null values summed over all chunks.
    null_count: usize,
    // Ties the compile-time type `T` to this array; carries no runtime data.
    _pd: std::marker::PhantomData<T>,
}
149
150impl<T: PolarsDataType> ChunkedArray<T> {
151    fn should_rechunk(&self) -> bool {
152        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
153    }
154
    /// Rechunk in place when [`Self::should_rechunk`] deems the current
    /// chunking too fragmented; otherwise return `self` unchanged.
    fn optional_rechunk(mut self) -> Self {
        // Rechunk if we have many small chunks.
        if self.should_rechunk() {
            self.rechunk_mut()
        }
        self
    }
162
    /// Upcast to `&dyn Any`, enabling crate-internal runtime downcasting.
    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
        self
    }
166
    /// Series to [`ChunkedArray<T>`]
    ///
    /// Borrow `series` as a typed [`ChunkedArray<T>`] after verifying that its
    /// dtype matches `self.dtype()`.
    ///
    /// # Errors
    /// Returns a `SchemaMismatch` error when the dtypes differ.
    pub fn unpack_series_matching_type<'a>(
        &self,
        series: &'a Series,
    ) -> PolarsResult<&'a ChunkedArray<T>> {
        polars_ensure!(
            self.dtype() == series.dtype(),
            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
            series.dtype(),
            self.dtype(),
        );

        // SAFETY: dtype will be correct.
        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
    }
182
    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
    ///
    /// If you want to explicitly set the `length` and `null_count`, look at
    /// [`ChunkedArray::new_with_dims`]
    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
        // SAFETY: the placeholder dims (0, 0) are immediately corrected by
        // `compute_len`, which recomputes both length and null_count.
        unsafe {
            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
            chunked_arr.compute_len();
            chunked_arr
        }
    }
194
    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
    ///
    /// # Safety
    /// The length and null_count must be correct.
    pub unsafe fn new_with_dims(
        field: Arc<Field>,
        chunks: Vec<ArrayRef>,
        length: usize,
        null_count: usize,
    ) -> Self {
        Self {
            field,
            chunks,
            // A fresh array starts with no cached statistics.
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length,
            null_count,
        }
    }
214
    /// Whether the sorted-ascending statistics flag is set.
    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
        self.get_flags().is_sorted_ascending()
    }

    /// Whether the sorted-descending statistics flag is set.
    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
        self.get_flags().is_sorted_descending()
    }

    /// Whether `self` is sorted in any direction.
    pub(crate) fn is_sorted_any(&self) -> bool {
        self.get_flags().is_sorted_any()
    }

    /// Clear the fast-explode-list flag.
    pub fn unset_fast_explode_list(&mut self) {
        self.set_fast_explode_list(false)
    }

    /// Set or clear the fast-explode-list flag.
    pub fn set_fast_explode_list(&mut self, value: bool) {
        let mut flags = self.flags.get_mut();
        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
        self.flags.set_mut(flags);
    }

    /// Whether the fast-explode-list flag is set.
    pub fn get_fast_explode_list(&self) -> bool {
        self.get_flags().can_fast_explode_list()
    }

    /// Get the current statistics flags.
    pub fn get_flags(&self) -> StatisticsFlags {
        self.flags.get()
    }

    /// Set flags for the [`ChunkedArray`]
    pub fn set_flags(&mut self, flags: StatisticsFlags) {
        self.flags = StatisticsFlagsIM::new(flags);
    }

    /// The sortedness state recorded in the statistics flags.
    pub fn is_sorted_flag(&self) -> IsSorted {
        self.get_flags().is_sorted()
    }

    /// Copy onto `self` the subset `retain_flags` of the flags set on `from`.
    pub fn retain_flags_from<U: PolarsDataType>(
        &mut self,
        from: &ChunkedArray<U>,
        retain_flags: StatisticsFlags,
    ) {
        let flags = from.flags.get();
        // Try to avoid write contention.
        if !flags.is_empty() {
            self.set_flags(flags & retain_flags)
        }
    }

    /// Set the 'sorted' bit meta info.
    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
        let mut flags = self.flags.get_mut();
        flags.set_sorted(sorted);
        self.flags.set_mut(flags);
    }

    /// Set the 'sorted' bit meta info.
    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
        let mut out = self.clone();
        out.set_sorted_flag(sorted);
        out
    }
280
281    pub fn first_null(&self) -> Option<usize> {
282        if self.null_count() == 0 {
283            None
284        }
285        // We now know there is at least 1 non-null item in the array, and self.len() > 0
286        else if self.null_count() == self.len() {
287            Some(0)
288        } else if self.is_sorted_any() {
289            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
290                // nulls are all at the start
291                0
292            } else {
293                // nulls are all at the end
294                self.null_count()
295            };
296
297            debug_assert!(
298                // If we are lucky this catches something.
299                unsafe { self.get_unchecked(out) }.is_some(),
300                "incorrect sorted flag"
301            );
302
303            Some(out)
304        } else {
305            first_null(self.chunks().iter().map(|arr| arr.as_ref()))
306        }
307    }
308
    /// Get the index of the first non null value in this [`ChunkedArray`].
    pub fn first_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(0)
        } else if self.is_sorted_any() {
            // Sorted arrays keep their nulls grouped at one end; probing the
            // first element tells us which end.
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.null_count()
            } else {
                // nulls are all at the end
                0
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            // Unsorted: linear scan over the validity bitmaps.
            first_non_null(self.chunks().iter().map(|arr| arr.as_ref()))
        }
    }
337
    /// Get the index of the last non null value in this [`ChunkedArray`].
    pub fn last_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(self.len() - 1)
        } else if self.is_sorted_any() {
            // Sorted arrays keep their nulls grouped at one end; probing the
            // first element tells us which end.
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.len() - 1
            } else {
                // nulls are all at the end
                self.len() - self.null_count() - 1
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            // Unsorted: linear scan over the validity bitmaps.
            last_non_null(self.chunks().iter().map(|arr| arr.as_ref()), self.len())
        }
    }
366
    /// Return a new [`ChunkedArray`] with all null values removed.
    pub fn drop_nulls(&self) -> Self {
        if self.null_count() == 0 {
            self.clone()
        } else {
            let chunks = self
                .downcast_iter()
                .map(|arr| {
                    if arr.null_count() == 0 {
                        // Chunk has no nulls: reuse it as-is.
                        arr.to_boxed()
                    } else {
                        // Keep exactly the values marked valid in the bitmap.
                        filter_with_bitmap(arr, arr.validity().unwrap())
                    }
                })
                .collect();
            // SAFETY: after filtering, the length is the previous non-null
            // count and no nulls remain.
            unsafe {
                Self::new_with_dims(
                    self.field.clone(),
                    chunks,
                    self.len() - self.null_count(),
                    0,
                )
            }
        }
    }
391
    /// Get the buffer of bits representing null values
    #[inline]
    #[allow(clippy::type_complexity)]
    pub fn iter_validities(
        &self,
    ) -> impl ExactSizeIterator<Item = Option<&Bitmap>> + DoubleEndedIterator {
        // NOTE(review): a named fn is used instead of a closure — presumably
        // to help the compiler infer the borrowed return lifetime; confirm
        // before converting to a closure.
        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
            arr.validity()
        }
        self.chunks.iter().map(to_validity)
    }
403
    #[inline]
    /// Return if any the chunks in this [`ChunkedArray`] have nulls.
    pub fn has_nulls(&self) -> bool {
        self.null_count > 0
    }

    /// Shrink the capacity of this array to fit its length.
    pub fn shrink_to_fit(&mut self) {
        // Concatenating into a single fresh chunk drops any excess capacity
        // held by the individual chunks.
        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
    }
414
    /// Return an empty [`ChunkedArray`] with this array's name and dtype,
    /// carrying over the sortedness and fast-explode statistics flags.
    pub fn clear(&self) -> Self {
        // SAFETY: we keep the correct dtype
        let mut ca = unsafe {
            self.copy_with_chunks(vec![new_empty_array(
                self.chunks.first().unwrap().dtype().clone(),
            )])
        };

        // An empty array trivially keeps these statistics valid.
        use StatisticsFlags as F;
        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
        ca
    }
427
    /// Unpack a [`Series`] to the same physical type.
    ///
    /// # Safety
    ///
    /// This is unsafe as the dtype may be incorrect and
    /// is assumed to be correct in other safe code.
    ///
    /// # Panics
    /// Panics when the dtypes do not share a physical representation.
    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
        &self,
        series: &'a Series,
    ) -> &'a ChunkedArray<T> {
        let series_trait = &**series;
        if self.dtype() == series.dtype() {
            // Same logical dtype: reinterpret the trait object directly.
            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
        } else {
            use DataType::*;
            match (self.dtype(), series.dtype()) {
                // These logical temporal types share their integer physical
                // layout, so the reinterpret is still permitted.
                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
                },
                _ => panic!(
                    "cannot unpack series {:?} into matching type {:?}",
                    series,
                    self.dtype()
                ),
            }
        }
    }
455
    /// Returns an iterator over the lengths of the chunks of the array.
    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
        self.chunks.iter().map(|chunk| chunk.len())
    }

    /// A reference to the chunks
    #[inline]
    pub fn chunks(&self) -> &Vec<ArrayRef> {
        &self.chunks
    }

    /// A mutable reference to the chunks
    ///
    /// # Safety
    /// The caller must ensure to not change the [`DataType`] or `length` of any of the chunks.
    /// And the `null_count` remains correct.
    #[inline]
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        &mut self.chunks
    }

    /// Returns true if contains a single chunk and has no null values
    pub fn is_optimal_aligned(&self) -> bool {
        self.chunks.len() == 1 && self.null_count() == 0
    }
481
    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
    ///
    /// # Safety
    /// The caller must ensure the dtypes of the chunks are correct
    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
        Self::new_with_compute_len(self.field.clone(), chunks)
    }

    /// Get data type of [`ChunkedArray`].
    pub fn dtype(&self) -> &DataType {
        self.field.dtype()
    }

    /// Replace the dtype stored in the field, keeping the name.
    ///
    /// # Safety
    /// The caller must ensure the new dtype is compatible with the chunks.
    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
        self.field = Arc::new(Field::new(self.name().clone(), dtype))
    }

    /// Name of the [`ChunkedArray`].
    pub fn name(&self) -> &PlSmallStr {
        self.field.name()
    }

    /// Get a reference to the field.
    pub fn ref_field(&self) -> &Field {
        &self.field
    }

    /// Rename this [`ChunkedArray`].
    pub fn rename(&mut self, name: PlSmallStr) {
        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
    }

    /// Return this [`ChunkedArray`] with a new name.
    pub fn with_name(mut self, name: PlSmallStr) -> Self {
        self.rename(name);
        self
    }
519}
520
521impl<T> ChunkedArray<T>
522where
523    T: PolarsDataType,
524{
    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Panics
    /// This function will panic if `idx` is out of bounds.
    #[inline]
    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
        // Map the flat index to (chunk index, index within chunk).
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
        // Both assertions below validate the bounds that make the unchecked
        // accesses sound.
        assert!(
            chunk_idx < self.chunks().len(),
            "index: {} out of bounds for len: {}",
            idx,
            self.len()
        );
        unsafe {
            let arr = self.downcast_get_unchecked(chunk_idx);
            assert!(
                arr_idx < arr.len(),
                "index: {} out of bounds for len: {}",
                idx,
                self.len()
            );
            arr.get_unchecked(arr_idx)
        }
    }
550
    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
    /// indicates a NULL value.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
        // Map the flat index to (chunk index, index within chunk).
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .get_unchecked(arr_idx)
        }
    }
566
    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
    ///
    /// # Safety
    /// It is the callers responsibility that the `idx < self.len()`.
    #[inline]
    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
        // Map the flat index to (chunk index, index within chunk).
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .value_unchecked(arr_idx)
        }
    }
582
    /// Get the first value (which may be null).
    ///
    /// # Panics
    /// Panics if the [`ChunkedArray`] is empty.
    #[inline]
    pub fn first(&self) -> Option<T::Physical<'_>> {
        self.iter().next().unwrap()
    }

    /// Get the last value (which may be null).
    ///
    /// # Panics
    /// Panics if the [`ChunkedArray`] is empty.
    #[inline]
    pub fn last(&self) -> Option<T::Physical<'_>> {
        // Skip trailing empty chunks; unwrap panics when all chunks are empty.
        let arr = self
            .downcast_iter()
            .rev()
            .find(|arr| !arr.is_empty())
            .unwrap();
        // SAFETY: the found chunk is non-empty, so `len - 1` is in bounds.
        unsafe { arr.get_unchecked(arr.len() - 1) }
    }
601
    /// Overwrite the validity (null) mask of this array.
    ///
    /// # Panics
    /// Panics if `validity` is not exactly as long as `self`.
    pub fn set_validity(&mut self, validity: &Bitmap) {
        assert_eq!(self.len(), validity.len());
        let mut i = 0;
        // Slice the bitmap to each chunk's extent.
        for chunk in unsafe { self.chunks_mut() } {
            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
            i += chunk.len();
        }
        self.null_count = validity.unset_bits();
        // A new mask may introduce nulls, so the fast-explode-list flag can no
        // longer be trusted.
        self.set_fast_explode_list(false);
    }
612}
613
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
{
    /// Deposit values into nulls with a certain validity mask.
    ///
    /// `self` must be null-free and hold exactly as many values as `validity`
    /// has set bits. The result has `validity.len()` elements: each set bit
    /// carries the next value of `self` in order, each unset bit is null.
    ///
    /// # Panics
    /// Panics if `self` contains nulls or `self.len() != validity.set_bits()`.
    pub fn deposit(&self, validity: &Bitmap) -> Self {
        let set_bits = validity.set_bits();

        assert_eq!(self.null_count(), 0);
        assert_eq!(self.len(), set_bits);

        if set_bits == validity.len() {
            // No nulls to insert.
            return self.clone();
        }

        if set_bits == 0 {
            // Everything masked out.
            return Self::full_null_like(self, validity.len());
        }

        let mut null_mask = validity.clone();

        // Build one gather index per output slot. Null slots repeat the
        // previous index (arbitrary but in-bounds); they are masked out by
        // `set_validity` below.
        let mut gather_idxs = Vec::with_capacity(validity.len());
        let leading_nulls = null_mask.take_leading_zeros();
        // Leading nulls plus the first set bit all map to value index 0.
        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

        let mut i = 0 as IdxSize;
        // Each subsequent set bit advances to the next value.
        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
            i += IdxSize::from(v);
            i
        }));

        // SAFETY: by construction the indices are at most `set_bits - 1`,
        // i.e. `self.len() - 1`.
        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
        ca.set_validity(validity);
        ca
    }
}
651
652impl ListChunked {
    /// Get the list at `idx` as a new [`Series`], or `None` when it is null.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        // SAFETY: the chunk comes from `self`, so it matches the inner
        // physical dtype.
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
663
    /// Whether any non-null list in this array has length zero.
    pub fn has_empty_lists(&self) -> bool {
        for arr in self.downcast_iter() {
            if arr.is_empty() {
                continue;
            }

            if match arr.validity() {
                // No validity mask: every zero-length offset range counts.
                None => arr.offsets().lengths().any(|l| l == 0),
                // Only count empty lists that are valid (i.e. not null).
                Some(validity) => arr
                    .offsets()
                    .lengths()
                    .enumerate()
                    // SAFETY: `i` enumerates the offsets, which match the
                    // validity length.
                    .any(|(i, l)| l == 0 && unsafe { validity.get_bit_unchecked(i) }),
            } {
                return true;
            }
        }

        false
    }
684
685    pub fn has_masked_out_values(&self) -> bool {
686        for arr in self.downcast_iter() {
687            if arr.is_empty() {
688                continue;
689            }
690
691            if *arr.offsets().first() != 0 || *arr.offsets().last() != arr.values().len() as i64 {
692                return true;
693            }
694
695            let Some(validity) = arr.validity() else {
696                continue;
697            };
698            if validity.set_bits() == 0 {
699                continue;
700            }
701
702            // @Performance: false_idx_iter
703            for i in (!validity).true_idx_iter() {
704                if arr.offsets().length_at(i) > 0 {
705                    return true;
706                }
707            }
708        }
709
710        false
711    }
712}
713
714#[cfg(feature = "dtype-array")]
715impl ArrayChunked {
    /// Get the fixed-size array at `idx` as a new [`Series`], or `None` when
    /// it is null.
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        // SAFETY: the chunk comes from `self`, so it matches the inner
        // physical dtype.
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
726
    /// Build an [`ArrayChunked`] of `width`-sized arrays from chunks of flat,
    /// aligned values.
    ///
    /// Each chunk's length must be a multiple of `width`, and the per-chunk
    /// lengths divided by `width` must sum to `length` (checked in debug
    /// builds only).
    pub fn from_aligned_values(
        name: PlSmallStr,
        inner_dtype: &DataType,
        width: usize,
        chunks: Vec<ArrayRef>,
        length: usize,
    ) -> Self {
        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
        let field = Arc::new(Field::new(name, dtype));
        if width == 0 {
            // Zero-width arrays carry no values: build an empty values array
            // and let the FixedSizeListArray only track the length.
            use arrow::array::builder::{ArrayBuilder, make_builder};
            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
            return ArrayChunked::new_with_compute_len(
                field,
                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
            );
        }
        let mut total_len = 0;
        let chunks = chunks
            .into_iter()
            .map(|chunk| {
                debug_assert_eq!(chunk.len() % width, 0);
                let chunk_len = chunk.len() / width;
                total_len += chunk_len;
                FixedSizeListArray::new(arrow_dtype.clone(), chunk_len, chunk, None).into_boxed()
            })
            .collect();
        debug_assert_eq!(total_len, length);

        // SAFETY: lengths are derived above and no validity mask is attached,
        // so the null count is 0.
        unsafe { Self::new_with_dims(field, chunks, length, 0) }
    }
759
    /// Turn the ArrayChunked into the ListChunked with the same items.
    ///
    /// This will always zero copy the values into the ListChunked.
    pub fn to_list(&self) -> ListChunked {
        let inner_dtype = self.inner_dtype();
        let chunks = self
            .downcast_iter()
            .map(|chunk| {
                use arrow::offset::OffsetsBuffer;

                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
                let dtype = inner_dtype.clone().to_large_list(true);

                // Fixed-width arrays translate to lists with evenly spaced
                // offsets: 0, width, 2*width, ...
                let offsets = (0..=chunk.len())
                    .map(|i| (i * self.width()) as i64)
                    .collect::<Vec<i64>>();

                // SAFETY: We created our offsets in ascending manner.
                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };

                // Values and validity are shared, not copied.
                ListArray::<i64>::new(
                    dtype,
                    offsets,
                    chunk.values().clone(),
                    chunk.validity().cloned(),
                )
                .into_boxed()
            })
            .collect();

        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
        let mut ca = unsafe {
            ListChunked::new_with_dims(
                Arc::new(Field::new(
                    self.name().clone(),
                    DataType::List(Box::new(inner_dtype.clone())),
                )),
                chunks,
                self.len(),
                self.null_count(),
            )
        };
        // The fast-explode-list flag is only set for null-free arrays.
        ca.set_fast_explode_list(!self.has_nulls());
        ca
    }
805}
806
impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Should be used to match the chunk_id of another [`ChunkedArray`].
    ///
    /// Re-slices `self` into chunks whose lengths are given by `chunk_id`;
    /// the lengths must sum to `self.len()` (checked in debug builds).
    /// # Panics
    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
    where
        I: Iterator<Item = usize>,
    {
        // NOTE(review): this assert expects a single chunk, yet the branch at
        // the bottom still handles the multi-chunk case by rechunking first —
        // confirm which contract is intended.
        debug_assert!(self.chunks.len() == 1);
        // Takes a ChunkedArray containing a single chunk.
        let slice = |ca: &Self| {
            let array = &ca.chunks[0];

            let mut offset = 0;
            let chunks = chunk_id
                .map(|len| {
                    // SAFETY: within bounds.
                    debug_assert!((offset + len) <= array.len());
                    let out = unsafe { array.sliced_unchecked(offset, len) };
                    offset += len;
                    out
                })
                .collect();

            debug_assert_eq!(offset, array.len());

            // SAFETY: We just slice the original chunks, their type will not change.
            unsafe {
                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
            }
        };

        if self.chunks.len() != 1 {
            let out = self.rechunk();
            slice(&out)
        } else {
            slice(self)
        }
    }
}
850
impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    // Delegates straight to the stored field's dtype.
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}
856
pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        // Default: this operation is not supported for the implementing dtype.
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}
863
impl<T> AsSinglePtr for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        // Rechunk first so there is a single contiguous values buffer.
        self.rechunk_mut();
        let a = self.data_views().next().unwrap();
        let ptr = a.as_ptr();
        // Returned as usize so it can be handed around without a lifetime.
        Ok(ptr as usize)
    }
}
875
// These dtypes keep the default implementation, which returns an error:
// their values are not a single flat numeric buffer.
impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
884
/// Classification of a [`ChunkedArray`]'s physical layout
/// (single vs. multiple chunks, with or without nulls), as produced by
/// [`ChunkedArray::layout`].
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    /// One chunk, no null values.
    SingleNoNull(&'a T::Array),
    /// One chunk, possibly containing null values.
    Single(&'a T::Array),
    /// Multiple chunks, no null values.
    MultiNoNull(&'a ChunkedArray<T>),
    /// Multiple chunks, possibly containing null values.
    Multi(&'a ChunkedArray<T>),
}
891
892impl<T> ChunkedArray<T>
893where
894    T: PolarsDataType,
895{
896    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
897        if self.chunks.len() == 1 {
898            let arr = self.downcast_iter().next().unwrap();
899            return if arr.null_count() == 0 {
900                ChunkedArrayLayout::SingleNoNull(arr)
901            } else {
902                ChunkedArrayLayout::Single(arr)
903            };
904        }
905
906        if self.downcast_iter().all(|a| a.null_count() == 0) {
907            ChunkedArrayLayout::MultiNoNull(self)
908        } else {
909            ChunkedArrayLayout::Multi(self)
910        }
911    }
912}
913
impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
    ///
    /// # Errors
    /// Returns a `ComputeError` when the array has more than one chunk or
    /// contains null values.
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    ///
    /// Returns `None` when the array is chunked or has nulls, or when
    /// `get_mut_values` cannot hand out a mutable view (presumably a shared
    /// buffer — confirm against its implementation).
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY, we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

    /// Iterate all values while ignoring the validity mask; slots that are
    /// masked out yield whatever value the buffer holds.
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // we know the iterators len
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}
961
962impl<T: PolarsDataType> Clone for ChunkedArray<T> {
963    fn clone(&self) -> Self {
964        ChunkedArray {
965            field: self.field.clone(),
966            chunks: self.chunks.clone(),
967            flags: self.flags.clone(),
968
969            _pd: Default::default(),
970            length: self.length,
971            null_count: self.null_count,
972        }
973    }
974}
975
// Identity impl: lets generic code take `impl AsRef<ChunkedArray<T>>` and
// accept either a `ChunkedArray<T>` or a reference to one.
impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}
981
982impl ValueSize for ListChunked {
983    fn get_values_size(&self) -> usize {
984        self.chunks
985            .iter()
986            .fold(0usize, |acc, arr| acc + arr.get_values_size())
987    }
988}
989
#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    /// Total value-buffer size: the per-chunk sizes summed over all chunks.
    fn get_values_size(&self) -> usize {
        self.chunks.iter().map(|arr| arr.get_values_size()).sum()
    }
}
998impl ValueSize for StringChunked {
999    fn get_values_size(&self) -> usize {
1000        self.chunks
1001            .iter()
1002            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1003    }
1004}
1005
1006impl ValueSize for BinaryOffsetChunked {
1007    fn get_values_size(&self) -> usize {
1008        self.chunks
1009            .iter()
1010            .fold(0usize, |acc, arr| acc + arr.get_values_size())
1011    }
1012}
1013
1014pub(crate) fn to_primitive<T: PolarsNumericType>(
1015    values: Vec<T::Native>,
1016    validity: Option<Bitmap>,
1017) -> PrimitiveArray<T::Native> {
1018    PrimitiveArray::new(
1019        T::get_static_dtype().to_arrow(CompatLevel::newest()),
1020        values.into(),
1021        validity,
1022    )
1023}
1024
1025pub(crate) fn to_array<T: PolarsNumericType>(
1026    values: Vec<T::Native>,
1027    validity: Option<Bitmap>,
1028) -> ArrayRef {
1029    Box::new(to_primitive::<T>(values, validity))
1030}
1031
1032impl<T: PolarsDataType> Default for ChunkedArray<T> {
1033    fn default() -> Self {
1034        let dtype = T::get_static_dtype();
1035        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
1036        ChunkedArray {
1037            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
1038            // Invariant: always has 1 chunk.
1039            chunks: vec![new_empty_array(arrow_dtype)],
1040            flags: StatisticsFlagsIM::empty(),
1041
1042            _pd: Default::default(),
1043            length: 0,
1044            null_count: 0,
1045        }
1046    }
1047}
1048
#[cfg(test)]
pub(crate) mod test {
    //! Smoke tests for the core `ChunkedArray` operations (sort, arithmetic,
    //! filter, slice, reverse, …). Heavier coverage lives in the integration
    //! test suite and the Python bindings.
    use crate::prelude::*;

    // Shared fixture: a 3-element Int32 array named "a".
    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        // Sorting must also set the ascending-sorted statistics flag.
        assert!(a.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here but still making sure the code is exercised
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", a + b);
        println!("{:?}", a - b);
        println!("{:?}", a * b);
        println!("{:?}", a / b);
    }

    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // sum
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
    }

    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2)
    }

    #[test]
    fn filter() {
        let a = get_chunked_array();
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120))
    }

    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2)
    }

    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64)
    }

    // Helper: assert a null-free numeric array equals the expected slice.
    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        // Append creates a 2-chunk array so slicing across chunk boundaries
        // is exercised too.
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        // Negative offsets count from the end.
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        // Out-of-bounds requests clamp rather than panic.
        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        // Nulls sort first in ascending order.
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with continuous slice
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // path with options
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        // Physical codes are assigned in first-seen order; None stays None.
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        // A builder with capacity 2048 but a single value should shrink.
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}