polars_core/chunked_array/
mod.rs

1//! The typed heart of every Series column.
2#![allow(unsafe_op_in_unsafe_fn)]
3use std::iter::Map;
4use std::sync::Arc;
5
6use arrow::array::*;
7use arrow::bitmap::Bitmap;
8use arrow::compute::concatenate::concatenate_unchecked;
9use polars_compute::filter::filter_with_bitmap;
10
11use crate::prelude::{ChunkTakeUnchecked, *};
12
13pub mod ops;
14#[macro_use]
15pub mod arithmetic;
16pub mod builder;
17pub mod cast;
18pub mod collect;
19pub mod comparison;
20pub mod flags;
21pub mod float;
22pub mod iterator;
23#[cfg(feature = "ndarray")]
24pub(crate) mod ndarray;
25
26#[cfg(feature = "dtype-array")]
27pub(crate) mod array;
28mod binary;
29mod binary_offset;
30mod bitwise;
31#[cfg(feature = "object")]
32mod drop;
33mod from;
34mod from_iterator;
35pub mod from_iterator_par;
36pub(crate) mod list;
37pub(crate) mod logical;
38#[cfg(feature = "object")]
39pub mod object;
40#[cfg(feature = "random")]
41mod random;
42#[cfg(feature = "dtype-struct")]
43mod struct_;
44#[cfg(any(
45    feature = "temporal",
46    feature = "dtype-datetime",
47    feature = "dtype-date"
48))]
49pub mod temporal;
50mod to_vec;
51mod trusted_len;
52
53use std::slice::Iter;
54
55use arrow::legacy::prelude::*;
56#[cfg(feature = "dtype-struct")]
57pub use struct_::StructChunked;
58
59use self::flags::{StatisticsFlags, StatisticsFlagsIM};
60use crate::series::IsSorted;
61use crate::utils::{first_non_null, last_non_null};
62
#[cfg(not(feature = "dtype-categorical"))]
/// Empty stand-in type that is only compiled when the `dtype-categorical` feature is disabled.
///
/// NOTE(review): presumably a full `RevMapping` is provided elsewhere when the feature is
/// enabled — confirm.
pub struct RevMapping {}
65
/// Iterator over the length of every chunk, as returned by [`ChunkedArray::chunk_lengths`].
pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;
67
68/// # ChunkedArray
69///
70/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
71/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
/// Below we use `apply_values` to apply the cosine function to the values of a [`ChunkedArray`].
73///
74/// ```rust
75/// # use polars_core::prelude::*;
76/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
77///     ca.apply_values(|v| v.cos())
78/// }
79/// ```
80///
81/// ## Conversion between Series and ChunkedArrays
82/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
83///
84/// ```rust
85/// # use polars_core::prelude::*;
86/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked>{
87///     series.i32()
88/// }
89///
90/// fn to_series(ca: Int32Chunked) -> Series {
91///     ca.into_series()
92/// }
93/// ```
94///
95/// # Iterators
96///
97/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
98/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
99/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
100///
101/// ```rust
102/// # use polars_core::prelude::*;
103///
104/// fn iter_forward(ca: &Float32Chunked) {
105///     ca.iter()
106///         .for_each(|opt_v| println!("{:?}", opt_v))
107/// }
108///
109/// fn iter_backward(ca: &Float32Chunked) {
110///     ca.iter()
111///         .rev()
112///         .for_each(|opt_v| println!("{:?}", opt_v))
113/// }
114/// ```
115///
116/// # Memory layout
117///
118/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
/// Arrow's memory is immutable, which makes it possible to make multiple zero-copy (sub)-views from a single array.
120///
121/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
122/// Appends are cheap, because it will not lead to a full reallocation of the whole array (as could be the case with a Rust Vec).
123///
124/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access because we have an extra indirection
125/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
126/// When multiplying two [`ChunkedArray`]s with different chunk sizes they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD) for instance.
127///
128/// If you want to have predictable performance
129/// (no unexpected re-allocation of memory), it is advised to call the [`ChunkedArray::rechunk`] after
130/// multiple append operations.
131///
132/// See also [`ChunkedArray::extend`] for appends within a chunk.
133///
134/// # Invariants
135/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
136/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
137///   chunks.
138/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
139///   logical type given by the datatype.
140///
141/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    /// Name and [`DataType`] of this array.
    pub(crate) field: Arc<Field>,
    /// The Arrow arrays that together hold the data.
    pub(crate) chunks: Vec<ArrayRef>,

    /// Statistics flags, such as sortedness and whether a list can be fast-exploded.
    pub(crate) flags: StatisticsFlagsIM,

    /// Cached total number of values over all chunks.
    length: usize,
    /// Cached total number of null values over all chunks.
    null_count: usize,
    /// Ties the otherwise unused type parameter `T` to the struct.
    _pd: std::marker::PhantomData<T>,
}
152
153impl<T: PolarsDataType> ChunkedArray<T> {
154    fn should_rechunk(&self) -> bool {
155        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
156    }
157
158    fn optional_rechunk(mut self) -> Self {
159        // Rechunk if we have many small chunks.
160        if self.should_rechunk() {
161            self.rechunk_mut()
162        }
163        self
164    }
165
    /// View this [`ChunkedArray`] as a `dyn Any`, e.g. to allow downcasting to a concrete type.
    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
        self
    }
169
170    /// Series to [`ChunkedArray<T>`]
171    pub fn unpack_series_matching_type<'a>(
172        &self,
173        series: &'a Series,
174    ) -> PolarsResult<&'a ChunkedArray<T>> {
175        polars_ensure!(
176            self.dtype() == series.dtype(),
177            SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
178            series.dtype(),
179            self.dtype(),
180        );
181
182        // SAFETY: dtype will be correct.
183        Ok(unsafe { self.unpack_series_matching_physical_type(series) })
184    }
185
186    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
187    ///
188    /// If you want to explicitly the `length` and `null_count`, look at
189    /// [`ChunkedArray::new_with_dims`]
190    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
191        unsafe {
192            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
193            chunked_arr.compute_len();
194            chunked_arr
195        }
196    }
197
198    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
199    /// # Safety
200    /// The length and null_count must be correct.
201    pub unsafe fn new_with_dims(
202        field: Arc<Field>,
203        chunks: Vec<ArrayRef>,
204        length: usize,
205        null_count: usize,
206    ) -> Self {
207        Self {
208            field,
209            chunks,
210            flags: StatisticsFlagsIM::empty(),
211
212            _pd: Default::default(),
213            length,
214            null_count,
215        }
216    }
217
218    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
219        self.get_flags().is_sorted_ascending()
220    }
221
222    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
223        self.get_flags().is_sorted_descending()
224    }
225
226    /// Whether `self` is sorted in any direction.
227    pub(crate) fn is_sorted_any(&self) -> bool {
228        self.get_flags().is_sorted_any()
229    }
230
    /// Clear the `CAN_FAST_EXPLODE_LIST` flag; shorthand for `set_fast_explode_list(false)`.
    pub fn unset_fast_explode_list(&mut self) {
        self.set_fast_explode_list(false)
    }
234
    /// Set or clear the `CAN_FAST_EXPLODE_LIST` flag, leaving all other flags as they are.
    pub fn set_fast_explode_list(&mut self, value: bool) {
        // Read the current flags, toggle the single bit and write the result back.
        let mut flags = self.flags.get_mut();
        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
        self.flags.set_mut(flags);
    }
240
241    pub fn get_fast_explode_list(&self) -> bool {
242        self.get_flags().can_fast_explode_list()
243    }
244
    /// Get the current statistics flags of this [`ChunkedArray`].
    pub fn get_flags(&self) -> StatisticsFlags {
        self.flags.get()
    }
248
    /// Replace all statistics flags of this [`ChunkedArray`] with `flags`.
    pub fn set_flags(&mut self, flags: StatisticsFlags) {
        self.flags = StatisticsFlagsIM::new(flags);
    }
253
254    pub fn is_sorted_flag(&self) -> IsSorted {
255        self.get_flags().is_sorted()
256    }
257
258    pub fn retain_flags_from<U: PolarsDataType>(
259        &mut self,
260        from: &ChunkedArray<U>,
261        retain_flags: StatisticsFlags,
262    ) {
263        let flags = from.flags.get();
264        // Try to avoid write contention.
265        if !flags.is_empty() {
266            self.set_flags(flags & retain_flags)
267        }
268    }
269
    /// Set the 'sorted' bit meta info, leaving all other flags as they are.
    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
        // Read the current flags, update the sortedness and write the result back.
        let mut flags = self.flags.get_mut();
        flags.set_sorted(sorted);
        self.flags.set_mut(flags);
    }
276
    /// Return a clone of this [`ChunkedArray`] whose 'sorted' bit meta info is set to `sorted`.
    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
        let mut out = self.clone();
        out.set_sorted_flag(sorted);
        out
    }
283
284    /// Get the index of the first non null value in this [`ChunkedArray`].
285    pub fn first_non_null(&self) -> Option<usize> {
286        if self.null_count() == self.len() {
287            None
288        }
289        // We now know there is at least 1 non-null item in the array, and self.len() > 0
290        else if self.null_count() == 0 {
291            Some(0)
292        } else if self.is_sorted_any() {
293            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
294                // nulls are all at the start
295                self.null_count()
296            } else {
297                // nulls are all at the end
298                0
299            };
300
301            debug_assert!(
302                // If we are lucky this catches something.
303                unsafe { self.get_unchecked(out) }.is_some(),
304                "incorrect sorted flag"
305            );
306
307            Some(out)
308        } else {
309            first_non_null(self.iter_validities())
310        }
311    }
312
313    /// Get the index of the last non null value in this [`ChunkedArray`].
314    pub fn last_non_null(&self) -> Option<usize> {
315        if self.null_count() == self.len() {
316            None
317        }
318        // We now know there is at least 1 non-null item in the array, and self.len() > 0
319        else if self.null_count() == 0 {
320            Some(self.len() - 1)
321        } else if self.is_sorted_any() {
322            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
323                // nulls are all at the start
324                self.len() - 1
325            } else {
326                // nulls are all at the end
327                self.len() - self.null_count() - 1
328            };
329
330            debug_assert!(
331                // If we are lucky this catches something.
332                unsafe { self.get_unchecked(out) }.is_some(),
333                "incorrect sorted flag"
334            );
335
336            Some(out)
337        } else {
338            last_non_null(self.iter_validities(), self.len())
339        }
340    }
341
342    pub fn drop_nulls(&self) -> Self {
343        if self.null_count() == 0 {
344            self.clone()
345        } else {
346            let chunks = self
347                .downcast_iter()
348                .map(|arr| {
349                    if arr.null_count() == 0 {
350                        arr.to_boxed()
351                    } else {
352                        filter_with_bitmap(arr, arr.validity().unwrap())
353                    }
354                })
355                .collect();
356            unsafe {
357                Self::new_with_dims(
358                    self.field.clone(),
359                    chunks,
360                    self.len() - self.null_count(),
361                    0,
362                )
363            }
364        }
365    }
366
    /// Iterate over the validity bitmap of every chunk.
    ///
    /// The iterator yields one item per chunk: the chunk's validity [`Bitmap`], or `None` when
    /// the chunk has none.
    #[inline]
    #[allow(clippy::type_complexity)]
    pub fn iter_validities(&self) -> Map<Iter<'_, ArrayRef>, fn(&ArrayRef) -> Option<&Bitmap>> {
        // A named function (rather than a closure) so the return type can be spelled out.
        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
            arr.validity()
        }
        self.chunks.iter().map(to_validity)
    }
376
    #[inline]
    /// Returns `true` if any of the chunks in this [`ChunkedArray`] has nulls.
    ///
    /// This reads the cached null count; the chunks themselves are not inspected.
    pub fn has_nulls(&self) -> bool {
        self.null_count > 0
    }
382
383    /// Shrink the capacity of this array to fit its length.
384    pub fn shrink_to_fit(&mut self) {
385        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
386    }
387
388    pub fn clear(&self) -> Self {
389        // SAFETY: we keep the correct dtype
390        let mut ca = unsafe {
391            self.copy_with_chunks(vec![new_empty_array(
392                self.chunks.first().unwrap().dtype().clone(),
393            )])
394        };
395
396        use StatisticsFlags as F;
397        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
398        ca
399    }
400
    /// Unpack a [`Series`] to the same physical type.
    ///
    /// Besides an exact dtype match, the following combinations of (`self`, `series`) dtypes
    /// are accepted: (`Int64`, `Datetime`), (`Int64`, `Duration`) and (`Int32`, `Date`).
    ///
    /// # Safety
    ///
    /// This is unsafe as the dtype may be incorrect and
    /// is assumed to be correct in other safe code.
    ///
    /// # Panics
    /// Panics for any dtype combination other than the ones listed above.
    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
        &self,
        series: &'a Series,
    ) -> &'a ChunkedArray<T> {
        let series_trait = &**series;
        if self.dtype() == series.dtype() {
            // Reinterpret the trait object behind the series as the concrete chunked array.
            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
        } else {
            use DataType::*;
            match (self.dtype(), series.dtype()) {
                // Accepted (`self`, `series`) dtype combinations besides an exact match.
                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
                },
                _ => panic!(
                    "cannot unpack series {:?} into matching type {:?}",
                    series,
                    self.dtype()
                ),
            }
        }
    }
428
    /// Returns an iterator over the lengths of the chunks of the array, in chunk order.
    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
        self.chunks.iter().map(|chunk| chunk.len())
    }
433
    /// A reference to the chunks that make up this [`ChunkedArray`].
    #[inline]
    pub fn chunks(&self) -> &Vec<ArrayRef> {
        &self.chunks
    }
439
    /// A mutable reference to the chunks that make up this [`ChunkedArray`].
    ///
    /// # Safety
    /// The caller must ensure not to change the [`DataType`] or `length` of any of the chunks,
    /// and that the `null_count` remains correct.
    #[inline]
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        &mut self.chunks
    }
449
450    /// Returns true if contains a single chunk and has no null values
451    pub fn is_optimal_aligned(&self) -> bool {
452        self.chunks.len() == 1 && self.null_count() == 0
453    }
454
    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
    ///
    /// The field is shared with `self`; the length and null count are computed from `chunks`.
    ///
    /// # Safety
    /// The caller must ensure the dtypes of the chunks are correct
    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
        Self::new_with_compute_len(self.field.clone(), chunks)
    }
462
    /// Get the data type of this [`ChunkedArray`], as stored in its field.
    pub fn dtype(&self) -> &DataType {
        self.field.dtype()
    }
467
    /// Replace the field with a new one that keeps the current name and carries `dtype`.
    ///
    /// # Safety
    /// NOTE(review): presumably the caller must ensure `dtype` is valid for the data held in
    /// the chunks, which are left untouched — confirm.
    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
        self.field = Arc::new(Field::new(self.name().clone(), dtype))
    }
471
    /// Name of the [`ChunkedArray`], as stored in its field.
    pub fn name(&self) -> &PlSmallStr {
        self.field.name()
    }
476
    /// Get a reference to the field (name and data type) of this [`ChunkedArray`].
    pub fn ref_field(&self) -> &Field {
        &self.field
    }
481
    /// Rename this [`ChunkedArray`].
    ///
    /// The field is replaced by a new one with the given `name` and a clone of the current dtype.
    pub fn rename(&mut self, name: PlSmallStr) {
        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
    }
486
    /// Return this [`ChunkedArray`] with a new name.
    ///
    /// Consuming variant of [`ChunkedArray::rename`].
    pub fn with_name(mut self, name: PlSmallStr) -> Self {
        self.rename(name);
        self
    }
492}
493
494impl<T> ChunkedArray<T>
495where
496    T: PolarsDataType,
497{
498    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
499    /// indicates a NULL value.
500    ///
501    /// # Panics
502    /// This function will panic if `idx` is out of bounds.
503    #[inline]
504    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
505        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
506        assert!(
507            chunk_idx < self.chunks().len(),
508            "index: {} out of bounds for len: {}",
509            idx,
510            self.len()
511        );
512        unsafe {
513            let arr = self.downcast_get_unchecked(chunk_idx);
514            assert!(
515                arr_idx < arr.len(),
516                "index: {} out of bounds for len: {}",
517                idx,
518                self.len()
519            );
520            arr.get_unchecked(arr_idx)
521        }
522    }
523
524    /// Get a single value from this [`ChunkedArray`]. If the return values is `None` this
525    /// indicates a NULL value.
526    ///
527    /// # Safety
528    /// It is the callers responsibility that the `idx < self.len()`.
529    #[inline]
530    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
531        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
532
533        unsafe {
534            // SAFETY: up to the caller to make sure the index is valid.
535            self.downcast_get_unchecked(chunk_idx)
536                .get_unchecked(arr_idx)
537        }
538    }
539
540    /// Get a single value from this [`ChunkedArray`]. Null values are ignored and the returned
541    /// value could be garbage if it was masked out by NULL. Note that the value always is initialized.
542    ///
543    /// # Safety
544    /// It is the callers responsibility that the `idx < self.len()`.
545    #[inline]
546    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
547        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
548
549        unsafe {
550            // SAFETY: up to the caller to make sure the index is valid.
551            self.downcast_get_unchecked(chunk_idx)
552                .value_unchecked(arr_idx)
553        }
554    }
555
556    #[inline]
557    pub fn first(&self) -> Option<T::Physical<'_>> {
558        unsafe {
559            let arr = self.downcast_get_unchecked(0);
560            arr.get_unchecked(0)
561        }
562    }
563
564    #[inline]
565    pub fn last(&self) -> Option<T::Physical<'_>> {
566        unsafe {
567            let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);
568            arr.get_unchecked(arr.len().checked_sub(1)?)
569        }
570    }
571
572    pub fn set_validity(&mut self, validity: &Bitmap) {
573        assert_eq!(self.len(), validity.len());
574        let mut i = 0;
575        for chunk in unsafe { self.chunks_mut() } {
576            *chunk = chunk.with_validity(Some(validity.clone().sliced(i, chunk.len())));
577            i += chunk.len();
578        }
579        self.null_count = validity.unset_bits();
580        self.set_fast_explode_list(false);
581    }
582}
583
584impl<T> ChunkedArray<T>
585where
586    T: PolarsDataType,
587    ChunkedArray<T>: ChunkTakeUnchecked<[IdxSize]>,
588{
    /// Deposit values into nulls with a certain validity mask.
    ///
    /// The result has the length of `validity`. The values of `self` are placed, in order, at
    /// the positions where `validity` is set; all other positions become null.
    ///
    /// # Panics
    /// Panics if the number of set bits in `validity` differs from the length of `self`.
    pub fn deposit(&self, validity: &Bitmap) -> Self {
        let set_bits = validity.set_bits();

        assert_eq!(self.len(), set_bits);

        // Every position is valid: the result equals `self`.
        if set_bits == validity.len() {
            return self.clone();
        }

        // No position is valid: the result consists of nulls only.
        if set_bits == 0 {
            return Self::full_null_like(self, validity.len());
        }

        let mut null_mask = validity.clone();

        // For every output position, the index in `self` to gather from.
        let mut gather_idxs = Vec::with_capacity(validity.len());
        // NOTE(review): presumably this strips the leading unset bits from `null_mask` and
        // returns how many there were — confirm.
        let leading_nulls = null_mask.take_leading_zeros();
        // Index 0 for the leading null positions plus one more for the first mask position.
        gather_idxs.extend(std::iter::repeat_n(0, leading_nulls + 1));

        // Advance the source index on every set bit; an unset position repeats the current
        // index and is masked out by the validity set below.
        let mut i = 0 as IdxSize;
        gather_idxs.extend(null_mask.iter().skip(1).map(|v| {
            i += IdxSize::from(v);
            i
        }));

        let mut ca = unsafe { ChunkTakeUnchecked::take_unchecked(self, &gather_idxs) };
        ca.set_validity(validity);
        ca
    }
619}
620
621impl ListChunked {
622    #[inline]
623    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
624        unsafe {
625            Some(Series::from_chunks_and_dtype_unchecked(
626                self.name().clone(),
627                vec![self.get(idx)?],
628                &self.inner_dtype().to_physical(),
629            ))
630        }
631    }
632}
633
634#[cfg(feature = "dtype-array")]
635impl ArrayChunked {
636    #[inline]
637    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
638        unsafe {
639            Some(Series::from_chunks_and_dtype_unchecked(
640                self.name().clone(),
641                vec![self.get(idx)?],
642                &self.inner_dtype().to_physical(),
643            ))
644        }
645    }
646
647    pub fn from_aligned_values(
648        name: PlSmallStr,
649        inner_dtype: &DataType,
650        width: usize,
651        chunks: Vec<ArrayRef>,
652        length: usize,
653    ) -> Self {
654        let dtype = DataType::Array(Box::new(inner_dtype.clone()), width);
655        let arrow_dtype = dtype.to_arrow(CompatLevel::newest());
656        let field = Arc::new(Field::new(name, dtype));
657        if width == 0 {
658            use arrow::array::builder::{ArrayBuilder, make_builder};
659            let values = make_builder(&inner_dtype.to_arrow(CompatLevel::newest())).freeze();
660            return ArrayChunked::new_with_compute_len(
661                field,
662                vec![FixedSizeListArray::new(arrow_dtype, length, values, None).into_boxed()],
663            );
664        }
665
666        let chunks = chunks
667            .into_iter()
668            .map(|chunk| {
669                debug_assert_eq!(chunk.len() % width, 0);
670                FixedSizeListArray::new(arrow_dtype.clone(), length, chunk, None).into_boxed()
671            })
672            .collect();
673
674        unsafe { Self::new_with_dims(field, chunks, length, 0) }
675    }
676
677    /// Turn the ArrayChunked into the ListChunked with the same items.
678    ///
679    /// This will always zero copy the values into the ListChunked.
680    pub fn to_list(&self) -> ListChunked {
681        let inner_dtype = self.inner_dtype();
682        let chunks = self
683            .downcast_iter()
684            .map(|chunk| {
685                use arrow::offset::OffsetsBuffer;
686
687                let inner_dtype = chunk.dtype().inner_dtype().unwrap();
688                let dtype = inner_dtype.clone().to_large_list(true);
689
690                let offsets = (0..=chunk.len())
691                    .map(|i| (i * self.width()) as i64)
692                    .collect::<Vec<i64>>();
693
694                // SAFETY: We created our offsets in ascending manner.
695                let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
696
697                ListArray::<i64>::new(
698                    dtype,
699                    offsets,
700                    chunk.values().clone(),
701                    chunk.validity().cloned(),
702                )
703                .into_boxed()
704            })
705            .collect();
706
707        // SAFETY: All the items were mapped 1-1 with the validity staying the same.
708        let mut ca = unsafe {
709            ListChunked::new_with_dims(
710                Arc::new(Field::new(
711                    self.name().clone(),
712                    DataType::List(Box::new(inner_dtype.clone())),
713                )),
714                chunks,
715                self.len(),
716                self.null_count(),
717            )
718        };
719        ca.set_fast_explode_list(!self.has_nulls());
720        ca
721    }
722}
723
724impl<T> ChunkedArray<T>
725where
726    T: PolarsDataType,
727{
728    /// Should be used to match the chunk_id of another [`ChunkedArray`].
729    /// # Panics
730    /// It is the callers responsibility to ensure that this [`ChunkedArray`] has a single chunk.
731    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
732    where
733        I: Iterator<Item = usize>,
734    {
735        debug_assert!(self.chunks.len() == 1);
736        // Takes a ChunkedArray containing a single chunk.
737        let slice = |ca: &Self| {
738            let array = &ca.chunks[0];
739
740            let mut offset = 0;
741            let chunks = chunk_id
742                .map(|len| {
743                    // SAFETY: within bounds.
744                    debug_assert!((offset + len) <= array.len());
745                    let out = unsafe { array.sliced_unchecked(offset, len) };
746                    offset += len;
747                    out
748                })
749                .collect();
750
751            debug_assert_eq!(offset, array.len());
752
753            // SAFETY: We just slice the original chunks, their type will not change.
754            unsafe {
755                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
756            }
757        };
758
759        if self.chunks.len() != 1 {
760            let out = self.rechunk();
761            slice(&out)
762        } else {
763            slice(self)
764        }
765    }
766}
767
impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    /// Returns the dtype of this [`ChunkedArray`].
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}
773
/// Access to the data of an array through a single raw pointer.
///
/// The default implementation bails with an error that names the dtype, so implementors only
/// override the method when they can actually provide such a pointer.
pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}
780
781impl<T> AsSinglePtr for ChunkedArray<T>
782where
783    T: PolarsNumericType,
784{
785    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
786        self.rechunk_mut();
787        let a = self.data_views().next().unwrap();
788        let ptr = a.as_ptr();
789        Ok(ptr as usize)
790    }
791}
792
// The types below keep the default `as_single_ptr`, which bails with an error.
impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}
801
/// Describes how the data of a [`ChunkedArray`] is laid out, see [`ChunkedArray::layout`].
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    /// Exactly one chunk, which contains no nulls.
    SingleNoNull(&'a T::Array),
    /// Exactly one chunk, which contains nulls.
    Single(&'a T::Array),
    /// Not exactly one chunk, and none of the chunks contains nulls.
    MultiNoNull(&'a ChunkedArray<T>),
    /// Not exactly one chunk, and at least one of the chunks contains nulls.
    Multi(&'a ChunkedArray<T>),
}
808
809impl<T> ChunkedArray<T>
810where
811    T: PolarsDataType,
812{
813    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
814        if self.chunks.len() == 1 {
815            let arr = self.downcast_iter().next().unwrap();
816            return if arr.null_count() == 0 {
817                ChunkedArrayLayout::SingleNoNull(arr)
818            } else {
819                ChunkedArrayLayout::Single(arr)
820            };
821        }
822
823        if self.downcast_iter().all(|a| a.null_count() == 0) {
824            ChunkedArrayLayout::MultiNoNull(self)
825        } else {
826            ChunkedArrayLayout::Multi(self)
827        }
828    }
829}
830
831impl<T> ChunkedArray<T>
832where
833    T: PolarsNumericType,
834{
835    /// Returns the values of the array as a contiguous slice.
836    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
837        polars_ensure!(
838            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
839            ComputeError: "chunked array is not contiguous"
840        );
841        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
842    }
843
844    /// Returns the values of the array as a contiguous mutable slice.
845    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
846        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
847            // SAFETY, we will not swap the PrimitiveArray.
848            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
849            arr.get_mut_values()
850        } else {
851            None
852        }
853    }
854
    /// Get slices of the underlying arrow data, one slice per chunk.
    ///
    /// NOTE: null values should be taken into account by the user of these slices, as the
    /// validity is handled separately and is not part of the slices.
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }
861
    /// Iterate over the values of all chunks in order, without looking at the validity.
    ///
    /// The values are read straight from [`Self::data_views`], so a null slot yields whatever
    /// value is stored for it. The reported length of the iterator is `self.len()`.
    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // .copied was significantly slower in benchmark, next call did not inline?
        #[allow(clippy::map_clone)]
        // we know the iterators len
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
877}
878
879impl<T: PolarsDataType> Clone for ChunkedArray<T> {
880    fn clone(&self) -> Self {
881        ChunkedArray {
882            field: self.field.clone(),
883            chunks: self.chunks.clone(),
884            flags: self.flags.clone(),
885
886            _pd: Default::default(),
887            length: self.length,
888            null_count: self.null_count,
889        }
890    }
891}
892
impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    /// Returns `self`, so that a [`ChunkedArray`] can be passed where `AsRef` of it is expected.
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}
898
899impl ValueSize for ListChunked {
900    fn get_values_size(&self) -> usize {
901        self.chunks
902            .iter()
903            .fold(0usize, |acc, arr| acc + arr.get_values_size())
904    }
905}
906
#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    /// Sum of the values sizes reported by the individual chunks.
    fn get_values_size(&self) -> usize {
        self.chunks.iter().map(|arr| arr.get_values_size()).sum()
    }
}
915impl ValueSize for StringChunked {
916    fn get_values_size(&self) -> usize {
917        self.chunks
918            .iter()
919            .fold(0usize, |acc, arr| acc + arr.get_values_size())
920    }
921}
922
923impl ValueSize for BinaryOffsetChunked {
924    fn get_values_size(&self) -> usize {
925        self.chunks
926            .iter()
927            .fold(0usize, |acc, arr| acc + arr.get_values_size())
928    }
929}
930
931pub(crate) fn to_primitive<T: PolarsNumericType>(
932    values: Vec<T::Native>,
933    validity: Option<Bitmap>,
934) -> PrimitiveArray<T::Native> {
935    PrimitiveArray::new(
936        T::get_static_dtype().to_arrow(CompatLevel::newest()),
937        values.into(),
938        validity,
939    )
940}
941
/// Like [`to_primitive`], but returns the array boxed as an [`ArrayRef`].
pub(crate) fn to_array<T: PolarsNumericType>(
    values: Vec<T::Native>,
    validity: Option<Bitmap>,
) -> ArrayRef {
    Box::new(to_primitive::<T>(values, validity))
}
948
949impl<T: PolarsDataType> Default for ChunkedArray<T> {
950    fn default() -> Self {
951        let dtype = T::get_static_dtype();
952        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
953        ChunkedArray {
954            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
955            // Invariant: always has 1 chunk.
956            chunks: vec![new_empty_array(arrow_dtype)],
957            flags: StatisticsFlagsIM::empty(),
958
959            _pd: Default::default(),
960            length: 0,
961            null_count: 0,
962        }
963    }
964}
965
#[cfg(test)]
pub(crate) mod test {
    use crate::prelude::*;

    /// Shared fixture: an `Int32Chunked` named "a" holding `[1, 2, 3]`.
    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    /// Asserts that `ca` holds exactly the values in `eq`, in order.
    ///
    /// Panics if `ca` contains a null, because every element is unwrapped.
    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    /// Ascending sort of numeric and string arrays; the string result must carry
    /// the sorted-ascending flag.
    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        assert!(a.is_sorted_ascending_flag());
    }

    /// Element-wise `+`, `-`, `*` and `/` between two `Int32Chunked` references.
    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Arithmetic is tested more thoroughly from the integration test suite and the
        // Python bindings; this is a smoke test that still pins the exact results.
        // Every quotient is exact, so the expectation does not depend on the rounding mode.
        assert_slice_equal(&(a + b), &[0, 102, 9, 44]);
        assert_slice_equal(&(a - b), &[2, 98, 3, 36]);
        assert_slice_equal(&(a * b), &[-1, 200, 18, 160]);
        assert_slice_equal(&(a / b), &[-1, 50, 2, 10]);
    }

    /// Iterating the fixture yields `Some` values that sum to 6.
    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // sum
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6);
    }

    /// `limit(2)` keeps two elements; the result is also printed with its `Debug` format.
    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2);
    }

    /// Filtering with the mask `[true, false, false]` keeps only the first element.
    #[test]
    fn filter() {
        let a = get_chunked_array();
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    /// `max`, `min` and `sum` over `[1, 100, 10, 9]`.
    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120));
    }

    /// Taking two indices yields an array of length two.
    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2);
    }

    /// Casting to `Int64` changes the reported dtype accordingly.
    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64);
    }

    /// Slicing an array built by appending two three-element arrays, including
    /// negative offsets and ranges that overshoot either end.
    #[test]
    fn slice() {
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        // Out-of-range offsets and lengths: only the resulting length is checked.
        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    /// Ascending and descending sorts for numeric and string arrays; in the
    /// ascending string sort the null comes first.
    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    /// `reverse` on numeric, boolean and string arrays, with and without nulls.
    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with continuous slice
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // path with options
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    /// After casting strings to a categorical, the physical codes follow order of
    /// first appearance and the null is preserved.
    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    /// `shrink_to_fit` must reduce the estimated byte size of an over-allocated
    /// string array. Ignored by default.
    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}