polars_core/chunked_array/mod.rs

//! The typed heart of every Series column.
#![allow(unsafe_op_in_unsafe_fn)]
use std::iter::Map;
use std::sync::Arc;

use arrow::array::*;
use arrow::bitmap::Bitmap;
use arrow::compute::concatenate::concatenate_unchecked;
use polars_compute::filter::filter_with_bitmap;

use crate::prelude::*;

pub mod ops;
#[macro_use]
pub mod arithmetic;
pub mod builder;
pub mod cast;
pub mod collect;
pub mod comparison;
pub mod flags;
pub mod float;
pub mod iterator;
#[cfg(feature = "ndarray")]
pub(crate) mod ndarray;

#[cfg(feature = "dtype-array")]
pub(crate) mod array;
mod binary;
mod binary_offset;
mod bitwise;
#[cfg(feature = "object")]
mod drop;
mod from;
mod from_iterator;
pub mod from_iterator_par;
pub(crate) mod list;
pub(crate) mod logical;
#[cfg(feature = "object")]
pub mod object;
#[cfg(feature = "random")]
mod random;
#[cfg(feature = "dtype-struct")]
mod struct_;
#[cfg(any(
    feature = "temporal",
    feature = "dtype-datetime",
    feature = "dtype-date"
))]
pub mod temporal;
mod to_vec;
mod trusted_len;

use std::slice::Iter;

use arrow::legacy::prelude::*;
#[cfg(feature = "dtype-struct")]
pub use struct_::StructChunked;

use self::flags::{StatisticsFlags, StatisticsFlagsIM};
use crate::series::IsSorted;
use crate::utils::{first_non_null, last_non_null};

#[cfg(not(feature = "dtype-categorical"))]
pub struct RevMapping {}

pub type ChunkLenIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;

/// # ChunkedArray
///
/// Every Series contains a [`ChunkedArray<T>`]. Unlike [`Series`], [`ChunkedArray`]s are typed. This allows
/// us to apply closures to the data and collect the results to a [`ChunkedArray`] of the same type `T`.
/// Below we use `apply_values` to apply the cosine function to the values of a [`ChunkedArray`].
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn apply_cosine_and_cast(ca: &Float32Chunked) -> Float32Chunked {
///     ca.apply_values(|v| v.cos())
/// }
/// ```
///
/// ## Conversion between Series and ChunkedArrays
/// Conversion from a [`Series`] to a [`ChunkedArray`] is effortless.
///
/// ```rust
/// # use polars_core::prelude::*;
/// fn to_chunked_array(series: &Series) -> PolarsResult<&Int32Chunked> {
///     series.i32()
/// }
///
/// fn to_series(ca: Int32Chunked) -> Series {
///     ca.into_series()
/// }
/// ```
///
/// # Iterators
///
/// [`ChunkedArray`]s fully support Rust native [Iterator](https://doc.rust-lang.org/std/iter/trait.Iterator.html)
/// and [DoubleEndedIterator](https://doc.rust-lang.org/std/iter/trait.DoubleEndedIterator.html) traits, thereby
/// giving access to all the excellent methods available for [Iterators](https://doc.rust-lang.org/std/iter/trait.Iterator.html).
///
/// ```rust
/// # use polars_core::prelude::*;
///
/// fn iter_forward(ca: &Float32Chunked) {
///     ca.iter()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
///
/// fn iter_backward(ca: &Float32Chunked) {
///     ca.iter()
///         .rev()
///         .for_each(|opt_v| println!("{:?}", opt_v))
/// }
/// ```
///
/// # Memory layout
///
/// [`ChunkedArray`]s use [Apache Arrow](https://github.com/apache/arrow) as backend for the memory layout.
/// Arrow's memory is immutable, which makes it possible to make multiple zero-copy (sub)-views from a single array.
///
/// To be able to append data, Polars uses chunks to append new memory locations, hence the [`ChunkedArray<T>`] data structure.
/// Appends are cheap because they do not lead to a full reallocation of the whole array (as could be the case with a Rust `Vec`).
///
/// However, multiple chunks in a [`ChunkedArray`] will slow down many operations that need random access, because there is an extra indirection
/// and indexes need to be mapped to the proper chunk. Arithmetic may also be slowed down by this.
/// For instance, when multiplying two [`ChunkedArray`]s with different chunk sizes, they cannot utilize [SIMD](https://en.wikipedia.org/wiki/SIMD).
///
/// If you want predictable performance
/// (no unexpected re-allocation of memory), it is advised to call [`ChunkedArray::rechunk`] after
/// multiple append operations.
///
/// See also [`ChunkedArray::extend`] for appends within a chunk.
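///
/// For example, appending adds a new chunk rather than reallocating. A small
/// sketch using the same constructors as this module's tests:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
/// let b = Int32Chunked::new(PlSmallStr::from_static("a"), &[4, 5, 6]);
/// // `b`'s chunk is appended to `a`'s chunk list; the existing data is not copied.
/// a.append(&b).unwrap();
/// assert_eq!(a.chunks().len(), 2);
/// ```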
///
/// # Invariants
/// - A [`ChunkedArray`] should always have at least a single [`ArrayRef`].
/// - The [`PolarsDataType`] `T` should always map to the correct [`ArrowDataType`] in the [`ArrayRef`]
///   chunks.
/// - Nested datatypes such as [`List`] and [`Array`] store the physical types instead of the
///   logical type given by the datatype.
///
/// [`List`]: crate::datatypes::DataType::List
pub struct ChunkedArray<T: PolarsDataType> {
    pub(crate) field: Arc<Field>,
    pub(crate) chunks: Vec<ArrayRef>,

    pub(crate) flags: StatisticsFlagsIM,

    length: usize,
    null_count: usize,
    _pd: std::marker::PhantomData<T>,
}

impl<T: PolarsDataType> ChunkedArray<T> {
    fn should_rechunk(&self) -> bool {
        self.chunks.len() > 1 && self.chunks.len() > self.len() / 3
    }

    fn optional_rechunk(mut self) -> Self {
        // Rechunk if we have many small chunks.
        if self.should_rechunk() {
            self.rechunk_mut()
        }
        self
    }

    pub(crate) fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Unpack a [`Series`] to a [`ChunkedArray<T>`] of the matching type.
    pub fn unpack_series_matching_type<'a>(
        &self,
        series: &'a Series,
    ) -> PolarsResult<&'a ChunkedArray<T>> {
        match self.dtype() {
            #[cfg(feature = "dtype-decimal")]
            DataType::Decimal(_, _) => {
                let logical = series.decimal()?;

                let ca = logical.physical();
                Ok(ca.as_any().downcast_ref::<ChunkedArray<T>>().unwrap())
            },
            dt => {
                polars_ensure!(
                    dt == series.dtype(),
                    SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
                    series.dtype(),
                    dt,
                );

                // SAFETY:
                // dtype will be correct.
                Ok(unsafe { self.unpack_series_matching_physical_type(series) })
            },
        }
    }

    /// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
    ///
    /// If you want to explicitly set the `length` and `null_count`, look at
    /// [`ChunkedArray::new_with_dims`].
    fn new_with_compute_len(field: Arc<Field>, chunks: Vec<ArrayRef>) -> Self {
        unsafe {
            let mut chunked_arr = Self::new_with_dims(field, chunks, 0, 0);
            chunked_arr.compute_len();
            chunked_arr
        }
    }

    /// Create a new [`ChunkedArray`] and explicitly set its `length` and `null_count`.
    /// # Safety
    /// The length and null_count must be correct.
    pub unsafe fn new_with_dims(
        field: Arc<Field>,
        chunks: Vec<ArrayRef>,
        length: usize,
        null_count: usize,
    ) -> Self {
        Self {
            field,
            chunks,
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length,
            null_count,
        }
    }

    pub(crate) fn is_sorted_ascending_flag(&self) -> bool {
        self.get_flags().is_sorted_ascending()
    }

    pub(crate) fn is_sorted_descending_flag(&self) -> bool {
        self.get_flags().is_sorted_descending()
    }

    /// Whether `self` is sorted in any direction.
    pub(crate) fn is_sorted_any(&self) -> bool {
        self.get_flags().is_sorted_any()
    }

    pub fn unset_fast_explode_list(&mut self) {
        self.set_fast_explode_list(false)
    }

    pub fn set_fast_explode_list(&mut self, value: bool) {
        let mut flags = self.flags.get_mut();
        flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value);
        self.flags.set_mut(flags);
    }

    pub fn get_fast_explode_list(&self) -> bool {
        self.get_flags().can_fast_explode_list()
    }

    pub fn get_flags(&self) -> StatisticsFlags {
        self.flags.get()
    }

    /// Set flags for the [`ChunkedArray`]
    pub fn set_flags(&mut self, flags: StatisticsFlags) {
        self.flags = StatisticsFlagsIM::new(flags);
    }

    pub fn is_sorted_flag(&self) -> IsSorted {
        self.get_flags().is_sorted()
    }

    pub fn retain_flags_from<U: PolarsDataType>(
        &mut self,
        from: &ChunkedArray<U>,
        retain_flags: StatisticsFlags,
    ) {
        let flags = from.flags.get();
        // Try to avoid write contention.
        if !flags.is_empty() {
            self.set_flags(flags & retain_flags)
        }
    }

    /// Set the 'sorted' bit meta info.
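    ///
    /// The flag is metadata only and is not validated against the data. A small
    /// sketch of marking an already-sorted array (assuming `IsSorted` can be
    /// imported from `polars_core::series`):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// # use polars_core::series::IsSorted;
    /// let mut ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// ca.set_sorted_flag(IsSorted::Ascending);
    /// assert!(matches!(ca.is_sorted_flag(), IsSorted::Ascending));
    /// ```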
    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
        let mut flags = self.flags.get_mut();
        flags.set_sorted(sorted);
        self.flags.set_mut(flags);
    }

    /// Return a copy of this [`ChunkedArray`] with the 'sorted' bit meta info set.
    pub fn with_sorted_flag(&self, sorted: IsSorted) -> Self {
        let mut out = self.clone();
        out.set_sorted_flag(sorted);
        out
    }

    /// Get the index of the first non-null value in this [`ChunkedArray`].
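    ///
    /// Returns `None` if all values are null. For example:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[None, Some(1), Some(2)]);
    /// assert_eq!(ca.first_non_null(), Some(1));
    /// ```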
    pub fn first_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(0)
        } else if self.is_sorted_any() {
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.null_count()
            } else {
                // nulls are all at the end
                0
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            first_non_null(self.iter_validities())
        }
    }

    /// Get the index of the last non-null value in this [`ChunkedArray`].
    pub fn last_non_null(&self) -> Option<usize> {
        if self.null_count() == self.len() {
            None
        }
        // We now know there is at least 1 non-null item in the array, and self.len() > 0
        else if self.null_count() == 0 {
            Some(self.len() - 1)
        } else if self.is_sorted_any() {
            let out = if unsafe { self.downcast_get_unchecked(0).is_null_unchecked(0) } {
                // nulls are all at the start
                self.len() - 1
            } else {
                // nulls are all at the end
                self.len() - self.null_count() - 1
            };

            debug_assert!(
                // If we are lucky this catches something.
                unsafe { self.get_unchecked(out) }.is_some(),
                "incorrect sorted flag"
            );

            Some(out)
        } else {
            last_non_null(self.iter_validities(), self.len())
        }
    }

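    /// Return this [`ChunkedArray`] with all null values removed.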
    pub fn drop_nulls(&self) -> Self {
        if self.null_count() == 0 {
            self.clone()
        } else {
            let chunks = self
                .downcast_iter()
                .map(|arr| {
                    if arr.null_count() == 0 {
                        arr.to_boxed()
                    } else {
                        filter_with_bitmap(arr, arr.validity().unwrap())
                    }
                })
                .collect();
            unsafe {
                Self::new_with_dims(
                    self.field.clone(),
                    chunks,
                    self.len() - self.null_count(),
                    0,
                )
            }
        }
    }

    /// Get an iterator over the validity bitmaps, i.e. the buffers of bits representing null values, of the chunks.
    #[inline]
    #[allow(clippy::type_complexity)]
    pub fn iter_validities(&self) -> Map<Iter<'_, ArrayRef>, fn(&ArrayRef) -> Option<&Bitmap>> {
        fn to_validity(arr: &ArrayRef) -> Option<&Bitmap> {
            arr.validity()
        }
        self.chunks.iter().map(to_validity)
    }

    #[inline]
    /// Return whether any of the chunks in this [`ChunkedArray`] contain null values.
    pub fn has_nulls(&self) -> bool {
        self.null_count > 0
    }

    /// Shrink the capacity of this array to fit its length.
    pub fn shrink_to_fit(&mut self) {
        self.chunks = vec![concatenate_unchecked(self.chunks.as_slice()).unwrap()];
    }

    pub fn clear(&self) -> Self {
        // SAFETY: we keep the correct dtype
        let mut ca = unsafe {
            self.copy_with_chunks(vec![new_empty_array(
                self.chunks.first().unwrap().dtype().clone(),
            )])
        };

        use StatisticsFlags as F;
        ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
        ca
    }

    /// Unpack a [`Series`] to the same physical type.
    ///
    /// # Safety
    ///
    /// This is unsafe as the dtype may be incorrect and
    /// is assumed to be correct in other safe code.
    pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
        &self,
        series: &'a Series,
    ) -> &'a ChunkedArray<T> {
        let series_trait = &**series;
        if self.dtype() == series.dtype() {
            &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
        } else {
            use DataType::*;
            match (self.dtype(), series.dtype()) {
                (Int64, Datetime(_, _)) | (Int64, Duration(_)) | (Int32, Date) => {
                    &*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
                },
                _ => panic!(
                    "cannot unpack series {:?} into matching type {:?}",
                    series,
                    self.dtype()
                ),
            }
        }
    }

    /// Returns an iterator over the lengths of the chunks of the array.
    pub fn chunk_lengths(&self) -> ChunkLenIter<'_> {
        self.chunks.iter().map(|chunk| chunk.len())
    }

    /// A reference to the chunks
    #[inline]
    pub fn chunks(&self) -> &Vec<ArrayRef> {
        &self.chunks
    }

    /// A mutable reference to the chunks
    ///
    /// # Safety
    /// The caller must ensure not to change the [`DataType`] or `length` of any of the chunks,
    /// and that the `null_count` remains correct.
    #[inline]
    pub unsafe fn chunks_mut(&mut self) -> &mut Vec<ArrayRef> {
        &mut self.chunks
    }

    /// Returns true if this [`ChunkedArray`] contains a single chunk and has no null values.
    pub fn is_optimal_aligned(&self) -> bool {
        self.chunks.len() == 1 && self.null_count() == 0
    }

    /// Create a new [`ChunkedArray`] from self, where the chunks are replaced.
    ///
    /// # Safety
    /// The caller must ensure the dtypes of the chunks are correct
    unsafe fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
        Self::new_with_compute_len(self.field.clone(), chunks)
    }

    /// Get data type of [`ChunkedArray`].
    pub fn dtype(&self) -> &DataType {
        self.field.dtype()
    }

    pub(crate) unsafe fn set_dtype(&mut self, dtype: DataType) {
        self.field = Arc::new(Field::new(self.name().clone(), dtype))
    }

    /// Name of the [`ChunkedArray`].
    pub fn name(&self) -> &PlSmallStr {
        self.field.name()
    }

    /// Get a reference to the field.
    pub fn ref_field(&self) -> &Field {
        &self.field
    }

    /// Rename this [`ChunkedArray`].
    pub fn rename(&mut self, name: PlSmallStr) {
        self.field = Arc::new(Field::new(name, self.field.dtype().clone()));
    }

    /// Return this [`ChunkedArray`] with a new name.
    pub fn with_name(mut self, name: PlSmallStr) -> Self {
        self.rename(name);
        self
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Get a single value from this [`ChunkedArray`]. If the returned value is `None`, this
    /// indicates a NULL value.
    ///
    /// # Panics
    /// This function will panic if `idx` is out of bounds.
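    ///
    /// # Example
    ///
    /// A small illustrative sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[Some(1), None]);
    /// assert_eq!(ca.get(0), Some(1));
    /// assert_eq!(ca.get(1), None); // NULL value
    /// ```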
    #[inline]
    pub fn get(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);
        assert!(
            chunk_idx < self.chunks().len(),
            "index: {} out of bounds for len: {}",
            idx,
            self.len()
        );
        unsafe {
            let arr = self.downcast_get_unchecked(chunk_idx);
            assert!(
                arr_idx < arr.len(),
                "index: {} out of bounds for len: {}",
                idx,
                self.len()
            );
            arr.get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. If the returned value is `None`, this
    /// indicates a NULL value.
    ///
    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
    #[inline]
    pub unsafe fn get_unchecked(&self, idx: usize) -> Option<T::Physical<'_>> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .get_unchecked(arr_idx)
        }
    }

    /// Get a single value from this [`ChunkedArray`]. The validity (null mask) is ignored, so the returned
    /// value could be garbage if it is masked out by a NULL. Note that the value is always initialized.
    ///
    /// # Safety
    /// It is the caller's responsibility to ensure that `idx < self.len()`.
    #[inline]
    pub unsafe fn value_unchecked(&self, idx: usize) -> T::Physical<'_> {
        let (chunk_idx, arr_idx) = self.index_to_chunked_index(idx);

        unsafe {
            // SAFETY: up to the caller to make sure the index is valid.
            self.downcast_get_unchecked(chunk_idx)
                .value_unchecked(arr_idx)
        }
    }

    #[inline]
    pub fn first(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(0);
            arr.get_unchecked(0)
        }
    }

    #[inline]
    pub fn last(&self) -> Option<T::Physical<'_>> {
        unsafe {
            let arr = self.downcast_get_unchecked(self.chunks.len().checked_sub(1)?);
            arr.get_unchecked(arr.len().checked_sub(1)?)
        }
    }
}

impl ListChunked {
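    /// Get the list at `idx` as a new [`Series`]. Returns `None` if the value is null.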
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
}

#[cfg(feature = "dtype-array")]
impl ArrayChunked {
    #[inline]
    pub fn get_as_series(&self, idx: usize) -> Option<Series> {
        unsafe {
            Some(Series::from_chunks_and_dtype_unchecked(
                self.name().clone(),
                vec![self.get(idx)?],
                &self.inner_dtype().to_physical(),
            ))
        }
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Should be used to match the chunk_id of another [`ChunkedArray`].
    /// # Panics
    /// It is the caller's responsibility to ensure that this [`ChunkedArray`] has a single chunk.
    pub fn match_chunks<I>(&self, chunk_id: I) -> Self
    where
        I: Iterator<Item = usize>,
    {
        debug_assert!(self.chunks.len() == 1);
        // Takes a ChunkedArray containing a single chunk.
        let slice = |ca: &Self| {
            let array = &ca.chunks[0];

            let mut offset = 0;
            let chunks = chunk_id
                .map(|len| {
                    // SAFETY: within bounds.
                    debug_assert!((offset + len) <= array.len());
                    let out = unsafe { array.sliced_unchecked(offset, len) };
                    offset += len;
                    out
                })
                .collect();

            debug_assert_eq!(offset, array.len());

            // SAFETY: We just slice the original chunks, their type will not change.
            unsafe {
                Self::from_chunks_and_dtype(self.name().clone(), chunks, self.dtype().clone())
            }
        };

        if self.chunks.len() != 1 {
            let out = self.rechunk();
            slice(&out)
        } else {
            slice(self)
        }
    }
}

impl<T: PolarsDataType> AsRefDataType for ChunkedArray<T> {
    fn as_ref_dtype(&self) -> &DataType {
        self.dtype()
    }
}

pub(crate) trait AsSinglePtr: AsRefDataType {
    /// Rechunk and return a ptr to the start of the array
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        polars_bail!(opq = as_single_ptr, self.as_ref_dtype());
    }
}

impl<T> AsSinglePtr for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    fn as_single_ptr(&mut self) -> PolarsResult<usize> {
        self.rechunk_mut();
        let a = self.data_views().next().unwrap();
        let ptr = a.as_ptr();
        Ok(ptr as usize)
    }
}

impl AsSinglePtr for BooleanChunked {}
impl AsSinglePtr for ListChunked {}
#[cfg(feature = "dtype-array")]
impl AsSinglePtr for ArrayChunked {}
impl AsSinglePtr for StringChunked {}
impl AsSinglePtr for BinaryChunked {}
#[cfg(feature = "object")]
impl<T: PolarsObject> AsSinglePtr for ObjectChunked<T> {}

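/// Describes how the data of a [`ChunkedArray`] is laid out: a single chunk or
/// multiple chunks, with or without null values.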
pub enum ChunkedArrayLayout<'a, T: PolarsDataType> {
    SingleNoNull(&'a T::Array),
    Single(&'a T::Array),
    MultiNoNull(&'a ChunkedArray<T>),
    Multi(&'a ChunkedArray<T>),
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    pub fn layout(&self) -> ChunkedArrayLayout<'_, T> {
        if self.chunks.len() == 1 {
            let arr = self.downcast_iter().next().unwrap();
            return if arr.null_count() == 0 {
                ChunkedArrayLayout::SingleNoNull(arr)
            } else {
                ChunkedArrayLayout::Single(arr)
            };
        }

        if self.downcast_iter().all(|a| a.null_count() == 0) {
            ChunkedArrayLayout::MultiNoNull(self)
        } else {
            ChunkedArrayLayout::Multi(self)
        }
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Returns the values of the array as a contiguous slice.
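    ///
    /// Errors if the array consists of multiple chunks or contains null values.
    /// A small sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// assert_eq!(ca.cont_slice().unwrap(), &[1, 2, 3]);
    /// ```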
    pub fn cont_slice(&self) -> PolarsResult<&[T::Native]> {
        polars_ensure!(
            self.chunks.len() == 1 && self.chunks[0].null_count() == 0,
            ComputeError: "chunked array is not contiguous"
        );
        Ok(self.downcast_iter().next().map(|arr| arr.values()).unwrap())
    }

    /// Returns the values of the array as a contiguous mutable slice.
    pub(crate) fn cont_slice_mut(&mut self) -> Option<&mut [T::Native]> {
        if self.chunks.len() == 1 && self.chunks[0].null_count() == 0 {
            // SAFETY: we will not swap the PrimitiveArray.
            let arr = unsafe { self.downcast_iter_mut().next().unwrap() };
            arr.get_mut_values()
        } else {
            None
        }
    }

    /// Get slices of the underlying arrow data.
    /// NOTE: null values should be taken into account by the user of these slices as they are handled
    /// separately.
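    ///
    /// A small sketch that sums the raw values of every chunk:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let ca = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 2, 3]);
    /// let sum: i32 = ca.data_views().flatten().copied().sum();
    /// assert_eq!(sum, 6);
    /// ```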
    pub fn data_views(&self) -> impl DoubleEndedIterator<Item = &[T::Native]> {
        self.downcast_iter().map(|arr| arr.values().as_slice())
    }

    #[allow(clippy::wrong_self_convention)]
    pub fn into_no_null_iter(
        &self,
    ) -> impl '_ + Send + Sync + ExactSizeIterator<Item = T::Native> + DoubleEndedIterator + TrustedLen
    {
        // `.copied()` was significantly slower in benchmarks; the `next` call did not inline?
        #[allow(clippy::map_clone)]
        // We know the iterator's length.
        unsafe {
            self.data_views()
                .flatten()
                .map(|v| *v)
                .trust_my_length(self.len())
        }
    }
}

impl<T: PolarsDataType> Clone for ChunkedArray<T> {
    fn clone(&self) -> Self {
        ChunkedArray {
            field: self.field.clone(),
            chunks: self.chunks.clone(),
            flags: self.flags.clone(),

            _pd: Default::default(),
            length: self.length,
            null_count: self.null_count,
        }
    }
}

impl<T: PolarsDataType> AsRef<ChunkedArray<T>> for ChunkedArray<T> {
    fn as_ref(&self) -> &ChunkedArray<T> {
        self
    }
}

impl ValueSize for ListChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

#[cfg(feature = "dtype-array")]
impl ValueSize for ArrayChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}
impl ValueSize for StringChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

impl ValueSize for BinaryOffsetChunked {
    fn get_values_size(&self) -> usize {
        self.chunks
            .iter()
            .fold(0usize, |acc, arr| acc + arr.get_values_size())
    }
}

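/// Build a [`PrimitiveArray`] from a `Vec` of native values and an optional validity bitmap.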
pub(crate) fn to_primitive<T: PolarsNumericType>(
    values: Vec<T::Native>,
    validity: Option<Bitmap>,
) -> PrimitiveArray<T::Native> {
    PrimitiveArray::new(
        T::get_static_dtype().to_arrow(CompatLevel::newest()),
        values.into(),
        validity,
    )
}

pub(crate) fn to_array<T: PolarsNumericType>(
    values: Vec<T::Native>,
    validity: Option<Bitmap>,
) -> ArrayRef {
    Box::new(to_primitive::<T>(values, validity))
}

impl<T: PolarsDataType> Default for ChunkedArray<T> {
    fn default() -> Self {
        let dtype = T::get_static_dtype();
        let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());
        ChunkedArray {
            field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)),
            // Invariant: always has 1 chunk.
            chunks: vec![new_empty_array(arrow_dtype)],
            flags: StatisticsFlagsIM::empty(),

            _pd: Default::default(),
            length: 0,
            null_count: 0,
        }
    }
}

#[cfg(test)]
pub(crate) mod test {
    use crate::prelude::*;

    pub(crate) fn get_chunked_array() -> Int32Chunked {
        ChunkedArray::new(PlSmallStr::from_static("a"), &[1, 2, 3])
    }

    #[test]
    fn test_sort() {
        let a = Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 9, 3, 2]);
        let b = a
            .sort(false)
            .into_iter()
            .map(|opt| opt.unwrap())
            .collect::<Vec<_>>();
        assert_eq!(b, [1, 2, 3, 9]);
        let a = StringChunked::new(PlSmallStr::from_static("a"), &["b", "a", "c"]);
        let a = a.sort(false);
        let b = a.into_iter().collect::<Vec<_>>();
        assert_eq!(b, [Some("a"), Some("b"), Some("c")]);
        assert!(a.is_sorted_ascending_flag());
    }

    #[test]
    fn arithmetic() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 6, 40]);
        let b = &Int32Chunked::new(PlSmallStr::from_static("b"), &[-1, 2, 3, 4]);

        // Not really asserting anything here but still making sure the code is exercised
        // This (and more) is properly tested from the integration test suite and Python bindings.
        println!("{:?}", a + b);
        println!("{:?}", a - b);
        println!("{:?}", a * b);
        println!("{:?}", a / b);
    }

    #[test]
    fn iter() {
        let s1 = get_chunked_array();
        // sum
        assert_eq!(s1.into_iter().fold(0, |acc, val| { acc + val.unwrap() }), 6)
    }

    #[test]
    fn limit() {
        let a = get_chunked_array();
        let b = a.limit(2);
        println!("{b:?}");
        assert_eq!(b.len(), 2)
    }

    #[test]
    fn filter() {
        let a = get_chunked_array();
        let b = a
            .filter(&BooleanChunked::new(
                PlSmallStr::from_static("filter"),
                &[true, false, false],
            ))
            .unwrap();
        assert_eq!(b.len(), 1);
        assert_eq!(b.into_iter().next(), Some(Some(1)));
    }

    #[test]
    fn aggregates() {
        let a = &Int32Chunked::new(PlSmallStr::from_static("a"), &[1, 100, 10, 9]);
        assert_eq!(a.max(), Some(100));
        assert_eq!(a.min(), Some(1));
        assert_eq!(a.sum(), Some(120))
    }

    #[test]
    fn take() {
        let a = get_chunked_array();
        let new = a.take(&[0 as IdxSize, 1]).unwrap();
        assert_eq!(new.len(), 2)
    }

    #[test]
    fn cast() {
        let a = get_chunked_array();
        let b = a.cast(&DataType::Int64).unwrap();
        assert_eq!(b.dtype(), &DataType::Int64)
    }

    fn assert_slice_equal<T>(ca: &ChunkedArray<T>, eq: &[T::Native])
    where
        T: PolarsNumericType,
    {
        assert_eq!(ca.iter().map(|opt| opt.unwrap()).collect::<Vec<_>>(), eq)
    }

    #[test]
    fn slice() {
        let mut first = UInt32Chunked::new(PlSmallStr::from_static("first"), &[0, 1, 2]);
        let second = UInt32Chunked::new(PlSmallStr::from_static("second"), &[3, 4, 5]);
        first.append(&second).unwrap();
        assert_slice_equal(&first.slice(0, 3), &[0, 1, 2]);
        assert_slice_equal(&first.slice(0, 4), &[0, 1, 2, 3]);
        assert_slice_equal(&first.slice(1, 4), &[1, 2, 3, 4]);
        assert_slice_equal(&first.slice(3, 2), &[3, 4]);
        assert_slice_equal(&first.slice(3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-3, 3), &[3, 4, 5]);
        assert_slice_equal(&first.slice(-6, 6), &[0, 1, 2, 3, 4, 5]);

        assert_eq!(first.slice(-7, 2).len(), 1);
        assert_eq!(first.slice(-3, 4).len(), 3);
        assert_eq!(first.slice(3, 4).len(), 3);
        assert_eq!(first.slice(10, 4).len(), 0);
    }

    #[test]
    fn sorting() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[9, 2, 4]);
        let sorted = s.sort(false);
        assert_slice_equal(&sorted, &[2, 4, 9]);
        let sorted = s.sort(true);
        assert_slice_equal(&sorted, &[9, 4, 2]);

        let s: StringChunked = ["b", "a", "z"].iter().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("a"), Some("b"), Some("z")]
        );
        let sorted = s.sort(true);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[Some("z"), Some("b"), Some("a")]
        );
        let s: StringChunked = [Some("b"), None, Some("z")].iter().copied().collect();
        let sorted = s.sort(false);
        assert_eq!(
            sorted.into_iter().collect::<Vec<_>>(),
            &[None, Some("b"), Some("z")]
        );
    }

    #[test]
    fn reverse() {
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[1, 2, 3]);
        // path with continuous slice
        assert_slice_equal(&s.reverse(), &[3, 2, 1]);
        // path with options
        let s = UInt32Chunked::new(PlSmallStr::EMPTY, &[Some(1), None, Some(3)]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(3), None, Some(1)]);
        let s = BooleanChunked::new(PlSmallStr::EMPTY, &[true, false]);
        assert_eq!(Vec::from(&s.reverse()), &[Some(false), Some(true)]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &["a", "b", "c"]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), Some("b"), Some("a")]);

        let s = StringChunked::new(PlSmallStr::EMPTY, &[Some("a"), None, Some("c")]);
        assert_eq!(Vec::from(&s.reverse()), &[Some("c"), None, Some("a")]);
    }

    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_iter_categorical() {
        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("foo"), None, Some("bar"), Some("ham")],
        );
        let cats = Categories::new(
            PlSmallStr::EMPTY,
            PlSmallStr::EMPTY,
            CategoricalPhysical::U32,
        );
        let ca = ca.cast(&DataType::from_categories(cats)).unwrap();
        let ca = ca.cat32().unwrap();
        let v: Vec<_> = ca.physical().into_iter().collect();
        assert_eq!(v, &[Some(0), None, Some(1), Some(2)]);
    }

    #[test]
    #[ignore]
    fn test_shrink_to_fit() {
        let mut builder = StringChunkedBuilder::new(PlSmallStr::from_static("foo"), 2048);
        builder.append_value("foo");
        let mut arr = builder.finish();
        let before = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        arr.shrink_to_fit();
        let after = arr
            .chunks()
            .iter()
            .map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
            .sum::<usize>();
        assert!(before > after);
    }
}