polars_core/chunked_array/ops/
chunkops.rs

1use std::borrow::Cow;
2use std::cell::Cell;
3
4use arrow::bitmap::{Bitmap, BitmapBuilder};
5use arrow::compute::concatenate::concatenate_unchecked;
6use polars_error::constants::LENGTH_LIMIT_MSG;
7
8use super::*;
9use crate::chunked_array::flags::StatisticsFlags;
10#[cfg(feature = "object")]
11use crate::chunked_array::object::builder::ObjectChunkedBuilder;
12use crate::utils::slice_offsets;
13
14pub(crate) fn split_at(
15    chunks: &[ArrayRef],
16    offset: i64,
17    own_length: usize,
18) -> (Vec<ArrayRef>, Vec<ArrayRef>) {
19    let mut new_chunks_left = Vec::with_capacity(1);
20    let mut new_chunks_right = Vec::with_capacity(1);
21    let (raw_offset, _) = slice_offsets(offset, 0, own_length);
22
23    let mut remaining_offset = raw_offset;
24    let mut iter = chunks.iter();
25
26    for chunk in &mut iter {
27        let chunk_len = chunk.len();
28        if remaining_offset > 0 && remaining_offset >= chunk_len {
29            remaining_offset -= chunk_len;
30            new_chunks_left.push(chunk.clone());
31            continue;
32        }
33
34        let (l, r) = chunk.split_at_boxed(remaining_offset);
35        new_chunks_left.push(l);
36        new_chunks_right.push(r);
37        break;
38    }
39
40    for chunk in iter {
41        new_chunks_right.push(chunk.clone())
42    }
43    if new_chunks_left.is_empty() {
44        new_chunks_left.push(chunks[0].sliced(0, 0));
45    }
46    if new_chunks_right.is_empty() {
47        new_chunks_right.push(chunks[0].sliced(0, 0));
48    }
49    (new_chunks_left, new_chunks_right)
50}
51
52pub(crate) fn slice(
53    chunks: &[ArrayRef],
54    offset: i64,
55    slice_length: usize,
56    own_length: usize,
57) -> (Vec<ArrayRef>, usize) {
58    let mut new_chunks = Vec::with_capacity(1);
59    let (raw_offset, slice_len) = slice_offsets(offset, slice_length, own_length);
60
61    let mut remaining_length = slice_len;
62    let mut remaining_offset = raw_offset;
63    let mut new_len = 0;
64
65    for chunk in chunks {
66        let chunk_len = chunk.len();
67        if remaining_offset > 0 && remaining_offset >= chunk_len {
68            remaining_offset -= chunk_len;
69            continue;
70        }
71        let take_len = if remaining_length + remaining_offset > chunk_len {
72            chunk_len - remaining_offset
73        } else {
74            remaining_length
75        };
76        new_len += take_len;
77
78        debug_assert!(remaining_offset + take_len <= chunk.len());
79        unsafe {
80            // SAFETY:
81            // this function ensures the slices are in bounds
82            new_chunks.push(chunk.sliced_unchecked(remaining_offset, take_len));
83        }
84        remaining_length -= take_len;
85        remaining_offset = 0;
86        if remaining_length == 0 {
87            break;
88        }
89    }
90    if new_chunks.is_empty() {
91        new_chunks.push(chunks[0].sliced(0, 0));
92    }
93    (new_chunks, new_len)
94}
95
96// When we deal with arrays and lists we can easily exceed the limit if
97// we take the underlying values array as a Series. This call stack
98// is hard to follow, so for this one case we make an exception
99// and use a thread local.
100thread_local!(static CHECK_LENGTH: Cell<bool> = const { Cell::new(true) });
101
102/// Meant for internal use. In very rare conditions this can be turned off.
103/// # Safety
104/// The caller must ensure the Series that exceeds the length get's deconstructed
105/// into array values or list values before and never is used.
106pub unsafe fn _set_check_length(check: bool) {
107    CHECK_LENGTH.set(check)
108}
109
110impl<T: PolarsDataType> ChunkedArray<T> {
111    /// Get the length of the ChunkedArray
112    #[inline]
113    pub fn len(&self) -> usize {
114        self.length
115    }
116
117    /// Return the number of null values in the ChunkedArray.
118    #[inline]
119    pub fn null_count(&self) -> usize {
120        self.null_count
121    }
122
123    /// Set the null count directly.
124    ///
125    /// This can be useful after mutably adjusting the validity of the
126    /// underlying arrays.
127    ///
128    /// # Safety
129    /// The new null count must match the total null count of the underlying
130    /// arrays.
131    pub unsafe fn set_null_count(&mut self, null_count: usize) {
132        self.null_count = null_count;
133    }
134
135    /// Check if ChunkedArray is empty.
136    pub fn is_empty(&self) -> bool {
137        self.len() == 0
138    }
139
140    /// Compute the length
141    pub(crate) fn compute_len(&mut self) {
142        fn inner(chunks: &[ArrayRef]) -> usize {
143            match chunks.len() {
144                // fast path
145                1 => chunks[0].len(),
146                _ => chunks.iter().fold(0, |acc, arr| acc + arr.len()),
147            }
148        }
149        let len = inner(&self.chunks);
150        // Length limit is `IdxSize::MAX - 1`. We use `IdxSize::MAX` to indicate `NULL` in indexing.
151        if len >= (IdxSize::MAX as usize) && CHECK_LENGTH.get() {
152            panic!("{}", LENGTH_LIMIT_MSG);
153        }
154        self.length = len;
155        self.null_count = self
156            .chunks
157            .iter()
158            .map(|arr| arr.null_count())
159            .sum::<usize>();
160    }
161
162    /// Rechunks this ChunkedArray, returning a new Cow::Owned ChunkedArray if it was
163    /// rechunked or simply a Cow::Borrowed of itself if it was already a single chunk.
164    pub fn rechunk(&self) -> Cow<'_, Self> {
165        match self.dtype() {
166            #[cfg(feature = "object")]
167            DataType::Object(_) => {
168                panic!("implementation error")
169            },
170            _ => {
171                if self.chunks.len() == 1 {
172                    Cow::Borrowed(self)
173                } else {
174                    let chunks = vec![concatenate_unchecked(&self.chunks).unwrap()];
175
176                    let mut ca = unsafe { self.copy_with_chunks(chunks) };
177                    use StatisticsFlags as F;
178                    ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
179                    Cow::Owned(ca)
180                }
181            },
182        }
183    }
184
185    /// Rechunks this ChunkedArray in-place.
186    pub fn rechunk_mut(&mut self) {
187        if self.chunks.len() > 1 {
188            let rechunked = concatenate_unchecked(&self.chunks).unwrap();
189            if self.chunks.capacity() <= 8 {
190                // Reuse chunk allocation if not excessive.
191                self.chunks.clear();
192                self.chunks.push(rechunked);
193            } else {
194                self.chunks = vec![rechunked];
195            }
196        }
197    }
198
199    pub fn rechunk_validity(&self) -> Option<Bitmap> {
200        if self.chunks.len() == 1 {
201            return self.chunks[0].validity().cloned();
202        }
203
204        if !self.has_nulls() || self.is_empty() {
205            return None;
206        }
207
208        let mut bm = BitmapBuilder::with_capacity(self.len());
209        for arr in self.downcast_iter() {
210            if let Some(v) = arr.validity() {
211                bm.extend_from_bitmap(v);
212            } else {
213                bm.extend_constant(arr.len(), true);
214            }
215        }
216        bm.into_opt_validity()
217    }
218
219    pub fn with_validities(&mut self, validities: &[Option<Bitmap>]) {
220        assert_eq!(validities.len(), self.chunks.len());
221
222        // SAFETY:
223        // We don't change the data type of the chunks, nor the length.
224        for (arr, validity) in unsafe { self.chunks_mut().iter_mut() }.zip(validities.iter()) {
225            *arr = arr.with_validity(validity.clone())
226        }
227    }
228
229    /// Split the array. The chunks are reallocated the underlying data slices are zero copy.
230    ///
231    /// When offset is negative it will be counted from the end of the array.
232    /// This method will never error,
233    /// and will slice the best match when offset, or length is out of bounds
234    pub fn split_at(&self, offset: i64) -> (Self, Self) {
235        // A normal slice, slice the buffers and thus keep the whole memory allocated.
236        let (l, r) = split_at(&self.chunks, offset, self.len());
237        let mut out_l = unsafe { self.copy_with_chunks(l) };
238        let mut out_r = unsafe { self.copy_with_chunks(r) };
239
240        use StatisticsFlags as F;
241        out_l.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
242        out_r.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
243
244        (out_l, out_r)
245    }
246
247    /// Slice the array. The chunks are reallocated the underlying data slices are zero copy.
248    ///
249    /// When offset is negative it will be counted from the end of the array.
250    /// This method will never error,
251    /// and will slice the best match when offset, or length is out of bounds
252    pub fn slice(&self, offset: i64, length: usize) -> Self {
253        // The len: 0 special cases ensure we release memory.
254        // A normal slice, slice the buffers and thus keep the whole memory allocated.
255        let exec = || {
256            let (chunks, len) = slice(&self.chunks, offset, length, self.len());
257            let mut out = unsafe { self.copy_with_chunks(chunks) };
258
259            use StatisticsFlags as F;
260            out.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
261            out.length = len;
262
263            out
264        };
265
266        match length {
267            0 => match self.dtype() {
268                #[cfg(feature = "object")]
269                DataType::Object(_) => exec(),
270                _ => self.clear(),
271            },
272            _ => exec(),
273        }
274    }
275
276    /// Take a view of top n elements
277    #[must_use]
278    pub fn limit(&self, num_elements: usize) -> Self
279    where
280        Self: Sized,
281    {
282        self.slice(0, num_elements)
283    }
284
285    /// Get the head of the [`ChunkedArray`]
286    #[must_use]
287    pub fn head(&self, length: Option<usize>) -> Self
288    where
289        Self: Sized,
290    {
291        match length {
292            Some(len) => self.slice(0, std::cmp::min(len, self.len())),
293            None => self.slice(0, std::cmp::min(10, self.len())),
294        }
295    }
296
297    /// Get the tail of the [`ChunkedArray`]
298    #[must_use]
299    pub fn tail(&self, length: Option<usize>) -> Self
300    where
301        Self: Sized,
302    {
303        let len = match length {
304            Some(len) => std::cmp::min(len, self.len()),
305            None => std::cmp::min(10, self.len()),
306        };
307        self.slice(-(len as i64), len)
308    }
309
310    /// Remove empty chunks.
311    pub fn prune_empty_chunks(&mut self) {
312        let mut count = 0u32;
313        unsafe {
314            self.chunks_mut().retain(|arr| {
315                count += 1;
316                // Always keep at least one chunk
317                if count == 1 {
318                    true
319                } else {
320                    // Remove the empty chunks
321                    !arr.is_empty()
322                }
323            })
324        }
325    }
326}
327
328#[cfg(feature = "object")]
329impl<T: PolarsObject> ObjectChunked<T> {
330    pub(crate) fn rechunk_object(&self) -> Self {
331        if self.chunks.len() == 1 {
332            self.clone()
333        } else {
334            let mut builder = ObjectChunkedBuilder::new(self.name().clone(), self.len());
335            let chunks = self.downcast_iter();
336
337            // todo! use iterators once implemented
338            // no_null path
339            if !self.has_nulls() {
340                for arr in chunks {
341                    for idx in 0..arr.len() {
342                        builder.append_value(arr.value(idx).clone())
343                    }
344                }
345            } else {
346                for arr in chunks {
347                    for idx in 0..arr.len() {
348                        if arr.is_valid(idx) {
349                            builder.append_value(arr.value(idx).clone())
350                        } else {
351                            builder.append_null()
352                        }
353                    }
354                }
355            }
356            builder.finish()
357        }
358    }
359}
360
361#[cfg(test)]
362mod test {
363    #[cfg(feature = "dtype-categorical")]
364    use crate::prelude::*;
365
366    #[test]
367    #[cfg(feature = "dtype-categorical")]
368    fn test_categorical_map_after_rechunk() {
369        let s = Series::new(PlSmallStr::EMPTY, &["foo", "bar", "spam"]);
370        let mut a = s
371            .cast(&DataType::Categorical(None, Default::default()))
372            .unwrap();
373
374        a.append(&a.slice(0, 2)).unwrap();
375        let a = a.rechunk();
376        assert!(a.categorical().unwrap().get_rev_map().len() > 0);
377    }
378}