polars_core/chunked_array/ops/
explode.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use arrow::array::*;
3use arrow::bitmap::utils::set_bit_unchecked;
4use arrow::bitmap::{Bitmap, MutableBitmap};
5use arrow::legacy::prelude::*;
6
7use crate::prelude::*;
8use crate::series::implementations::null::NullChunked;
9
10pub(crate) trait ExplodeByOffsets {
11    fn explode_by_offsets(&self, offsets: &[i64]) -> Series;
12}
13
14unsafe fn unset_nulls(
15    start: usize,
16    last: usize,
17    validity_values: &Bitmap,
18    nulls: &mut Vec<usize>,
19    empty_row_idx: &[usize],
20    base_offset: usize,
21) {
22    for i in start..last {
23        if !validity_values.get_bit_unchecked(i) {
24            nulls.push(i + empty_row_idx.len() - base_offset);
25        }
26    }
27}
28
29fn get_capacity(offsets: &[i64]) -> usize {
30    (offsets[offsets.len() - 1] - offsets[0] + 1) as usize
31}
32
33impl<T> ExplodeByOffsets for ChunkedArray<T>
34where
35    T: PolarsIntegerType,
36{
37    fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
38        debug_assert_eq!(self.chunks.len(), 1);
39        let arr = self.downcast_iter().next().unwrap();
40
41        // make sure that we don't look beyond the sliced array
42        let values = &arr.values().as_slice()[..offsets[offsets.len() - 1] as usize];
43
44        let mut empty_row_idx = vec![];
45        let mut nulls = vec![];
46
47        let mut start = offsets[0] as usize;
48        let base_offset = start;
49        let mut last = start;
50
51        let mut new_values = Vec::with_capacity(offsets[offsets.len() - 1] as usize - start + 1);
52
53        // we check all the offsets and in the case a consecutive offset is the same,
54        // e.g. 0, 1, 4, 4, 6
55        // the 4 4, means that that is an empty row.
56        // the empty row will be replaced with a None value.
57        //
58        // below we memcpy as much as possible and for the empty rows we add a default value
59        // that value will later be masked out by the validity bitmap
60
61        // in the case that the value array has got null values, we need to check every validity
62        // value and collect the indices.
63        // because the length of the array is not known, we first collect the null indexes, offset
64        // with the insertion of empty rows (as None) and later create a validity bitmap
65        if arr.null_count() > 0 {
66            let validity_values = arr.validity().unwrap();
67
68            for &o in &offsets[1..] {
69                let o = o as usize;
70                if o == last {
71                    if start != last {
72                        #[cfg(debug_assertions)]
73                        new_values.extend_from_slice(&values[start..last]);
74
75                        #[cfg(not(debug_assertions))]
76                        unsafe {
77                            new_values.extend_from_slice(values.get_unchecked(start..last))
78                        };
79
80                        // SAFETY:
81                        // we are in bounds
82                        unsafe {
83                            unset_nulls(
84                                start,
85                                last,
86                                validity_values,
87                                &mut nulls,
88                                &empty_row_idx,
89                                base_offset,
90                            )
91                        }
92                    }
93
94                    empty_row_idx.push(o + empty_row_idx.len() - base_offset);
95                    new_values.push(T::Native::default());
96                    start = o;
97                }
98                last = o;
99            }
100
101            // final null check
102            // SAFETY:
103            // we are in bounds
104            unsafe {
105                unset_nulls(
106                    start,
107                    last,
108                    validity_values,
109                    &mut nulls,
110                    &empty_row_idx,
111                    base_offset,
112                )
113            }
114        } else {
115            for &o in &offsets[1..] {
116                let o = o as usize;
117                if o == last {
118                    if start != last {
119                        unsafe { new_values.extend_from_slice(values.get_unchecked(start..last)) };
120                    }
121
122                    empty_row_idx.push(o + empty_row_idx.len() - base_offset);
123                    new_values.push(T::Native::default());
124                    start = o;
125                }
126                last = o;
127            }
128        }
129
130        // add remaining values
131        new_values.extend_from_slice(&values[start..]);
132
133        let mut validity = MutableBitmap::with_capacity(new_values.len());
134        validity.extend_constant(new_values.len(), true);
135        let validity_slice = validity.as_mut_slice();
136
137        for i in empty_row_idx {
138            unsafe { set_bit_unchecked(validity_slice, i, false) }
139        }
140        for i in nulls {
141            unsafe { set_bit_unchecked(validity_slice, i, false) }
142        }
143        let arr = PrimitiveArray::new(
144            T::get_dtype().to_arrow(CompatLevel::newest()),
145            new_values.into(),
146            Some(validity.into()),
147        );
148        Series::try_from((self.name().clone(), Box::new(arr) as ArrayRef)).unwrap()
149    }
150}
151
152impl ExplodeByOffsets for Float32Chunked {
153    fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
154        self.apply_as_ints(|s| {
155            let ca = s.u32().unwrap();
156            ca.explode_by_offsets(offsets)
157        })
158    }
159}
160impl ExplodeByOffsets for Float64Chunked {
161    fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
162        self.apply_as_ints(|s| {
163            let ca = s.u64().unwrap();
164            ca.explode_by_offsets(offsets)
165        })
166    }
167}
168
169impl ExplodeByOffsets for NullChunked {
170    fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
171        let mut last_offset = offsets[0];
172
173        let mut len = 0;
174        for &offset in &offsets[1..] {
175            // If offset == last_offset we have an empty list and a new row is inserted,
176            // therefore we always increase at least 1.
177            len += std::cmp::max(offset - last_offset, 1) as usize;
178            last_offset = offset;
179        }
180        NullChunked::new(self.name.clone(), len).into_series()
181    }
182}
183
184impl ExplodeByOffsets for BooleanChunked {
185    fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
186        debug_assert_eq!(self.chunks.len(), 1);
187        let arr = self.downcast_iter().next().unwrap();
188
189        let cap = get_capacity(offsets);
190        let mut builder = BooleanChunkedBuilder::new(self.name().clone(), cap);
191
192        let mut start = offsets[0] as usize;
193        let mut last = start;
194        for &o in &offsets[1..] {
195            let o = o as usize;
196            if o == last {
197                if start != last {
198                    let vals = arr.slice_typed(start, last - start);
199
200                    if vals.null_count() == 0 {
201                        builder
202                            .array_builder
203                            .extend_trusted_len_values(vals.values_iter())
204                    } else {
205                        builder.array_builder.extend_trusted_len(vals.into_iter());
206                    }
207                }
208                builder.append_null();
209                start = o;
210            }
211            last = o;
212        }
213        let vals = arr.slice_typed(start, last - start);
214        if vals.null_count() == 0 {
215            builder
216                .array_builder
217                .extend_trusted_len_values(vals.values_iter())
218        } else {
219            builder.array_builder.extend_trusted_len(vals.into_iter());
220        }
221        builder.finish().into()
222    }
223}
224
225/// Convert Arrow array offsets to indexes of the original list
226pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<IdxSize> {
227    if offsets.is_empty() {
228        return vec![];
229    }
230
231    let mut idx = Vec::with_capacity(capacity);
232
233    let mut last_idx = 0;
234    for (offset_start, offset_end) in offsets.iter().zip(offsets[1..].iter()) {
235        if idx.len() >= capacity {
236            // significant speed-up in edge cases with many offsets,
237            // no measurable overhead in typical case due to branch prediction
238            break;
239        }
240
241        if offset_start == offset_end {
242            // if the previous offset is equal to the current offset, we have an empty
243            // list and we duplicate the previous index
244            idx.push(last_idx);
245        } else {
246            let width = (offset_end - offset_start) as usize;
247            for _ in 0..width {
248                idx.push(last_idx);
249            }
250        }
251
252        last_idx += 1;
253    }
254
255    // take the remaining values
256    for _ in 0..capacity.saturating_sub(idx.len()) {
257        idx.push(last_idx);
258    }
259    idx.truncate(capacity);
260    idx
261}
262
263#[cfg(test)]
264mod test {
265    use super::*;
266    use crate::chunked_array::builder::get_list_builder;
267
268    #[test]
269    fn test_explode_list() -> PolarsResult<()> {
270        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
271
272        builder
273            .append_series(&Series::new(PlSmallStr::EMPTY, &[1, 2, 3, 3]))
274            .unwrap();
275        builder
276            .append_series(&Series::new(PlSmallStr::EMPTY, &[1]))
277            .unwrap();
278        builder
279            .append_series(&Series::new(PlSmallStr::EMPTY, &[2]))
280            .unwrap();
281
282        let ca = builder.finish();
283        assert!(ca._can_fast_explode());
284
285        // normal explode
286        let exploded = ca.explode()?;
287        let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
288        assert_eq!(out, &[1, 2, 3, 3, 1, 2]);
289
290        // sliced explode
291        let exploded = ca.slice(0, 1).explode()?;
292        let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
293        assert_eq!(out, &[1, 2, 3, 3]);
294
295        Ok(())
296    }
297
298    #[test]
299    fn test_explode_empty_list_slot() -> PolarsResult<()> {
300        // primitive
301        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
302        builder
303            .append_series(&Series::new(PlSmallStr::EMPTY, &[1i32, 2]))
304            .unwrap();
305        builder
306            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
307            .unwrap();
308        builder
309            .append_series(&Series::new(PlSmallStr::EMPTY, &[3i32]))
310            .unwrap();
311
312        let ca = builder.finish();
313        let exploded = ca.explode()?;
314        assert_eq!(
315            Vec::from(exploded.i32()?),
316            &[Some(1), Some(2), None, Some(3)]
317        );
318
319        // more primitive
320        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
321        builder
322            .append_series(&Series::new(PlSmallStr::EMPTY, &[1i32]))
323            .unwrap();
324        builder
325            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
326            .unwrap();
327        builder
328            .append_series(&Series::new(PlSmallStr::EMPTY, &[2i32]))
329            .unwrap();
330        builder
331            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
332            .unwrap();
333        builder
334            .append_series(&Series::new(PlSmallStr::EMPTY, &[3, 4i32]))
335            .unwrap();
336
337        let ca = builder.finish();
338        let exploded = ca.explode()?;
339        assert_eq!(
340            Vec::from(exploded.i32()?),
341            &[Some(1), None, Some(2), None, Some(3), Some(4)]
342        );
343
344        // string
345        let mut builder = get_list_builder(&DataType::String, 5, 5, PlSmallStr::from_static("a"));
346        builder
347            .append_series(&Series::new(PlSmallStr::EMPTY, &["abc"]))
348            .unwrap();
349        builder
350            .append_series(
351                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
352                    PlSmallStr::EMPTY,
353                    &[],
354                )
355                .into_series(),
356            )
357            .unwrap();
358        builder
359            .append_series(&Series::new(PlSmallStr::EMPTY, &["de"]))
360            .unwrap();
361        builder
362            .append_series(
363                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
364                    PlSmallStr::EMPTY,
365                    &[],
366                )
367                .into_series(),
368            )
369            .unwrap();
370        builder
371            .append_series(&Series::new(PlSmallStr::EMPTY, &["fg"]))
372            .unwrap();
373        builder
374            .append_series(
375                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
376                    PlSmallStr::EMPTY,
377                    &[],
378                )
379                .into_series(),
380            )
381            .unwrap();
382
383        let ca = builder.finish();
384        let exploded = ca.explode()?;
385        assert_eq!(
386            Vec::from(exploded.str()?),
387            &[Some("abc"), None, Some("de"), None, Some("fg"), None]
388        );
389
390        // boolean
391        let mut builder = get_list_builder(&DataType::Boolean, 5, 5, PlSmallStr::from_static("a"));
392        builder
393            .append_series(&Series::new(PlSmallStr::EMPTY, &[true]))
394            .unwrap();
395        builder
396            .append_series(&BooleanChunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
397            .unwrap();
398        builder
399            .append_series(&Series::new(PlSmallStr::EMPTY, &[false]))
400            .unwrap();
401        builder
402            .append_series(&BooleanChunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
403            .unwrap();
404        builder
405            .append_series(&Series::new(PlSmallStr::EMPTY, &[true, true]))
406            .unwrap();
407
408        let ca = builder.finish();
409        let exploded = ca.explode()?;
410        assert_eq!(
411            Vec::from(exploded.bool()?),
412            &[Some(true), None, Some(false), None, Some(true), Some(true)]
413        );
414
415        Ok(())
416    }
417
418    #[test]
419    fn test_row_offsets() {
420        let offsets = &[0, 1, 2, 2, 3, 4, 4];
421        let out = offsets_to_indexes(offsets, 6);
422        assert_eq!(out, &[0, 1, 2, 3, 4, 5]);
423    }
424
425    #[test]
426    fn test_empty_row_offsets() {
427        let offsets = &[0, 0];
428        let out = offsets_to_indexes(offsets, 0);
429        let expected: Vec<IdxSize> = Vec::new();
430        assert_eq!(out, expected);
431    }
432
433    #[test]
434    fn test_row_offsets_over_capacity() {
435        let offsets = &[0, 1, 1, 2, 2];
436        let out = offsets_to_indexes(offsets, 2);
437        assert_eq!(out, &[0, 1]);
438    }
439
440    #[test]
441    fn test_row_offsets_nonzero_first_offset() {
442        let offsets = &[3, 6, 8];
443        let out = offsets_to_indexes(offsets, 10);
444        assert_eq!(out, &[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]);
445    }
446}