polars_core/chunked_array/ops/
explode.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use arrow::array::*;
3use arrow::bitmap::utils::set_bit_unchecked;
4use arrow::bitmap::{Bitmap, MutableBitmap};
5use arrow::legacy::prelude::*;
6
7use crate::prelude::*;
8use crate::series::implementations::null::NullChunked;
9
10pub(crate) trait ExplodeByOffsets {
11    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series;
12}
13
14unsafe fn unset_nulls(
15    start: usize,
16    last: usize,
17    validity_values: &Bitmap,
18    nulls: &mut Vec<usize>,
19    empty_row_idx: &[usize],
20    base_offset: usize,
21) {
22    for i in start..last {
23        if !validity_values.get_bit_unchecked(i) {
24            nulls.push(i + empty_row_idx.len() - base_offset);
25        }
26    }
27}
28
29fn get_capacity(offsets: &[i64]) -> usize {
30    (offsets[offsets.len() - 1] - offsets[0] + 1) as usize
31}
32
33impl<T> ExplodeByOffsets for ChunkedArray<T>
34where
35    T: PolarsIntegerType,
36{
37    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
38        debug_assert_eq!(self.chunks.len(), 1);
39        let arr = self.downcast_iter().next().unwrap();
40
41        // make sure that we don't look beyond the sliced array
42        let values = &arr.values().as_slice()[..offsets[offsets.len() - 1] as usize];
43
44        let mut empty_row_idx = vec![];
45        let mut nulls = vec![];
46
47        let mut start = offsets[0] as usize;
48        let base_offset = start;
49        let mut last = start;
50
51        let mut new_values = Vec::with_capacity(offsets[offsets.len() - 1] as usize - start + 1);
52
53        // we check all the offsets and in the case a consecutive offset is the same,
54        // e.g. 0, 1, 4, 4, 6
55        // the 4 4, means that that is an empty row.
56        // the empty row will be replaced with a None value.
57        //
58        // below we memcpy as much as possible and for the empty rows we add a default value
59        // that value will later be masked out by the validity bitmap
60
61        // in the case that the value array has got null values, we need to check every validity
62        // value and collect the indices.
63        // because the length of the array is not known, we first collect the null indexes, offset
64        // with the insertion of empty rows (as None) and later create a validity bitmap
65        if arr.null_count() > 0 {
66            let validity_values = arr.validity().unwrap();
67
68            for &o in &offsets[1..] {
69                let o = o as usize;
70                if options.empty_as_null && o == last {
71                    if start != last {
72                        #[cfg(debug_assertions)]
73                        new_values.extend_from_slice(&values[start..last]);
74
75                        #[cfg(not(debug_assertions))]
76                        unsafe {
77                            new_values.extend_from_slice(values.get_unchecked(start..last))
78                        };
79
80                        // SAFETY:
81                        // we are in bounds
82                        unsafe {
83                            unset_nulls(
84                                start,
85                                last,
86                                validity_values,
87                                &mut nulls,
88                                &empty_row_idx,
89                                base_offset,
90                            )
91                        }
92                    }
93
94                    empty_row_idx.push(o + empty_row_idx.len() - base_offset);
95                    new_values.push(T::Native::default());
96                    start = o;
97                }
98                last = o;
99            }
100
101            // final null check
102            // SAFETY:
103            // we are in bounds
104            unsafe {
105                unset_nulls(
106                    start,
107                    last,
108                    validity_values,
109                    &mut nulls,
110                    &empty_row_idx,
111                    base_offset,
112                )
113            }
114        } else {
115            for &o in &offsets[1..] {
116                let o = o as usize;
117                if options.empty_as_null && o == last {
118                    if start != last {
119                        unsafe { new_values.extend_from_slice(values.get_unchecked(start..last)) };
120                    }
121
122                    empty_row_idx.push(o + empty_row_idx.len() - base_offset);
123                    new_values.push(T::Native::default());
124                    start = o;
125                }
126                last = o;
127            }
128        }
129
130        // add remaining values
131        new_values.extend_from_slice(&values[start..]);
132
133        let mut validity = MutableBitmap::with_capacity(new_values.len());
134        validity.extend_constant(new_values.len(), true);
135        let validity_slice = validity.as_mut_slice();
136
137        for i in empty_row_idx {
138            unsafe { set_bit_unchecked(validity_slice, i, false) }
139        }
140        for i in nulls {
141            unsafe { set_bit_unchecked(validity_slice, i, false) }
142        }
143        let arr = PrimitiveArray::new(
144            T::get_static_dtype().to_arrow(CompatLevel::newest()),
145            new_values.into(),
146            Some(validity.into()),
147        );
148        Series::try_from((self.name().clone(), Box::new(arr) as ArrayRef)).unwrap()
149    }
150}
151
152#[cfg(feature = "dtype-f16")]
153impl ExplodeByOffsets for Float16Chunked {
154    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
155        self.apply_as_ints(|s| {
156            let ca = s.u16().unwrap();
157            ca.explode_by_offsets(offsets, options)
158        })
159    }
160}
161
162impl ExplodeByOffsets for Float32Chunked {
163    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
164        self.apply_as_ints(|s| {
165            let ca = s.u32().unwrap();
166            ca.explode_by_offsets(offsets, options)
167        })
168    }
169}
170impl ExplodeByOffsets for Float64Chunked {
171    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
172        self.apply_as_ints(|s| {
173            let ca = s.u64().unwrap();
174            ca.explode_by_offsets(offsets, options)
175        })
176    }
177}
178
179impl ExplodeByOffsets for NullChunked {
180    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
181        let mut last_offset = offsets[0];
182
183        let mut len = 0;
184        for &offset in &offsets[1..] {
185            // If offset == last_offset we have an empty list and a new row is inserted,
186            // therefore we always increase at least 1.
187            len += std::cmp::max(offset - last_offset, i64::from(options.empty_as_null)) as usize;
188            last_offset = offset;
189        }
190        NullChunked::new(self.name.clone(), len).into_series()
191    }
192}
193
194impl ExplodeByOffsets for BooleanChunked {
195    fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
196        debug_assert_eq!(self.chunks.len(), 1);
197        let arr = self.downcast_iter().next().unwrap();
198
199        let cap = get_capacity(offsets);
200        let mut builder = BooleanChunkedBuilder::new(self.name().clone(), cap);
201
202        let mut start = offsets[0] as usize;
203        let mut last = start;
204        for &o in &offsets[1..] {
205            let o = o as usize;
206            if options.empty_as_null && o == last {
207                if start != last {
208                    let vals = arr.slice_typed(start, last - start);
209
210                    if vals.null_count() == 0 {
211                        builder
212                            .array_builder
213                            .extend_trusted_len_values(vals.values_iter())
214                    } else {
215                        builder.array_builder.extend_trusted_len(vals.into_iter());
216                    }
217                }
218                builder.append_null();
219                start = o;
220            }
221            last = o;
222        }
223        let vals = arr.slice_typed(start, last - start);
224        if vals.null_count() == 0 {
225            builder
226                .array_builder
227                .extend_trusted_len_values(vals.values_iter())
228        } else {
229            builder.array_builder.extend_trusted_len(vals.into_iter());
230        }
231        builder.finish().into()
232    }
233}
234
235/// Convert Arrow array offsets to indexes of the original list
236pub(crate) fn offsets_to_indexes(
237    offsets: &[i64],
238    capacity: usize,
239    options: ExplodeOptions,
240    validity: Option<&Bitmap>,
241) -> Vec<IdxSize> {
242    if offsets.is_empty() {
243        return vec![];
244    }
245
246    let mut idx = Vec::with_capacity(capacity);
247
248    let mut last_idx = 0;
249    match validity {
250        None => {
251            for (offset_start, offset_end) in offsets.iter().zip(offsets[1..].iter()) {
252                if idx.len() >= capacity {
253                    // significant speed-up in edge cases with many offsets,
254                    // no measurable overhead in typical case due to branch prediction
255                    break;
256                }
257
258                if offset_start == offset_end {
259                    if options.empty_as_null {
260                        // if the previous offset is equal to the current offset, we have an empty
261                        // list and we duplicate the previous index
262                        idx.push(last_idx);
263                    }
264                } else {
265                    let width = (offset_end - offset_start) as usize;
266                    for _ in 0..width {
267                        idx.push(last_idx);
268                    }
269                }
270
271                last_idx += 1;
272            }
273        },
274        Some(validity) => {
275            for ((offset_start, offset_end), is_valid) in
276                offsets.iter().zip(offsets[1..].iter()).zip(validity.iter())
277            {
278                if idx.len() >= capacity {
279                    // significant speed-up in edge cases with many offsets,
280                    // no measurable overhead in typical case due to branch prediction
281                    break;
282                }
283
284                if offset_start == offset_end {
285                    if (is_valid && options.empty_as_null) || (!is_valid && options.keep_nulls) {
286                        // if the previous offset is equal to the current offset, we have an empty
287                        // list and we duplicate the previous index
288                        idx.push(last_idx);
289                    }
290                } else {
291                    let width = (offset_end - offset_start) as usize;
292                    for _ in 0..width {
293                        idx.push(last_idx);
294                    }
295                }
296
297                last_idx += 1;
298            }
299        },
300    }
301
302    // take the remaining values
303    for _ in 0..capacity.saturating_sub(idx.len()) {
304        idx.push(last_idx);
305    }
306    idx.truncate(capacity);
307    idx
308}
309
310#[cfg(test)]
311mod test {
312    use super::*;
313    use crate::chunked_array::builder::get_list_builder;
314
315    #[test]
316    fn test_explode_list() -> PolarsResult<()> {
317        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
318
319        builder
320            .append_series(&Series::new(PlSmallStr::EMPTY, &[1, 2, 3, 3]))
321            .unwrap();
322        builder
323            .append_series(&Series::new(PlSmallStr::EMPTY, &[1]))
324            .unwrap();
325        builder
326            .append_series(&Series::new(PlSmallStr::EMPTY, &[2]))
327            .unwrap();
328
329        let ca = builder.finish();
330        assert!(ca._can_fast_explode());
331
332        // normal explode
333        let exploded = ca.explode(ExplodeOptions {
334            empty_as_null: true,
335            keep_nulls: true,
336        })?;
337        let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
338        assert_eq!(out, &[1, 2, 3, 3, 1, 2]);
339
340        // sliced explode
341        let exploded = ca.slice(0, 1).explode(ExplodeOptions {
342            empty_as_null: true,
343            keep_nulls: true,
344        })?;
345        let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
346        assert_eq!(out, &[1, 2, 3, 3]);
347
348        Ok(())
349    }
350
351    #[test]
352    fn test_explode_empty_list_slot() -> PolarsResult<()> {
353        // primitive
354        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
355        builder
356            .append_series(&Series::new(PlSmallStr::EMPTY, &[1i32, 2]))
357            .unwrap();
358        builder
359            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
360            .unwrap();
361        builder
362            .append_series(&Series::new(PlSmallStr::EMPTY, &[3i32]))
363            .unwrap();
364
365        let ca = builder.finish();
366        let exploded = ca.explode(ExplodeOptions {
367            empty_as_null: true,
368            keep_nulls: true,
369        })?;
370        assert_eq!(
371            Vec::from(exploded.i32()?),
372            &[Some(1), Some(2), None, Some(3)]
373        );
374
375        // more primitive
376        let mut builder = get_list_builder(&DataType::Int32, 5, 5, PlSmallStr::from_static("a"));
377        builder
378            .append_series(&Series::new(PlSmallStr::EMPTY, &[1i32]))
379            .unwrap();
380        builder
381            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
382            .unwrap();
383        builder
384            .append_series(&Series::new(PlSmallStr::EMPTY, &[2i32]))
385            .unwrap();
386        builder
387            .append_series(&Int32Chunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
388            .unwrap();
389        builder
390            .append_series(&Series::new(PlSmallStr::EMPTY, &[3, 4i32]))
391            .unwrap();
392
393        let ca = builder.finish();
394        let exploded = ca.explode(ExplodeOptions {
395            empty_as_null: true,
396            keep_nulls: true,
397        })?;
398        assert_eq!(
399            Vec::from(exploded.i32()?),
400            &[Some(1), None, Some(2), None, Some(3), Some(4)]
401        );
402
403        // string
404        let mut builder = get_list_builder(&DataType::String, 5, 5, PlSmallStr::from_static("a"));
405        builder
406            .append_series(&Series::new(PlSmallStr::EMPTY, &["abc"]))
407            .unwrap();
408        builder
409            .append_series(
410                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
411                    PlSmallStr::EMPTY,
412                    &[],
413                )
414                .into_series(),
415            )
416            .unwrap();
417        builder
418            .append_series(&Series::new(PlSmallStr::EMPTY, &["de"]))
419            .unwrap();
420        builder
421            .append_series(
422                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
423                    PlSmallStr::EMPTY,
424                    &[],
425                )
426                .into_series(),
427            )
428            .unwrap();
429        builder
430            .append_series(&Series::new(PlSmallStr::EMPTY, &["fg"]))
431            .unwrap();
432        builder
433            .append_series(
434                &<StringChunked as NewChunkedArray<StringType, &str>>::from_slice(
435                    PlSmallStr::EMPTY,
436                    &[],
437                )
438                .into_series(),
439            )
440            .unwrap();
441
442        let ca = builder.finish();
443        let exploded = ca.explode(ExplodeOptions {
444            empty_as_null: true,
445            keep_nulls: true,
446        })?;
447        assert_eq!(
448            Vec::from(exploded.str()?),
449            &[Some("abc"), None, Some("de"), None, Some("fg"), None]
450        );
451
452        // boolean
453        let mut builder = get_list_builder(&DataType::Boolean, 5, 5, PlSmallStr::from_static("a"));
454        builder
455            .append_series(&Series::new(PlSmallStr::EMPTY, &[true]))
456            .unwrap();
457        builder
458            .append_series(&BooleanChunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
459            .unwrap();
460        builder
461            .append_series(&Series::new(PlSmallStr::EMPTY, &[false]))
462            .unwrap();
463        builder
464            .append_series(&BooleanChunked::from_slice(PlSmallStr::EMPTY, &[]).into_series())
465            .unwrap();
466        builder
467            .append_series(&Series::new(PlSmallStr::EMPTY, &[true, true]))
468            .unwrap();
469
470        let ca = builder.finish();
471        let exploded = ca.explode(ExplodeOptions {
472            empty_as_null: true,
473            keep_nulls: true,
474        })?;
475        assert_eq!(
476            Vec::from(exploded.bool()?),
477            &[Some(true), None, Some(false), None, Some(true), Some(true)]
478        );
479
480        Ok(())
481    }
482
483    #[test]
484    fn test_row_offsets() {
485        let offsets = &[0, 1, 2, 2, 3, 4, 4];
486        let out = offsets_to_indexes(
487            offsets,
488            6,
489            ExplodeOptions {
490                empty_as_null: true,
491                keep_nulls: true,
492            },
493            None,
494        );
495        assert_eq!(out, &[0, 1, 2, 3, 4, 5]);
496    }
497
498    #[test]
499    fn test_empty_row_offsets() {
500        let offsets = &[0, 0];
501        let out = offsets_to_indexes(
502            offsets,
503            0,
504            ExplodeOptions {
505                empty_as_null: true,
506                keep_nulls: true,
507            },
508            None,
509        );
510        let expected: Vec<IdxSize> = Vec::new();
511        assert_eq!(out, expected);
512    }
513
514    #[test]
515    fn test_row_offsets_over_capacity() {
516        let offsets = &[0, 1, 1, 2, 2];
517        let out = offsets_to_indexes(
518            offsets,
519            2,
520            ExplodeOptions {
521                empty_as_null: true,
522                keep_nulls: true,
523            },
524            None,
525        );
526        assert_eq!(out, &[0, 1]);
527    }
528
529    #[test]
530    fn test_row_offsets_nonzero_first_offset() {
531        let offsets = &[3, 6, 8];
532        let out = offsets_to_indexes(
533            offsets,
534            10,
535            ExplodeOptions {
536                empty_as_null: true,
537                keep_nulls: true,
538            },
539            None,
540        );
541        assert_eq!(out, &[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]);
542    }
543}