polars_core/chunked_array/ops/
chunkops.rs1use std::borrow::Cow;
2use std::cell::Cell;
3
4use arrow::bitmap::{Bitmap, BitmapBuilder};
5use arrow::compute::concatenate::concatenate_unchecked;
6use polars_error::constants::LENGTH_LIMIT_MSG;
7
8use super::*;
9use crate::chunked_array::flags::StatisticsFlags;
10#[cfg(feature = "object")]
11use crate::chunked_array::object::builder::ObjectChunkedBuilder;
12use crate::utils::slice_offsets;
13
14pub(crate) fn split_at(
15 chunks: &[ArrayRef],
16 offset: i64,
17 own_length: usize,
18) -> (Vec<ArrayRef>, Vec<ArrayRef>) {
19 let mut new_chunks_left = Vec::with_capacity(1);
20 let mut new_chunks_right = Vec::with_capacity(1);
21 let (raw_offset, _) = slice_offsets(offset, 0, own_length);
22
23 let mut remaining_offset = raw_offset;
24 let mut iter = chunks.iter();
25
26 for chunk in &mut iter {
27 let chunk_len = chunk.len();
28 if remaining_offset > 0 && remaining_offset >= chunk_len {
29 remaining_offset -= chunk_len;
30 new_chunks_left.push(chunk.clone());
31 continue;
32 }
33
34 let (l, r) = chunk.split_at_boxed(remaining_offset);
35 new_chunks_left.push(l);
36 new_chunks_right.push(r);
37 break;
38 }
39
40 for chunk in iter {
41 new_chunks_right.push(chunk.clone())
42 }
43 if new_chunks_left.is_empty() {
44 new_chunks_left.push(chunks[0].sliced(0, 0));
45 }
46 if new_chunks_right.is_empty() {
47 new_chunks_right.push(chunks[0].sliced(0, 0));
48 }
49 (new_chunks_left, new_chunks_right)
50}
51
52pub(crate) fn slice(
53 chunks: &[ArrayRef],
54 offset: i64,
55 slice_length: usize,
56 own_length: usize,
57) -> (Vec<ArrayRef>, usize) {
58 let mut new_chunks = Vec::with_capacity(1);
59 let (raw_offset, slice_len) = slice_offsets(offset, slice_length, own_length);
60
61 let mut remaining_length = slice_len;
62 let mut remaining_offset = raw_offset;
63 let mut new_len = 0;
64
65 for chunk in chunks {
66 let chunk_len = chunk.len();
67 if remaining_offset > 0 && remaining_offset >= chunk_len {
68 remaining_offset -= chunk_len;
69 continue;
70 }
71 let take_len = if remaining_length + remaining_offset > chunk_len {
72 chunk_len - remaining_offset
73 } else {
74 remaining_length
75 };
76 new_len += take_len;
77
78 debug_assert!(remaining_offset + take_len <= chunk.len());
79 unsafe {
80 new_chunks.push(chunk.sliced_unchecked(remaining_offset, take_len));
83 }
84 remaining_length -= take_len;
85 remaining_offset = 0;
86 if remaining_length == 0 {
87 break;
88 }
89 }
90 if new_chunks.is_empty() {
91 new_chunks.push(chunks[0].sliced(0, 0));
92 }
93 (new_chunks, new_len)
94}
95
96thread_local!(static CHECK_LENGTH: Cell<bool> = const { Cell::new(true) });
101
102pub unsafe fn _set_check_length(check: bool) {
107 CHECK_LENGTH.set(check)
108}
109
110impl<T: PolarsDataType> ChunkedArray<T> {
111 #[inline]
113 pub fn len(&self) -> usize {
114 self.length
115 }
116
117 #[inline]
119 pub fn null_count(&self) -> usize {
120 self.null_count
121 }
122
123 pub unsafe fn set_null_count(&mut self, null_count: usize) {
132 self.null_count = null_count;
133 }
134
135 pub fn is_empty(&self) -> bool {
137 self.len() == 0
138 }
139
140 pub(crate) fn compute_len(&mut self) {
142 fn inner(chunks: &[ArrayRef]) -> usize {
143 match chunks.len() {
144 1 => chunks[0].len(),
146 _ => chunks.iter().fold(0, |acc, arr| acc + arr.len()),
147 }
148 }
149 let len = inner(&self.chunks);
150 if len >= (IdxSize::MAX as usize) && CHECK_LENGTH.get() {
152 panic!("{}", LENGTH_LIMIT_MSG);
153 }
154 self.length = len;
155 self.null_count = self
156 .chunks
157 .iter()
158 .map(|arr| arr.null_count())
159 .sum::<usize>();
160 }
161
162 pub fn rechunk(&self) -> Cow<'_, Self> {
165 match self.dtype() {
166 #[cfg(feature = "object")]
167 DataType::Object(_) => {
168 panic!("implementation error")
169 },
170 _ => {
171 if self.chunks.len() == 1 {
172 Cow::Borrowed(self)
173 } else {
174 let chunks = vec![concatenate_unchecked(&self.chunks).unwrap()];
175
176 let mut ca = unsafe { self.copy_with_chunks(chunks) };
177 use StatisticsFlags as F;
178 ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
179 Cow::Owned(ca)
180 }
181 },
182 }
183 }
184
185 pub fn rechunk_mut(&mut self) {
187 if self.chunks.len() > 1 {
188 let rechunked = concatenate_unchecked(&self.chunks).unwrap();
189 if self.chunks.capacity() <= 8 {
190 self.chunks.clear();
192 self.chunks.push(rechunked);
193 } else {
194 self.chunks = vec![rechunked];
195 }
196 }
197 }
198
199 pub fn rechunk_validity(&self) -> Option<Bitmap> {
200 if self.chunks.len() == 1 {
201 return self.chunks[0].validity().cloned();
202 }
203
204 if !self.has_nulls() || self.is_empty() {
205 return None;
206 }
207
208 let mut bm = BitmapBuilder::with_capacity(self.len());
209 for arr in self.downcast_iter() {
210 if let Some(v) = arr.validity() {
211 bm.extend_from_bitmap(v);
212 } else {
213 bm.extend_constant(arr.len(), true);
214 }
215 }
216 bm.into_opt_validity()
217 }
218
219 pub fn with_validities(&mut self, validities: &[Option<Bitmap>]) {
220 assert_eq!(validities.len(), self.chunks.len());
221
222 for (arr, validity) in unsafe { self.chunks_mut().iter_mut() }.zip(validities.iter()) {
225 *arr = arr.with_validity(validity.clone())
226 }
227 }
228
229 pub fn split_at(&self, offset: i64) -> (Self, Self) {
235 let (l, r) = split_at(&self.chunks, offset, self.len());
237 let mut out_l = unsafe { self.copy_with_chunks(l) };
238 let mut out_r = unsafe { self.copy_with_chunks(r) };
239
240 use StatisticsFlags as F;
241 out_l.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
242 out_r.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
243
244 (out_l, out_r)
245 }
246
247 pub fn slice(&self, offset: i64, length: usize) -> Self {
253 let exec = || {
256 let (chunks, len) = slice(&self.chunks, offset, length, self.len());
257 let mut out = unsafe { self.copy_with_chunks(chunks) };
258
259 use StatisticsFlags as F;
260 out.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST);
261 out.length = len;
262
263 out
264 };
265
266 match length {
267 0 => match self.dtype() {
268 #[cfg(feature = "object")]
269 DataType::Object(_) => exec(),
270 _ => self.clear(),
271 },
272 _ => exec(),
273 }
274 }
275
276 #[must_use]
278 pub fn limit(&self, num_elements: usize) -> Self
279 where
280 Self: Sized,
281 {
282 self.slice(0, num_elements)
283 }
284
285 #[must_use]
287 pub fn head(&self, length: Option<usize>) -> Self
288 where
289 Self: Sized,
290 {
291 match length {
292 Some(len) => self.slice(0, std::cmp::min(len, self.len())),
293 None => self.slice(0, std::cmp::min(10, self.len())),
294 }
295 }
296
297 #[must_use]
299 pub fn tail(&self, length: Option<usize>) -> Self
300 where
301 Self: Sized,
302 {
303 let len = match length {
304 Some(len) => std::cmp::min(len, self.len()),
305 None => std::cmp::min(10, self.len()),
306 };
307 self.slice(-(len as i64), len)
308 }
309
310 pub fn prune_empty_chunks(&mut self) {
312 let mut count = 0u32;
313 unsafe {
314 self.chunks_mut().retain(|arr| {
315 count += 1;
316 if count == 1 {
318 true
319 } else {
320 !arr.is_empty()
322 }
323 })
324 }
325 }
326}
327
328#[cfg(feature = "object")]
329impl<T: PolarsObject> ObjectChunked<T> {
330 pub(crate) fn rechunk_object(&self) -> Self {
331 if self.chunks.len() == 1 {
332 self.clone()
333 } else {
334 let mut builder = ObjectChunkedBuilder::new(self.name().clone(), self.len());
335 let chunks = self.downcast_iter();
336
337 if !self.has_nulls() {
340 for arr in chunks {
341 for idx in 0..arr.len() {
342 builder.append_value(arr.value(idx).clone())
343 }
344 }
345 } else {
346 for arr in chunks {
347 for idx in 0..arr.len() {
348 if arr.is_valid(idx) {
349 builder.append_value(arr.value(idx).clone())
350 } else {
351 builder.append_null()
352 }
353 }
354 }
355 }
356 builder.finish()
357 }
358 }
359}
360
361#[cfg(test)]
362mod test {
363 #[cfg(feature = "dtype-categorical")]
364 use crate::prelude::*;
365
366 #[test]
367 #[cfg(feature = "dtype-categorical")]
368 fn test_categorical_map_after_rechunk() {
369 let s = Series::new(PlSmallStr::EMPTY, &["foo", "bar", "spam"]);
370 let mut a = s
371 .cast(&DataType::Categorical(None, Default::default()))
372 .unwrap();
373
374 a.append(&a.slice(0, 2)).unwrap();
375 let a = a.rechunk();
376 assert!(a.categorical().unwrap().get_rev_map().len() > 0);
377 }
378}