polars_core/chunked_array/ops/unique/
mod.rs

1use std::hash::Hash;
2use std::ops::Deref;
3
4use arrow::bitmap::MutableBitmap;
5use polars_compute::unique::BooleanUniqueKernelState;
6use polars_utils::total_ord::{ToTotalOrd, TotalHash, TotalOrdWrap};
7
8use crate::hashing::_HASHMAP_INIT_SIZE;
9use crate::prelude::*;
10use crate::series::IsSorted;
11
12fn finish_is_unique_helper(
13    unique_idx: Vec<IdxSize>,
14    len: IdxSize,
15    setter: bool,
16    default: bool,
17) -> BooleanChunked {
18    let mut values = MutableBitmap::with_capacity(len as usize);
19    values.extend_constant(len as usize, default);
20
21    for idx in unique_idx {
22        unsafe { values.set_unchecked(idx as usize, setter) }
23    }
24    let arr = BooleanArray::from_data_default(values.into(), None);
25    arr.into()
26}
27
28pub(crate) fn is_unique_helper(
29    groups: &GroupPositions,
30    len: IdxSize,
31    unique_val: bool,
32    duplicated_val: bool,
33) -> BooleanChunked {
34    debug_assert_ne!(unique_val, duplicated_val);
35
36    let idx = match groups.deref() {
37        GroupsType::Idx(groups) => groups
38            .iter()
39            .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
40            .collect::<Vec<_>>(),
41        GroupsType::Slice { groups, .. } => groups
42            .iter()
43            .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
44            .collect(),
45    };
46    finish_is_unique_helper(idx, len, unique_val, duplicated_val)
47}
48
49#[cfg(feature = "object")]
50impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
51    fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
52        polars_bail!(opq = unique, self.dtype());
53    }
54
55    fn arg_unique(&self) -> PolarsResult<IdxCa> {
56        polars_bail!(opq = arg_unique, self.dtype());
57    }
58
59    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
60        polars_bail!(opq = unique_id, self.dtype());
61    }
62}
63
64fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
65where
66    T: ToTotalOrd,
67    <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
68{
69    let mut set = PlHashSet::new();
70    let mut unique = Vec::with_capacity(capacity);
71    a.enumerate().for_each(|(idx, val)| {
72        if set.insert(val.to_total_ord()) {
73            unique.push(idx as IdxSize)
74        }
75    });
76    unique
77}
78
79macro_rules! arg_unique_ca {
80    ($ca:expr) => {{
81        match $ca.has_nulls() {
82            false => arg_unique($ca.into_no_null_iter(), $ca.len()),
83            _ => arg_unique($ca.iter(), $ca.len()),
84        }
85    }};
86}
87
88impl<T> ChunkUnique for ChunkedArray<T>
89where
90    T: PolarsNumericType,
91    T::Native: TotalHash + TotalEq + ToTotalOrd,
92    <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
93    ChunkedArray<T>: for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
94{
95    fn unique(&self) -> PolarsResult<Self> {
96        // prevent stackoverflow repeated sorted.unique call
97        if self.is_empty() {
98            return Ok(self.clone());
99        }
100        match self.is_sorted_flag() {
101            IsSorted::Ascending | IsSorted::Descending => {
102                if self.null_count() > 0 {
103                    let mut arr = MutablePrimitiveArray::with_capacity(self.len());
104
105                    if !self.is_empty() {
106                        let mut iter = self.iter();
107                        let last = iter.next().unwrap();
108                        arr.push(last);
109                        let mut last = last.to_total_ord();
110
111                        let to_extend = iter.filter(|opt_val| {
112                            let opt_val_tot_ord = opt_val.to_total_ord();
113                            let out = opt_val_tot_ord != last;
114                            last = opt_val_tot_ord;
115                            out
116                        });
117
118                        arr.extend(to_extend);
119                    }
120
121                    let arr: PrimitiveArray<T::Native> = arr.into();
122                    Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
123                } else {
124                    let mask = self.not_equal_missing(&self.shift(1));
125                    self.filter(&mask)
126                }
127            },
128            IsSorted::Not => {
129                let sorted = self.sort(false);
130                sorted.unique()
131            },
132        }
133    }
134
135    fn arg_unique(&self) -> PolarsResult<IdxCa> {
136        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
137    }
138
139    fn n_unique(&self) -> PolarsResult<usize> {
140        // prevent stackoverflow repeated sorted.unique call
141        if self.is_empty() {
142            return Ok(0);
143        }
144        match self.is_sorted_flag() {
145            IsSorted::Ascending | IsSorted::Descending => {
146                if self.null_count() > 0 {
147                    let mut count = 0;
148
149                    if self.is_empty() {
150                        return Ok(count);
151                    }
152
153                    let mut iter = self.iter();
154                    let mut last = iter.next().unwrap().to_total_ord();
155
156                    count += 1;
157
158                    iter.for_each(|opt_val| {
159                        let opt_val = opt_val.to_total_ord();
160                        if opt_val != last {
161                            last = opt_val;
162                            count += 1;
163                        }
164                    });
165
166                    Ok(count)
167                } else {
168                    let mask = self.not_equal_missing(&self.shift(1));
169                    Ok(mask.sum().unwrap() as usize)
170                }
171            },
172            IsSorted::Not => {
173                let sorted = self.sort(false);
174                sorted.n_unique()
175            },
176        }
177    }
178
179    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
180        let mut n = IdxSize::from(self.has_nulls());
181        let mut indices = PlHashMap::new();
182        let ids = self
183            .iter()
184            .map(|v| match v {
185                None => 0,
186                Some(v) => *indices.entry(TotalOrdWrap(v)).or_insert_with(|| {
187                    let i = n;
188                    n += 1;
189                    i
190                }),
191            })
192            .collect_trusted();
193        Ok((n, ids))
194    }
195}
196
197impl ChunkUnique for StringChunked {
198    fn unique(&self) -> PolarsResult<Self> {
199        let out = self.as_binary().unique()?;
200        Ok(unsafe { out.to_string_unchecked() })
201    }
202
203    fn arg_unique(&self) -> PolarsResult<IdxCa> {
204        self.as_binary().arg_unique()
205    }
206
207    fn n_unique(&self) -> PolarsResult<usize> {
208        self.as_binary().n_unique()
209    }
210
211    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
212        self.as_binary().unique_id()
213    }
214}
215
216impl ChunkUnique for BinaryChunked {
217    fn unique(&self) -> PolarsResult<Self> {
218        match self.null_count() {
219            0 => {
220                let mut set =
221                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
222                for arr in self.downcast_iter() {
223                    set.extend(arr.values_iter())
224                }
225                Ok(BinaryChunked::from_iter_values(
226                    self.name().clone(),
227                    set.iter().copied(),
228                ))
229            },
230            _ => {
231                let mut set =
232                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
233                for arr in self.downcast_iter() {
234                    set.extend(arr.iter())
235                }
236                Ok(BinaryChunked::from_iter_options(
237                    self.name().clone(),
238                    set.iter().copied(),
239                ))
240            },
241        }
242    }
243
244    fn arg_unique(&self) -> PolarsResult<IdxCa> {
245        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
246    }
247
248    fn n_unique(&self) -> PolarsResult<usize> {
249        let mut set: PlHashSet<&[u8]> = PlHashSet::new();
250        if self.null_count() > 0 {
251            for arr in self.downcast_iter() {
252                set.extend(arr.into_iter().flatten())
253            }
254            Ok(set.len() + 1)
255        } else {
256            for arr in self.downcast_iter() {
257                set.extend(arr.values_iter())
258            }
259            Ok(set.len())
260        }
261    }
262
263    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
264        let mut n = IdxSize::from(self.has_nulls());
265        let mut indices = PlHashMap::new();
266        let ids = self
267            .iter()
268            .map(|v| match v {
269                None => 0,
270                Some(v) => *indices.entry(v).or_insert_with(|| {
271                    let i = n;
272                    n += 1;
273                    i
274                }),
275            })
276            .collect_trusted();
277        Ok((n, ids))
278    }
279}
280
281impl ChunkUnique for BinaryOffsetChunked {
282    fn unique(&self) -> PolarsResult<Self> {
283        match self.null_count() {
284            0 => {
285                let mut set =
286                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
287                for arr in self.downcast_iter() {
288                    set.extend(arr.values_iter())
289                }
290                Ok(set.iter().copied().collect_ca(self.name().clone()))
291            },
292            _ => {
293                let mut set =
294                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
295                for arr in self.downcast_iter() {
296                    set.extend(arr.iter())
297                }
298                Ok(set.iter().copied().collect_ca(self.name().clone()))
299            },
300        }
301    }
302
303    fn arg_unique(&self) -> PolarsResult<IdxCa> {
304        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
305    }
306
307    fn n_unique(&self) -> PolarsResult<usize> {
308        let mut set: PlHashSet<&[u8]> = PlHashSet::new();
309        if self.null_count() > 0 {
310            for arr in self.downcast_iter() {
311                set.extend(arr.into_iter().flatten())
312            }
313            Ok(set.len() + 1)
314        } else {
315            for arr in self.downcast_iter() {
316                set.extend(arr.values_iter())
317            }
318            Ok(set.len())
319        }
320    }
321
322    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
323        let mut n = IdxSize::from(self.has_nulls());
324        let mut indices = PlHashMap::new();
325        let ids = self
326            .iter()
327            .map(|v| match v {
328                None => 0,
329                Some(v) => *indices.entry(v).or_insert_with(|| {
330                    let i = n;
331                    n += 1;
332                    i
333                }),
334            })
335            .collect_trusted();
336        Ok((n, ids))
337    }
338}
339
340impl ChunkUnique for BooleanChunked {
341    fn unique(&self) -> PolarsResult<Self> {
342        use polars_compute::unique::RangedUniqueKernel;
343
344        let mut state = BooleanUniqueKernelState::new();
345
346        for arr in self.downcast_iter() {
347            state.append(arr);
348
349            if state.has_seen_all() {
350                break;
351            }
352        }
353
354        let unique = state.finalize_unique();
355
356        Ok(Self::with_chunk(self.name().clone(), unique))
357    }
358
359    fn arg_unique(&self) -> PolarsResult<IdxCa> {
360        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
361    }
362
363    fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
364        let num_nulls = self.null_count();
365        let num_trues = self.num_trues();
366
367        let true_idx = IdxSize::from(num_nulls > 0);
368        let false_idx = IdxSize::from(num_nulls > 0) + IdxSize::from(num_trues > 0);
369        let ids = self
370            .iter()
371            .map(|v| match v {
372                None => 0,
373                Some(false) => false_idx,
374                Some(true) => true_idx,
375            })
376            .collect_trusted();
377        Ok((false_idx + 1, ids))
378    }
379}
380
381#[cfg(test)]
382mod test {
383    use crate::prelude::*;
384
385    #[test]
386    fn unique() {
387        let ca =
388            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
389        assert_eq!(
390            ca.unique()
391                .unwrap()
392                .sort(false)
393                .into_iter()
394                .collect::<Vec<_>>(),
395            vec![Some(1), Some(2), Some(3)]
396        );
397        let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
398        assert_eq!(
399            ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
400            vec![Some(false), Some(true)]
401        );
402
403        let ca = StringChunked::new(
404            PlSmallStr::EMPTY,
405            &[Some("a"), None, Some("a"), Some("b"), None],
406        );
407        assert_eq!(
408            Vec::from(&ca.unique().unwrap().sort(false)),
409            &[None, Some("a"), Some("b")]
410        );
411    }
412
413    #[test]
414    fn arg_unique() {
415        let ca =
416            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
417        assert_eq!(
418            ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
419            vec![Some(0), Some(1), Some(4)]
420        );
421    }
422}