polars_core/chunked_array/ops/unique/
mod.rs

1use std::hash::Hash;
2use std::ops::Deref;
3
4use arrow::bitmap::MutableBitmap;
5use polars_compute::unique::BooleanUniqueKernelState;
6use polars_utils::total_ord::{ToTotalOrd, TotalHash};
7
8use crate::hashing::_HASHMAP_INIT_SIZE;
9use crate::prelude::*;
10use crate::series::IsSorted;
11
12fn finish_is_unique_helper(
13    unique_idx: Vec<IdxSize>,
14    len: IdxSize,
15    setter: bool,
16    default: bool,
17) -> BooleanChunked {
18    let mut values = MutableBitmap::with_capacity(len as usize);
19    values.extend_constant(len as usize, default);
20
21    for idx in unique_idx {
22        unsafe { values.set_unchecked(idx as usize, setter) }
23    }
24    let arr = BooleanArray::from_data_default(values.into(), None);
25    arr.into()
26}
27
28pub(crate) fn is_unique_helper(
29    groups: &GroupPositions,
30    len: IdxSize,
31    unique_val: bool,
32    duplicated_val: bool,
33) -> BooleanChunked {
34    debug_assert_ne!(unique_val, duplicated_val);
35
36    let idx = match groups.deref() {
37        GroupsType::Idx(groups) => groups
38            .iter()
39            .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
40            .collect::<Vec<_>>(),
41        GroupsType::Slice { groups, .. } => groups
42            .iter()
43            .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
44            .collect(),
45    };
46    finish_is_unique_helper(idx, len, unique_val, duplicated_val)
47}
48
49#[cfg(feature = "object")]
50impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
51    fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
52        polars_bail!(opq = unique, self.dtype());
53    }
54
55    fn arg_unique(&self) -> PolarsResult<IdxCa> {
56        polars_bail!(opq = arg_unique, self.dtype());
57    }
58}
59
60fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
61where
62    T: ToTotalOrd,
63    <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
64{
65    let mut set = PlHashSet::new();
66    let mut unique = Vec::with_capacity(capacity);
67    a.enumerate().for_each(|(idx, val)| {
68        if set.insert(val.to_total_ord()) {
69            unique.push(idx as IdxSize)
70        }
71    });
72    unique
73}
74
75macro_rules! arg_unique_ca {
76    ($ca:expr) => {{
77        match $ca.has_nulls() {
78            false => arg_unique($ca.into_no_null_iter(), $ca.len()),
79            _ => arg_unique($ca.iter(), $ca.len()),
80        }
81    }};
82}
83
84impl<T> ChunkUnique for ChunkedArray<T>
85where
86    T: PolarsNumericType,
87    T::Native: TotalHash + TotalEq + ToTotalOrd,
88    <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
89    ChunkedArray<T>: for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
90{
91    fn unique(&self) -> PolarsResult<Self> {
92        // prevent stackoverflow repeated sorted.unique call
93        if self.is_empty() {
94            return Ok(self.clone());
95        }
96        match self.is_sorted_flag() {
97            IsSorted::Ascending | IsSorted::Descending => {
98                if self.null_count() > 0 {
99                    let mut arr = MutablePrimitiveArray::with_capacity(self.len());
100
101                    if !self.is_empty() {
102                        let mut iter = self.iter();
103                        let last = iter.next().unwrap();
104                        arr.push(last);
105                        let mut last = last.to_total_ord();
106
107                        let to_extend = iter.filter(|opt_val| {
108                            let opt_val_tot_ord = opt_val.to_total_ord();
109                            let out = opt_val_tot_ord != last;
110                            last = opt_val_tot_ord;
111                            out
112                        });
113
114                        arr.extend(to_extend);
115                    }
116
117                    let arr: PrimitiveArray<T::Native> = arr.into();
118                    Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
119                } else {
120                    let mask = self.not_equal_missing(&self.shift(1));
121                    self.filter(&mask)
122                }
123            },
124            IsSorted::Not => {
125                let sorted = self.sort(false);
126                sorted.unique()
127            },
128        }
129    }
130
131    fn arg_unique(&self) -> PolarsResult<IdxCa> {
132        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
133    }
134
135    fn n_unique(&self) -> PolarsResult<usize> {
136        // prevent stackoverflow repeated sorted.unique call
137        if self.is_empty() {
138            return Ok(0);
139        }
140        match self.is_sorted_flag() {
141            IsSorted::Ascending | IsSorted::Descending => {
142                if self.null_count() > 0 {
143                    let mut count = 0;
144
145                    if self.is_empty() {
146                        return Ok(count);
147                    }
148
149                    let mut iter = self.iter();
150                    let mut last = iter.next().unwrap().to_total_ord();
151
152                    count += 1;
153
154                    iter.for_each(|opt_val| {
155                        let opt_val = opt_val.to_total_ord();
156                        if opt_val != last {
157                            last = opt_val;
158                            count += 1;
159                        }
160                    });
161
162                    Ok(count)
163                } else {
164                    let mask = self.not_equal_missing(&self.shift(1));
165                    Ok(mask.sum().unwrap() as usize)
166                }
167            },
168            IsSorted::Not => {
169                let sorted = self.sort(false);
170                sorted.n_unique()
171            },
172        }
173    }
174}
175
176impl ChunkUnique for StringChunked {
177    fn unique(&self) -> PolarsResult<Self> {
178        let out = self.as_binary().unique()?;
179        Ok(unsafe { out.to_string_unchecked() })
180    }
181
182    fn arg_unique(&self) -> PolarsResult<IdxCa> {
183        self.as_binary().arg_unique()
184    }
185
186    fn n_unique(&self) -> PolarsResult<usize> {
187        self.as_binary().n_unique()
188    }
189}
190
191impl ChunkUnique for BinaryChunked {
192    fn unique(&self) -> PolarsResult<Self> {
193        match self.null_count() {
194            0 => {
195                let mut set =
196                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
197                for arr in self.downcast_iter() {
198                    set.extend(arr.values_iter())
199                }
200                Ok(BinaryChunked::from_iter_values(
201                    self.name().clone(),
202                    set.iter().copied(),
203                ))
204            },
205            _ => {
206                let mut set =
207                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
208                for arr in self.downcast_iter() {
209                    set.extend(arr.iter())
210                }
211                Ok(BinaryChunked::from_iter_options(
212                    self.name().clone(),
213                    set.iter().copied(),
214                ))
215            },
216        }
217    }
218
219    fn arg_unique(&self) -> PolarsResult<IdxCa> {
220        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
221    }
222
223    fn n_unique(&self) -> PolarsResult<usize> {
224        let mut set: PlHashSet<&[u8]> = PlHashSet::new();
225        if self.null_count() > 0 {
226            for arr in self.downcast_iter() {
227                set.extend(arr.into_iter().flatten())
228            }
229            Ok(set.len() + 1)
230        } else {
231            for arr in self.downcast_iter() {
232                set.extend(arr.values_iter())
233            }
234            Ok(set.len())
235        }
236    }
237}
238
239impl ChunkUnique for BooleanChunked {
240    fn unique(&self) -> PolarsResult<Self> {
241        use polars_compute::unique::RangedUniqueKernel;
242
243        let mut state = BooleanUniqueKernelState::new();
244
245        for arr in self.downcast_iter() {
246            state.append(arr);
247
248            if state.has_seen_all() {
249                break;
250            }
251        }
252
253        let unique = state.finalize_unique();
254
255        Ok(Self::with_chunk(self.name().clone(), unique))
256    }
257
258    fn arg_unique(&self) -> PolarsResult<IdxCa> {
259        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
260    }
261}
262
263#[cfg(test)]
264mod test {
265    use crate::prelude::*;
266
267    #[test]
268    fn unique() {
269        let ca =
270            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
271        assert_eq!(
272            ca.unique()
273                .unwrap()
274                .sort(false)
275                .into_iter()
276                .collect::<Vec<_>>(),
277            vec![Some(1), Some(2), Some(3)]
278        );
279        let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
280        assert_eq!(
281            ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
282            vec![Some(false), Some(true)]
283        );
284
285        let ca = StringChunked::new(
286            PlSmallStr::EMPTY,
287            &[Some("a"), None, Some("a"), Some("b"), None],
288        );
289        assert_eq!(
290            Vec::from(&ca.unique().unwrap().sort(false)),
291            &[None, Some("a"), Some("b")]
292        );
293    }
294
295    #[test]
296    fn arg_unique() {
297        let ca =
298            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
299        assert_eq!(
300            ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
301            vec![Some(0), Some(1), Some(4)]
302        );
303    }
304}