polars_core/chunked_array/ops/unique/
mod.rs1use std::hash::Hash;
2use std::ops::Deref;
3
4use arrow::bitmap::MutableBitmap;
5use polars_compute::unique::BooleanUniqueKernelState;
6use polars_utils::total_ord::{ToTotalOrd, TotalHash, TotalOrdWrap};
7
8use crate::hashing::_HASHMAP_INIT_SIZE;
9use crate::prelude::*;
10use crate::series::IsSorted;
11
12fn finish_is_unique_helper(
13 unique_idx: Vec<IdxSize>,
14 len: IdxSize,
15 setter: bool,
16 default: bool,
17) -> BooleanChunked {
18 let mut values = MutableBitmap::with_capacity(len as usize);
19 values.extend_constant(len as usize, default);
20
21 for idx in unique_idx {
22 unsafe { values.set_unchecked(idx as usize, setter) }
23 }
24 let arr = BooleanArray::from_data_default(values.into(), None);
25 arr.into()
26}
27
28pub(crate) fn is_unique_helper(
29 groups: &GroupPositions,
30 len: IdxSize,
31 unique_val: bool,
32 duplicated_val: bool,
33) -> BooleanChunked {
34 debug_assert_ne!(unique_val, duplicated_val);
35
36 let idx = match groups.deref() {
37 GroupsType::Idx(groups) => groups
38 .iter()
39 .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
40 .collect::<Vec<_>>(),
41 GroupsType::Slice { groups, .. } => groups
42 .iter()
43 .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
44 .collect(),
45 };
46 finish_is_unique_helper(idx, len, unique_val, duplicated_val)
47}
48
49#[cfg(feature = "object")]
50impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
51 fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
52 polars_bail!(opq = unique, self.dtype());
53 }
54
55 fn arg_unique(&self) -> PolarsResult<IdxCa> {
56 polars_bail!(opq = arg_unique, self.dtype());
57 }
58
59 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
60 polars_bail!(opq = unique_id, self.dtype());
61 }
62}
63
64fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
65where
66 T: ToTotalOrd,
67 <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
68{
69 let mut set = PlHashSet::new();
70 let mut unique = Vec::with_capacity(capacity);
71 a.enumerate().for_each(|(idx, val)| {
72 if set.insert(val.to_total_ord()) {
73 unique.push(idx as IdxSize)
74 }
75 });
76 unique
77}
78
79macro_rules! arg_unique_ca {
80 ($ca:expr) => {{
81 match $ca.has_nulls() {
82 false => arg_unique($ca.into_no_null_iter(), $ca.len()),
83 _ => arg_unique($ca.iter(), $ca.len()),
84 }
85 }};
86}
87
88impl<T> ChunkUnique for ChunkedArray<T>
89where
90 T: PolarsNumericType,
91 T::Native: TotalHash + TotalEq + ToTotalOrd,
92 <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
93 ChunkedArray<T>: for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
94{
95 fn unique(&self) -> PolarsResult<Self> {
96 if self.is_empty() {
98 return Ok(self.clone());
99 }
100 match self.is_sorted_flag() {
101 IsSorted::Ascending | IsSorted::Descending => {
102 if self.null_count() > 0 {
103 let mut arr = MutablePrimitiveArray::with_capacity(self.len());
104
105 if !self.is_empty() {
106 let mut iter = self.iter();
107 let last = iter.next().unwrap();
108 arr.push(last);
109 let mut last = last.to_total_ord();
110
111 let to_extend = iter.filter(|opt_val| {
112 let opt_val_tot_ord = opt_val.to_total_ord();
113 let out = opt_val_tot_ord != last;
114 last = opt_val_tot_ord;
115 out
116 });
117
118 arr.extend(to_extend);
119 }
120
121 let arr: PrimitiveArray<T::Native> = arr.into();
122 Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
123 } else {
124 let mask = self.not_equal_missing(&self.shift(1));
125 self.filter(&mask)
126 }
127 },
128 IsSorted::Not => {
129 let sorted = self.sort(false);
130 sorted.unique()
131 },
132 }
133 }
134
135 fn arg_unique(&self) -> PolarsResult<IdxCa> {
136 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
137 }
138
139 fn n_unique(&self) -> PolarsResult<usize> {
140 if self.is_empty() {
142 return Ok(0);
143 }
144 match self.is_sorted_flag() {
145 IsSorted::Ascending | IsSorted::Descending => {
146 if self.null_count() > 0 {
147 let mut count = 0;
148
149 if self.is_empty() {
150 return Ok(count);
151 }
152
153 let mut iter = self.iter();
154 let mut last = iter.next().unwrap().to_total_ord();
155
156 count += 1;
157
158 iter.for_each(|opt_val| {
159 let opt_val = opt_val.to_total_ord();
160 if opt_val != last {
161 last = opt_val;
162 count += 1;
163 }
164 });
165
166 Ok(count)
167 } else {
168 let mask = self.not_equal_missing(&self.shift(1));
169 Ok(mask.sum().unwrap() as usize)
170 }
171 },
172 IsSorted::Not => {
173 let sorted = self.sort(false);
174 sorted.n_unique()
175 },
176 }
177 }
178
179 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
180 let mut n = IdxSize::from(self.has_nulls());
181 let mut indices = PlHashMap::new();
182 let ids = self
183 .iter()
184 .map(|v| match v {
185 None => 0,
186 Some(v) => *indices.entry(TotalOrdWrap(v)).or_insert_with(|| {
187 let i = n;
188 n += 1;
189 i
190 }),
191 })
192 .collect_trusted();
193 Ok((n, ids))
194 }
195}
196
197impl ChunkUnique for StringChunked {
198 fn unique(&self) -> PolarsResult<Self> {
199 let out = self.as_binary().unique()?;
200 Ok(unsafe { out.to_string_unchecked() })
201 }
202
203 fn arg_unique(&self) -> PolarsResult<IdxCa> {
204 self.as_binary().arg_unique()
205 }
206
207 fn n_unique(&self) -> PolarsResult<usize> {
208 self.as_binary().n_unique()
209 }
210
211 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
212 self.as_binary().unique_id()
213 }
214}
215
216impl ChunkUnique for BinaryChunked {
217 fn unique(&self) -> PolarsResult<Self> {
218 match self.null_count() {
219 0 => {
220 let mut set =
221 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
222 for arr in self.downcast_iter() {
223 set.extend(arr.values_iter())
224 }
225 Ok(BinaryChunked::from_iter_values(
226 self.name().clone(),
227 set.iter().copied(),
228 ))
229 },
230 _ => {
231 let mut set =
232 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
233 for arr in self.downcast_iter() {
234 set.extend(arr.iter())
235 }
236 Ok(BinaryChunked::from_iter_options(
237 self.name().clone(),
238 set.iter().copied(),
239 ))
240 },
241 }
242 }
243
244 fn arg_unique(&self) -> PolarsResult<IdxCa> {
245 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
246 }
247
248 fn n_unique(&self) -> PolarsResult<usize> {
249 let mut set: PlHashSet<&[u8]> = PlHashSet::new();
250 if self.null_count() > 0 {
251 for arr in self.downcast_iter() {
252 set.extend(arr.into_iter().flatten())
253 }
254 Ok(set.len() + 1)
255 } else {
256 for arr in self.downcast_iter() {
257 set.extend(arr.values_iter())
258 }
259 Ok(set.len())
260 }
261 }
262
263 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
264 let mut n = IdxSize::from(self.has_nulls());
265 let mut indices = PlHashMap::new();
266 let ids = self
267 .iter()
268 .map(|v| match v {
269 None => 0,
270 Some(v) => *indices.entry(v).or_insert_with(|| {
271 let i = n;
272 n += 1;
273 i
274 }),
275 })
276 .collect_trusted();
277 Ok((n, ids))
278 }
279}
280
281impl ChunkUnique for BinaryOffsetChunked {
282 fn unique(&self) -> PolarsResult<Self> {
283 match self.null_count() {
284 0 => {
285 let mut set =
286 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
287 for arr in self.downcast_iter() {
288 set.extend(arr.values_iter())
289 }
290 Ok(set.iter().copied().collect_ca(self.name().clone()))
291 },
292 _ => {
293 let mut set =
294 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
295 for arr in self.downcast_iter() {
296 set.extend(arr.iter())
297 }
298 Ok(set.iter().copied().collect_ca(self.name().clone()))
299 },
300 }
301 }
302
303 fn arg_unique(&self) -> PolarsResult<IdxCa> {
304 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
305 }
306
307 fn n_unique(&self) -> PolarsResult<usize> {
308 let mut set: PlHashSet<&[u8]> = PlHashSet::new();
309 if self.null_count() > 0 {
310 for arr in self.downcast_iter() {
311 set.extend(arr.into_iter().flatten())
312 }
313 Ok(set.len() + 1)
314 } else {
315 for arr in self.downcast_iter() {
316 set.extend(arr.values_iter())
317 }
318 Ok(set.len())
319 }
320 }
321
322 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
323 let mut n = IdxSize::from(self.has_nulls());
324 let mut indices = PlHashMap::new();
325 let ids = self
326 .iter()
327 .map(|v| match v {
328 None => 0,
329 Some(v) => *indices.entry(v).or_insert_with(|| {
330 let i = n;
331 n += 1;
332 i
333 }),
334 })
335 .collect_trusted();
336 Ok((n, ids))
337 }
338}
339
340impl ChunkUnique for BooleanChunked {
341 fn unique(&self) -> PolarsResult<Self> {
342 use polars_compute::unique::RangedUniqueKernel;
343
344 let mut state = BooleanUniqueKernelState::new();
345
346 for arr in self.downcast_iter() {
347 state.append(arr);
348
349 if state.has_seen_all() {
350 break;
351 }
352 }
353
354 let unique = state.finalize_unique();
355
356 Ok(Self::with_chunk(self.name().clone(), unique))
357 }
358
359 fn arg_unique(&self) -> PolarsResult<IdxCa> {
360 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
361 }
362
363 fn unique_id(&self) -> PolarsResult<(IdxSize, Vec<IdxSize>)> {
364 let num_nulls = self.null_count();
365 let num_trues = self.num_trues();
366
367 let true_idx = IdxSize::from(num_nulls > 0);
368 let false_idx = IdxSize::from(num_nulls > 0) + IdxSize::from(num_trues > 0);
369 let ids = self
370 .iter()
371 .map(|v| match v {
372 None => 0,
373 Some(false) => false_idx,
374 Some(true) => true_idx,
375 })
376 .collect_trusted();
377 Ok((false_idx + 1, ids))
378 }
379}
380
381#[cfg(test)]
382mod test {
383 use crate::prelude::*;
384
385 #[test]
386 fn unique() {
387 let ca =
388 ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
389 assert_eq!(
390 ca.unique()
391 .unwrap()
392 .sort(false)
393 .into_iter()
394 .collect::<Vec<_>>(),
395 vec![Some(1), Some(2), Some(3)]
396 );
397 let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
398 assert_eq!(
399 ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
400 vec![Some(false), Some(true)]
401 );
402
403 let ca = StringChunked::new(
404 PlSmallStr::EMPTY,
405 &[Some("a"), None, Some("a"), Some("b"), None],
406 );
407 assert_eq!(
408 Vec::from(&ca.unique().unwrap().sort(false)),
409 &[None, Some("a"), Some("b")]
410 );
411 }
412
413 #[test]
414 fn arg_unique() {
415 let ca =
416 ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
417 assert_eq!(
418 ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
419 vec![Some(0), Some(1), Some(4)]
420 );
421 }
422}