polars_core/chunked_array/ops/unique/mod.rs

use std::hash::Hash;
use std::ops::Deref;

use arrow::bitmap::MutableBitmap;
use polars_compute::unique::BooleanUniqueKernelState;
use polars_utils::total_ord::{ToTotalOrd, TotalHash};

use crate::hashing::_HASHMAP_INIT_SIZE;
use crate::prelude::*;
use crate::series::IsSorted;

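/// Build a Boolean mask of length `len`: every position starts as `default` and
/// the positions listed in `unique_idx` are overwritten with `setter`.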
fn finish_is_unique_helper(
    unique_idx: Vec<IdxSize>,
    len: IdxSize,
    setter: bool,
    default: bool,
) -> BooleanChunked {
    let mut values = MutableBitmap::with_capacity(len as usize);
    values.extend_constant(len as usize, default);

    for idx in unique_idx {
        unsafe { values.set_unchecked(idx as usize, setter) }
    }
    let arr = BooleanArray::from_data_default(values.into(), None);
    arr.into()
}

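/// Mark every row according to the size of its group: rows whose group contains
/// exactly one element get `unique_val`, all other rows get `duplicated_val`.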
pub(crate) fn is_unique_helper(
    groups: &GroupPositions,
    len: IdxSize,
    unique_val: bool,
    duplicated_val: bool,
) -> BooleanChunked {
    debug_assert_ne!(unique_val, duplicated_val);

    let idx = match groups.deref() {
        GroupsType::Idx(groups) => groups
            .iter()
            .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
            .collect::<Vec<_>>(),
        GroupsType::Slice { groups, .. } => groups
            .iter()
            .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
            .collect(),
    };
    finish_is_unique_helper(idx, len, unique_val, duplicated_val)
}

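// Object dtypes do not support `unique`/`arg_unique`; both return an error.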
#[cfg(feature = "object")]
impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
    fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
        polars_bail!(opq = unique, self.dtype());
    }

    fn arg_unique(&self) -> PolarsResult<IdxCa> {
        polars_bail!(opq = arg_unique, self.dtype());
    }
}

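/// Return the index of the first occurrence of every distinct value produced by
/// the iterator, in order of appearance.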
fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
where
    T: ToTotalOrd,
    <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
{
    let mut set = PlHashSet::new();
    let mut unique = Vec::with_capacity(capacity);
    a.enumerate().for_each(|(idx, val)| {
        if set.insert(val.to_total_ord()) {
            unique.push(idx as IdxSize)
        }
    });
    unique
}

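// Dispatch to `arg_unique`, using the faster non-null iterator when the array
// contains no nulls.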
macro_rules! arg_unique_ca {
    ($ca:expr) => {{
        match $ca.has_nulls() {
            false => arg_unique($ca.into_no_null_iter(), $ca.len()),
            _ => arg_unique($ca.iter(), $ca.len()),
        }
    }};
}

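// Numeric chunked arrays: take a fast deduplication path when the sorted flag is
// set; otherwise sort first and reuse the sorted path.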
impl<T> ChunkUnique for ChunkedArray<T>
where
    T: PolarsNumericType,
    T::Native: TotalHash + TotalEq + ToTotalOrd,
    <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
    ChunkedArray<T>: for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
{
    fn unique(&self) -> PolarsResult<Self> {
        if self.is_empty() {
            return Ok(self.clone());
        }
        match self.is_sorted_flag() {
            IsSorted::Ascending | IsSorted::Descending => {
                if self.null_count() > 0 {
                    let mut arr = MutablePrimitiveArray::with_capacity(self.len());

                    if !self.is_empty() {
                        let mut iter = self.iter();
                        let last = iter.next().unwrap();
                        arr.push(last);
                        let mut last = last.to_total_ord();

                        let to_extend = iter.filter(|opt_val| {
                            let opt_val_tot_ord = opt_val.to_total_ord();
                            let out = opt_val_tot_ord != last;
                            last = opt_val_tot_ord;
                            out
                        });

                        arr.extend(to_extend);
                    }

                    let arr: PrimitiveArray<T::Native> = arr.into();
                    Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
                } else {
                    let mask = self.not_equal_missing(&self.shift(1));
                    self.filter(&mask)
                }
            },
            IsSorted::Not => {
                let sorted = self.sort(false);
                sorted.unique()
            },
        }
    }

    fn arg_unique(&self) -> PolarsResult<IdxCa> {
        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
    }

    fn n_unique(&self) -> PolarsResult<usize> {
        if self.is_empty() {
            return Ok(0);
        }
        match self.is_sorted_flag() {
            IsSorted::Ascending | IsSorted::Descending => {
                if self.null_count() > 0 {
                    let mut count = 0;

                    if self.is_empty() {
                        return Ok(count);
                    }

                    let mut iter = self.iter();
                    let mut last = iter.next().unwrap().to_total_ord();

                    count += 1;

                    iter.for_each(|opt_val| {
                        let opt_val = opt_val.to_total_ord();
                        if opt_val != last {
                            last = opt_val;
                            count += 1;
                        }
                    });

                    Ok(count)
                } else {
                    let mask = self.not_equal_missing(&self.shift(1));
                    Ok(mask.sum().unwrap() as usize)
                }
            },
            IsSorted::Not => {
                let sorted = self.sort(false);
                sorted.n_unique()
            },
        }
    }
}

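// Strings delegate to the binary implementation and reinterpret the result back
// as UTF-8, which is sound because the input values were valid UTF-8.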
impl ChunkUnique for StringChunked {
    fn unique(&self) -> PolarsResult<Self> {
        let out = self.as_binary().unique()?;
        Ok(unsafe { out.to_string_unchecked() })
    }

    fn arg_unique(&self) -> PolarsResult<IdxCa> {
        self.as_binary().arg_unique()
    }

    fn n_unique(&self) -> PolarsResult<usize> {
        self.as_binary().n_unique()
    }
}

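// Binary arrays deduplicate through a hash set; `n_unique` counts null as one
// extra distinct value when nulls are present.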
impl ChunkUnique for BinaryChunked {
    fn unique(&self) -> PolarsResult<Self> {
        match self.null_count() {
            0 => {
                let mut set =
                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
                for arr in self.downcast_iter() {
                    set.extend(arr.values_iter())
                }
                Ok(BinaryChunked::from_iter_values(
                    self.name().clone(),
                    set.iter().copied(),
                ))
            },
            _ => {
                let mut set =
                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
                for arr in self.downcast_iter() {
                    set.extend(arr.iter())
                }
                Ok(BinaryChunked::from_iter_options(
                    self.name().clone(),
                    set.iter().copied(),
                ))
            },
        }
    }

    fn arg_unique(&self) -> PolarsResult<IdxCa> {
        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
    }

    fn n_unique(&self) -> PolarsResult<usize> {
        let mut set: PlHashSet<&[u8]> = PlHashSet::new();
        if self.null_count() > 0 {
            for arr in self.downcast_iter() {
                set.extend(arr.into_iter().flatten())
            }
            Ok(set.len() + 1)
        } else {
            for arr in self.downcast_iter() {
                set.extend(arr.values_iter())
            }
            Ok(set.len())
        }
    }
}

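// Booleans use the ranged unique kernel and stop scanning chunks as soon as the
// kernel reports that every possible value has been seen.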
impl ChunkUnique for BooleanChunked {
    fn unique(&self) -> PolarsResult<Self> {
        use polars_compute::unique::RangedUniqueKernel;

        let mut state = BooleanUniqueKernelState::new();

        for arr in self.downcast_iter() {
            state.append(arr);

            if state.has_seen_all() {
                break;
            }
        }

        let unique = state.finalize_unique();

        Ok(Self::with_chunk(self.name().clone(), unique))
    }

    fn arg_unique(&self) -> PolarsResult<IdxCa> {
        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
    }
}

#[cfg(test)]
mod test {
    use crate::prelude::*;

    #[test]
    fn unique() {
        let ca =
            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
        assert_eq!(
            ca.unique()
                .unwrap()
                .sort(false)
                .into_iter()
                .collect::<Vec<_>>(),
            vec![Some(1), Some(2), Some(3)]
        );
        let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
        assert_eq!(
            ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
            vec![Some(false), Some(true)]
        );

        let ca = StringChunked::new(
            PlSmallStr::EMPTY,
            &[Some("a"), None, Some("a"), Some("b"), None],
        );
        assert_eq!(
            Vec::from(&ca.unique().unwrap().sort(false)),
            &[None, Some("a"), Some("b")]
        );
    }

    #[test]
    fn arg_unique() {
        let ca =
            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
        assert_eq!(
            ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
            vec![Some(0), Some(1), Some(4)]
        );
    }
}