polars_core/chunked_array/ops/unique/
mod.rs1use std::hash::Hash;
2use std::ops::Deref;
3
4use arrow::bitmap::MutableBitmap;
5use polars_compute::unique::BooleanUniqueKernelState;
6use polars_utils::total_ord::{ToTotalOrd, TotalHash};
7
8use crate::hashing::_HASHMAP_INIT_SIZE;
9use crate::prelude::*;
10use crate::series::IsSorted;
11
12fn finish_is_unique_helper(
13 unique_idx: Vec<IdxSize>,
14 len: IdxSize,
15 setter: bool,
16 default: bool,
17) -> BooleanChunked {
18 let mut values = MutableBitmap::with_capacity(len as usize);
19 values.extend_constant(len as usize, default);
20
21 for idx in unique_idx {
22 unsafe { values.set_unchecked(idx as usize, setter) }
23 }
24 let arr = BooleanArray::from_data_default(values.into(), None);
25 arr.into()
26}
27
28pub(crate) fn is_unique_helper(
29 groups: &GroupPositions,
30 len: IdxSize,
31 unique_val: bool,
32 duplicated_val: bool,
33) -> BooleanChunked {
34 debug_assert_ne!(unique_val, duplicated_val);
35
36 let idx = match groups.deref() {
37 GroupsType::Idx(groups) => groups
38 .iter()
39 .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
40 .collect::<Vec<_>>(),
41 GroupsType::Slice { groups, .. } => groups
42 .iter()
43 .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
44 .collect(),
45 };
46 finish_is_unique_helper(idx, len, unique_val, duplicated_val)
47}
48
49#[cfg(feature = "object")]
50impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
51 fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
52 polars_bail!(opq = unique, self.dtype());
53 }
54
55 fn arg_unique(&self) -> PolarsResult<IdxCa> {
56 polars_bail!(opq = arg_unique, self.dtype());
57 }
58}
59
60fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
61where
62 T: ToTotalOrd,
63 <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
64{
65 let mut set = PlHashSet::new();
66 let mut unique = Vec::with_capacity(capacity);
67 a.enumerate().for_each(|(idx, val)| {
68 if set.insert(val.to_total_ord()) {
69 unique.push(idx as IdxSize)
70 }
71 });
72 unique
73}
74
75macro_rules! arg_unique_ca {
76 ($ca:expr) => {{
77 match $ca.has_nulls() {
78 false => arg_unique($ca.into_no_null_iter(), $ca.len()),
79 _ => arg_unique($ca.iter(), $ca.len()),
80 }
81 }};
82}
83
84impl<T> ChunkUnique for ChunkedArray<T>
85where
86 T: PolarsNumericType,
87 T::Native: TotalHash + TotalEq + ToTotalOrd,
88 <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
89 ChunkedArray<T>:
90 IntoSeries + for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
91{
92 fn unique(&self) -> PolarsResult<Self> {
93 if self.is_empty() {
95 return Ok(self.clone());
96 }
97 match self.is_sorted_flag() {
98 IsSorted::Ascending | IsSorted::Descending => {
99 if self.null_count() > 0 {
100 let mut arr = MutablePrimitiveArray::with_capacity(self.len());
101
102 if !self.is_empty() {
103 let mut iter = self.iter();
104 let last = iter.next().unwrap();
105 arr.push(last);
106 let mut last = last.to_total_ord();
107
108 let to_extend = iter.filter(|opt_val| {
109 let opt_val_tot_ord = opt_val.to_total_ord();
110 let out = opt_val_tot_ord != last;
111 last = opt_val_tot_ord;
112 out
113 });
114
115 arr.extend(to_extend);
116 }
117
118 let arr: PrimitiveArray<T::Native> = arr.into();
119 Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
120 } else {
121 let mask = self.not_equal_missing(&self.shift(1));
122 self.filter(&mask)
123 }
124 },
125 IsSorted::Not => {
126 let sorted = self.sort(false);
127 sorted.unique()
128 },
129 }
130 }
131
132 fn arg_unique(&self) -> PolarsResult<IdxCa> {
133 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
134 }
135
136 fn n_unique(&self) -> PolarsResult<usize> {
137 if self.is_empty() {
139 return Ok(0);
140 }
141 match self.is_sorted_flag() {
142 IsSorted::Ascending | IsSorted::Descending => {
143 if self.null_count() > 0 {
144 let mut count = 0;
145
146 if self.is_empty() {
147 return Ok(count);
148 }
149
150 let mut iter = self.iter();
151 let mut last = iter.next().unwrap().to_total_ord();
152
153 count += 1;
154
155 iter.for_each(|opt_val| {
156 let opt_val = opt_val.to_total_ord();
157 if opt_val != last {
158 last = opt_val;
159 count += 1;
160 }
161 });
162
163 Ok(count)
164 } else {
165 let mask = self.not_equal_missing(&self.shift(1));
166 Ok(mask.sum().unwrap() as usize)
167 }
168 },
169 IsSorted::Not => {
170 let sorted = self.sort(false);
171 sorted.n_unique()
172 },
173 }
174 }
175}
176
177impl ChunkUnique for StringChunked {
178 fn unique(&self) -> PolarsResult<Self> {
179 let out = self.as_binary().unique()?;
180 Ok(unsafe { out.to_string_unchecked() })
181 }
182
183 fn arg_unique(&self) -> PolarsResult<IdxCa> {
184 self.as_binary().arg_unique()
185 }
186
187 fn n_unique(&self) -> PolarsResult<usize> {
188 self.as_binary().n_unique()
189 }
190}
191
192impl ChunkUnique for BinaryChunked {
193 fn unique(&self) -> PolarsResult<Self> {
194 match self.null_count() {
195 0 => {
196 let mut set =
197 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
198 for arr in self.downcast_iter() {
199 set.extend(arr.values_iter())
200 }
201 Ok(BinaryChunked::from_iter_values(
202 self.name().clone(),
203 set.iter().copied(),
204 ))
205 },
206 _ => {
207 let mut set =
208 PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
209 for arr in self.downcast_iter() {
210 set.extend(arr.iter())
211 }
212 Ok(BinaryChunked::from_iter_options(
213 self.name().clone(),
214 set.iter().copied(),
215 ))
216 },
217 }
218 }
219
220 fn arg_unique(&self) -> PolarsResult<IdxCa> {
221 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
222 }
223
224 fn n_unique(&self) -> PolarsResult<usize> {
225 let mut set: PlHashSet<&[u8]> = PlHashSet::new();
226 if self.null_count() > 0 {
227 for arr in self.downcast_iter() {
228 set.extend(arr.into_iter().flatten())
229 }
230 Ok(set.len() + 1)
231 } else {
232 for arr in self.downcast_iter() {
233 set.extend(arr.values_iter())
234 }
235 Ok(set.len())
236 }
237 }
238}
239
240impl ChunkUnique for BooleanChunked {
241 fn unique(&self) -> PolarsResult<Self> {
242 use polars_compute::unique::RangedUniqueKernel;
243
244 let mut state = BooleanUniqueKernelState::new();
245
246 for arr in self.downcast_iter() {
247 state.append(arr);
248
249 if state.has_seen_all() {
250 break;
251 }
252 }
253
254 let unique = state.finalize_unique();
255
256 Ok(Self::with_chunk(self.name().clone(), unique))
257 }
258
259 fn arg_unique(&self) -> PolarsResult<IdxCa> {
260 Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
261 }
262}
263
264#[cfg(test)]
265mod test {
266 use crate::prelude::*;
267
268 #[test]
269 fn unique() {
270 let ca =
271 ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
272 assert_eq!(
273 ca.unique()
274 .unwrap()
275 .sort(false)
276 .into_iter()
277 .collect::<Vec<_>>(),
278 vec![Some(1), Some(2), Some(3)]
279 );
280 let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
281 assert_eq!(
282 ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
283 vec![Some(false), Some(true)]
284 );
285
286 let ca = StringChunked::new(
287 PlSmallStr::EMPTY,
288 &[Some("a"), None, Some("a"), Some("b"), None],
289 );
290 assert_eq!(
291 Vec::from(&ca.unique().unwrap().sort(false)),
292 &[None, Some("a"), Some("b")]
293 );
294 }
295
296 #[test]
297 fn arg_unique() {
298 let ca =
299 ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
300 assert_eq!(
301 ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
302 vec![Some(0), Some(1), Some(4)]
303 );
304 }
305}