polars_core/chunked_array/ops/unique/
mod.rs

1use std::hash::Hash;
2use std::ops::Deref;
3
4use arrow::bitmap::MutableBitmap;
5use polars_compute::unique::BooleanUniqueKernelState;
6use polars_utils::total_ord::{ToTotalOrd, TotalHash};
7
8use crate::hashing::_HASHMAP_INIT_SIZE;
9use crate::prelude::*;
10use crate::series::IsSorted;
11
12fn finish_is_unique_helper(
13    unique_idx: Vec<IdxSize>,
14    len: IdxSize,
15    setter: bool,
16    default: bool,
17) -> BooleanChunked {
18    let mut values = MutableBitmap::with_capacity(len as usize);
19    values.extend_constant(len as usize, default);
20
21    for idx in unique_idx {
22        unsafe { values.set_unchecked(idx as usize, setter) }
23    }
24    let arr = BooleanArray::from_data_default(values.into(), None);
25    arr.into()
26}
27
28pub(crate) fn is_unique_helper(
29    groups: &GroupPositions,
30    len: IdxSize,
31    unique_val: bool,
32    duplicated_val: bool,
33) -> BooleanChunked {
34    debug_assert_ne!(unique_val, duplicated_val);
35
36    let idx = match groups.deref() {
37        GroupsType::Idx(groups) => groups
38            .iter()
39            .filter_map(|(first, g)| if g.len() == 1 { Some(first) } else { None })
40            .collect::<Vec<_>>(),
41        GroupsType::Slice { groups, .. } => groups
42            .iter()
43            .filter_map(|[first, len]| if *len == 1 { Some(*first) } else { None })
44            .collect(),
45    };
46    finish_is_unique_helper(idx, len, unique_val, duplicated_val)
47}
48
49#[cfg(feature = "object")]
50impl<T: PolarsObject> ChunkUnique for ObjectChunked<T> {
51    fn unique(&self) -> PolarsResult<ChunkedArray<ObjectType<T>>> {
52        polars_bail!(opq = unique, self.dtype());
53    }
54
55    fn arg_unique(&self) -> PolarsResult<IdxCa> {
56        polars_bail!(opq = arg_unique, self.dtype());
57    }
58}
59
60fn arg_unique<T>(a: impl Iterator<Item = T>, capacity: usize) -> Vec<IdxSize>
61where
62    T: ToTotalOrd,
63    <T as ToTotalOrd>::TotalOrdItem: Hash + Eq,
64{
65    let mut set = PlHashSet::new();
66    let mut unique = Vec::with_capacity(capacity);
67    a.enumerate().for_each(|(idx, val)| {
68        if set.insert(val.to_total_ord()) {
69            unique.push(idx as IdxSize)
70        }
71    });
72    unique
73}
74
75macro_rules! arg_unique_ca {
76    ($ca:expr) => {{
77        match $ca.has_nulls() {
78            false => arg_unique($ca.into_no_null_iter(), $ca.len()),
79            _ => arg_unique($ca.iter(), $ca.len()),
80        }
81    }};
82}
83
84impl<T> ChunkUnique for ChunkedArray<T>
85where
86    T: PolarsNumericType,
87    T::Native: TotalHash + TotalEq + ToTotalOrd,
88    <T::Native as ToTotalOrd>::TotalOrdItem: Hash + Eq + Ord,
89    ChunkedArray<T>:
90        IntoSeries + for<'a> ChunkCompareEq<&'a ChunkedArray<T>, Item = BooleanChunked>,
91{
92    fn unique(&self) -> PolarsResult<Self> {
93        // prevent stackoverflow repeated sorted.unique call
94        if self.is_empty() {
95            return Ok(self.clone());
96        }
97        match self.is_sorted_flag() {
98            IsSorted::Ascending | IsSorted::Descending => {
99                if self.null_count() > 0 {
100                    let mut arr = MutablePrimitiveArray::with_capacity(self.len());
101
102                    if !self.is_empty() {
103                        let mut iter = self.iter();
104                        let last = iter.next().unwrap();
105                        arr.push(last);
106                        let mut last = last.to_total_ord();
107
108                        let to_extend = iter.filter(|opt_val| {
109                            let opt_val_tot_ord = opt_val.to_total_ord();
110                            let out = opt_val_tot_ord != last;
111                            last = opt_val_tot_ord;
112                            out
113                        });
114
115                        arr.extend(to_extend);
116                    }
117
118                    let arr: PrimitiveArray<T::Native> = arr.into();
119                    Ok(ChunkedArray::with_chunk(self.name().clone(), arr))
120                } else {
121                    let mask = self.not_equal_missing(&self.shift(1));
122                    self.filter(&mask)
123                }
124            },
125            IsSorted::Not => {
126                let sorted = self.sort(false);
127                sorted.unique()
128            },
129        }
130    }
131
132    fn arg_unique(&self) -> PolarsResult<IdxCa> {
133        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
134    }
135
136    fn n_unique(&self) -> PolarsResult<usize> {
137        // prevent stackoverflow repeated sorted.unique call
138        if self.is_empty() {
139            return Ok(0);
140        }
141        match self.is_sorted_flag() {
142            IsSorted::Ascending | IsSorted::Descending => {
143                if self.null_count() > 0 {
144                    let mut count = 0;
145
146                    if self.is_empty() {
147                        return Ok(count);
148                    }
149
150                    let mut iter = self.iter();
151                    let mut last = iter.next().unwrap().to_total_ord();
152
153                    count += 1;
154
155                    iter.for_each(|opt_val| {
156                        let opt_val = opt_val.to_total_ord();
157                        if opt_val != last {
158                            last = opt_val;
159                            count += 1;
160                        }
161                    });
162
163                    Ok(count)
164                } else {
165                    let mask = self.not_equal_missing(&self.shift(1));
166                    Ok(mask.sum().unwrap() as usize)
167                }
168            },
169            IsSorted::Not => {
170                let sorted = self.sort(false);
171                sorted.n_unique()
172            },
173        }
174    }
175}
176
177impl ChunkUnique for StringChunked {
178    fn unique(&self) -> PolarsResult<Self> {
179        let out = self.as_binary().unique()?;
180        Ok(unsafe { out.to_string_unchecked() })
181    }
182
183    fn arg_unique(&self) -> PolarsResult<IdxCa> {
184        self.as_binary().arg_unique()
185    }
186
187    fn n_unique(&self) -> PolarsResult<usize> {
188        self.as_binary().n_unique()
189    }
190}
191
192impl ChunkUnique for BinaryChunked {
193    fn unique(&self) -> PolarsResult<Self> {
194        match self.null_count() {
195            0 => {
196                let mut set =
197                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
198                for arr in self.downcast_iter() {
199                    set.extend(arr.values_iter())
200                }
201                Ok(BinaryChunked::from_iter_values(
202                    self.name().clone(),
203                    set.iter().copied(),
204                ))
205            },
206            _ => {
207                let mut set =
208                    PlHashSet::with_capacity(std::cmp::min(_HASHMAP_INIT_SIZE, self.len()));
209                for arr in self.downcast_iter() {
210                    set.extend(arr.iter())
211                }
212                Ok(BinaryChunked::from_iter_options(
213                    self.name().clone(),
214                    set.iter().copied(),
215                ))
216            },
217        }
218    }
219
220    fn arg_unique(&self) -> PolarsResult<IdxCa> {
221        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
222    }
223
224    fn n_unique(&self) -> PolarsResult<usize> {
225        let mut set: PlHashSet<&[u8]> = PlHashSet::new();
226        if self.null_count() > 0 {
227            for arr in self.downcast_iter() {
228                set.extend(arr.into_iter().flatten())
229            }
230            Ok(set.len() + 1)
231        } else {
232            for arr in self.downcast_iter() {
233                set.extend(arr.values_iter())
234            }
235            Ok(set.len())
236        }
237    }
238}
239
240impl ChunkUnique for BooleanChunked {
241    fn unique(&self) -> PolarsResult<Self> {
242        use polars_compute::unique::RangedUniqueKernel;
243
244        let mut state = BooleanUniqueKernelState::new();
245
246        for arr in self.downcast_iter() {
247            state.append(arr);
248
249            if state.has_seen_all() {
250                break;
251            }
252        }
253
254        let unique = state.finalize_unique();
255
256        Ok(Self::with_chunk(self.name().clone(), unique))
257    }
258
259    fn arg_unique(&self) -> PolarsResult<IdxCa> {
260        Ok(IdxCa::from_vec(self.name().clone(), arg_unique_ca!(self)))
261    }
262}
263
264#[cfg(test)]
265mod test {
266    use crate::prelude::*;
267
268    #[test]
269    fn unique() {
270        let ca =
271            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 3, 2, 1]);
272        assert_eq!(
273            ca.unique()
274                .unwrap()
275                .sort(false)
276                .into_iter()
277                .collect::<Vec<_>>(),
278            vec![Some(1), Some(2), Some(3)]
279        );
280        let ca = BooleanChunked::from_slice(PlSmallStr::from_static("a"), &[true, false, true]);
281        assert_eq!(
282            ca.unique().unwrap().into_iter().collect::<Vec<_>>(),
283            vec![Some(false), Some(true)]
284        );
285
286        let ca = StringChunked::new(
287            PlSmallStr::EMPTY,
288            &[Some("a"), None, Some("a"), Some("b"), None],
289        );
290        assert_eq!(
291            Vec::from(&ca.unique().unwrap().sort(false)),
292            &[None, Some("a"), Some("b")]
293        );
294    }
295
296    #[test]
297    fn arg_unique() {
298        let ca =
299            ChunkedArray::<Int32Type>::from_slice(PlSmallStr::from_static("a"), &[1, 2, 1, 1, 3]);
300        assert_eq!(
301            ca.arg_unique().unwrap().into_iter().collect::<Vec<_>>(),
302            vec![Some(0), Some(1), Some(4)]
303        );
304    }
305}