polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8#[cfg(feature = "hash")]
9use polars_utils::aliases::PlSeedableRandomStateQuality;
10use polars_utils::total_ord::TotalOrd;
11
12use crate::series::ops::SeriesSealed;
13
14pub trait SeriesMethods: SeriesSealed {
15    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
16    /// with dtype [`IdxType`]
17    fn value_counts(
18        &self,
19        sort: bool,
20        parallel: bool,
21        name: PlSmallStr,
22        normalize: bool,
23    ) -> PolarsResult<DataFrame> {
24        let s = self.as_series();
25        polars_ensure!(
26            s.name() != &name,
27            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
28            column names; change `name` to fix", name,
29        );
30        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
31        let groups = s.group_tuples(parallel, sort)?;
32        let values = unsafe { s.agg_first(&groups) }
33            .with_name(s.name().clone())
34            .into();
35        let counts = groups.group_count().with_name(name.clone());
36
37        let counts = if normalize {
38            let len = s.len() as f64;
39            let counts: Float64Chunked =
40                unary_elementwise_values(&counts, |count| count as f64 / len);
41            counts.into_column()
42        } else {
43            counts.into_column()
44        };
45
46        let height = counts.len();
47        let cols = vec![values, counts];
48        let df = unsafe { DataFrame::new_unchecked(height, cols) };
49        if sort {
50            df.sort(
51                [name],
52                SortMultipleOptions::default()
53                    .with_order_descending(true)
54                    .with_multithreaded(parallel),
55            )
56        } else {
57            Ok(df)
58        }
59    }
60
61    #[cfg(feature = "hash")]
62    fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
63        let s = self.as_series().to_physical_repr();
64        let mut h = vec![];
65        s.0.vec_hash(build_hasher, &mut h).unwrap();
66        UInt64Chunked::from_vec(s.name().clone(), h)
67    }
68
69    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
70        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
71        Ok(())
72    }
73
74    /// Checks if a [`Series`] is sorted. Tries to fail fast.
75    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
76        let s = self.as_series();
77        let null_count = s.null_count();
78
79        // fast paths
80        if (options.descending
81            && (options.nulls_last || null_count == 0)
82            && matches!(s.is_sorted_flag(), IsSorted::Descending))
83            || (!options.descending
84                && (!options.nulls_last || null_count == 0)
85                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
86        {
87            return Ok(true);
88        }
89
90        // for struct types we row-encode and recurse
91        #[cfg(feature = "dtype-struct")]
92        if matches!(s.dtype(), DataType::Struct(_)) {
93            let encoded = _get_rows_encoded_ca(
94                PlSmallStr::EMPTY,
95                &[s.clone().into()],
96                &[options.descending],
97                &[options.nulls_last],
98                false,
99            )?;
100            return encoded.into_series().is_sorted(options);
101        }
102
103        let s_len = s.len();
104        if null_count == s_len {
105            // All nulls is all equal
106            return Ok(true);
107        }
108        // Check if nulls are in the right location.
109        if null_count > 0 {
110            // The slice triggers a fast null count
111            if options.nulls_last {
112                if s.slice((s_len - null_count) as i64, null_count)
113                    .null_count()
114                    != null_count
115                {
116                    return Ok(false);
117                }
118            } else if s.slice(0, null_count).null_count() != null_count {
119                return Ok(false);
120            }
121        }
122
123        if s.dtype().is_primitive_numeric() {
124            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
125                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
126                return Ok(is_sorted_ca_num::<$T>(ca, options))
127            })
128        }
129
130        let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
131        // TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
132        // Compare adjacent elements with no-copy slices that don't include any nulls
133        let offset = !options.nulls_last as i64 * null_count as i64;
134        let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
135        let cmp_op = if options.descending {
136            Series::gt_eq
137        } else {
138            Series::lt_eq
139        };
140        Ok(cmp_op(&s1, &s2)?.all())
141    }
142}
143
144fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
145    vals: &[T],
146    f: Cmp,
147    previous: &mut T,
148) -> bool {
149    let mut sorted = true;
150
151    // Outer loop so we can fail fast
152    // Inner loop will auto vectorize
153    for c in vals.chunks(1024) {
154        // don't early stop or branch
155        // so it autovectorizes
156        for v in c {
157            sorted &= f(previous, v);
158            *previous = *v;
159        }
160        if !sorted {
161            return false;
162        }
163    }
164    sorted
165}
166
167// Assumes nulls last/first is already checked.
168fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
169    if let Ok(vals) = ca.cont_slice() {
170        let mut previous = vals[0];
171        return if options.descending {
172            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
173        } else {
174            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
175        };
176    };
177
178    if ca.null_count() == 0 {
179        let mut previous = if options.descending {
180            T::Native::max_value()
181        } else {
182            T::Native::min_value()
183        };
184        for arr in ca.downcast_iter() {
185            let vals = arr.values();
186
187            let sorted = if options.descending {
188                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
189            } else {
190                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
191            };
192            if !sorted {
193                return false;
194            }
195        }
196        return true;
197    };
198
199    // Slice off nulls and recurse.
200    let null_count = ca.null_count();
201    if options.nulls_last {
202        let ca = ca.slice(0, ca.len() - null_count);
203        is_sorted_ca_num(&ca, options)
204    } else {
205        let ca = ca.slice(null_count as i64, ca.len() - null_count);
206        is_sorted_ca_num(&ca, options)
207    }
208}
209
// Opt `Series` into `SeriesMethods`. The impl body is empty, so every method
// uses the default body provided by the trait.
impl SeriesMethods for Series {}