Skip to main content

polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8#[cfg(feature = "hash")]
9use polars_utils::aliases::PlSeedableRandomStateQuality;
10use polars_utils::total_ord::TotalOrd;
11
12use crate::series::ops::SeriesSealed;
13
14pub trait SeriesMethods: SeriesSealed {
15    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
16    /// with dtype [`IdxType`]
17    fn value_counts(
18        &self,
19        sort: bool,
20        parallel: bool,
21        name: PlSmallStr,
22        normalize: bool,
23    ) -> PolarsResult<DataFrame> {
24        let s = self.as_series();
25        polars_ensure!(
26            s.name() != &name,
27            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
28            column names; change `name` to fix", name,
29        );
30        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
31        let groups = s.group_tuples(parallel, sort)?;
32        let values = unsafe { s.agg_first(&groups) }
33            .with_name(s.name().clone())
34            .into();
35        let counts = groups.group_count().with_name(name.clone());
36
37        let counts = if normalize {
38            let len = s.len() as f64;
39            let counts: Float64Chunked =
40                unary_elementwise_values(&counts, |count| count as f64 / len);
41            counts.into_column()
42        } else {
43            counts.into_column()
44        };
45
46        let height = counts.len();
47        let cols = vec![values, counts];
48        let df = unsafe { DataFrame::new_unchecked(height, cols) };
49        if sort {
50            df.sort(
51                [name],
52                SortMultipleOptions::default()
53                    .with_order_descending(true)
54                    .with_multithreaded(parallel),
55            )
56        } else {
57            Ok(df)
58        }
59    }
60
61    #[cfg(feature = "hash")]
62    fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
63        let s = self.as_series();
64        let mut h = vec![];
65        s.0.vec_hash(build_hasher, &mut h).unwrap();
66        UInt64Chunked::from_vec(s.name().clone(), h)
67    }
68
69    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
70        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
71        Ok(())
72    }
73
74    /// Checks if a [`Series`] is sorted. Tries to fail fast.
75    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
76        let s = self.as_series();
77        let null_count = s.null_count();
78
79        // fast paths
80        if (options.descending
81            && (options.nulls_last || null_count == 0)
82            && matches!(s.is_sorted_flag(), IsSorted::Descending))
83            || (!options.descending
84                && (!options.nulls_last || null_count == 0)
85                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
86        {
87            return Ok(true);
88        }
89
90        // for struct types we row-encode and recurse
91        #[cfg(feature = "dtype-struct")]
92        if matches!(s.dtype(), DataType::Struct(_)) {
93            let encoded = _get_rows_encoded_ca(
94                PlSmallStr::EMPTY,
95                &[s.clone().into()],
96                &[options.descending],
97                &[options.nulls_last],
98                false,
99            )?;
100            let options = SortOptions {
101                descending: false,
102                nulls_last: false,
103                ..options
104            };
105            return encoded.into_series().is_sorted(options);
106        }
107
108        let s_len = s.len();
109        if null_count == s_len {
110            // All nulls is all equal
111            return Ok(true);
112        }
113        // Check if nulls are in the right location.
114        if null_count > 0 {
115            // The slice triggers a fast null count
116            if options.nulls_last {
117                if s.slice((s_len - null_count) as i64, null_count)
118                    .null_count()
119                    != null_count
120                {
121                    return Ok(false);
122                }
123            } else if s.slice(0, null_count).null_count() != null_count {
124                return Ok(false);
125            }
126        }
127
128        if s.dtype().is_primitive_numeric() {
129            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
130                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
131                return Ok(is_sorted_ca_num::<$T>(ca, options))
132            })
133        }
134
135        // Logical non-primitive types (e.g. String, Categorical, List, …): take only the contiguous
136        // non-null values (`non_null`). For ordinary `Categorical` use `iter_str` (below); otherwise
137        // `to_physical_repr`, then
138        // (1) for ordinary [`DataType::Categorical`], compare adjacent **decoded strings** (`iter_str`),
139        // (2) reuse `is_sorted_ca_num` when the physical type is primitive numeric (temporal /
140        //     Decimal, Enum-as-integer, …) after `to_physical_repr`;
141        // (3) uses a dedicated kernel for boolean values,
142        // (4) else scans string / binary values with `TotalOrd`,
143        // (5) else fall back to pairwise `Series::lt_eq` / `gt_eq` (nested types, etc.).
144        let non_null_len = s_len - null_count;
145        if non_null_len <= 1 {
146            return Ok(true);
147        }
148
149        let offset = (!options.nulls_last as i64) * (null_count as i64);
150        let non_null = s.slice(offset, non_null_len);
151        debug_assert_eq!(
152            non_null.null_count(),
153            0,
154            "internal error: `is_sorted` non-null slice contains nulls"
155        );
156
157        #[cfg(feature = "dtype-categorical")]
158        if matches!(non_null.dtype(), DataType::Categorical(_, _)) {
159            return is_sorted_categorical_lexical_adjacent(&non_null, options);
160        }
161
162        let phys = non_null.to_physical_repr();
163        let s_phys = phys.as_ref();
164        if s_phys.dtype().is_primitive_numeric() {
165            with_match_physical_numeric_polars_type!(s_phys.dtype(), |$T| {
166                let ca: &ChunkedArray<$T> = s_phys.as_ref().as_ref().as_ref();
167                return Ok(is_sorted_ca_num::<$T>(ca, options))
168            })
169        }
170
171        match s_phys.dtype() {
172            DataType::Boolean => {
173                let ca = s_phys.bool()?;
174                Ok(is_sorted_ca_bool(ca, options.descending))
175            },
176            DataType::String => {
177                let ca = s_phys.str()?;
178                Ok(is_sorted_adjacent_total_ord(
179                    ca.no_null_iter(),
180                    options.descending,
181                ))
182            },
183            DataType::Binary => {
184                let ca = s_phys.binary()?;
185                Ok(is_sorted_adjacent_total_ord(
186                    ca.no_null_iter(),
187                    options.descending,
188                ))
189            },
190            DataType::BinaryOffset => {
191                let ca = s_phys.binary_offset()?;
192                Ok(is_sorted_adjacent_total_ord(
193                    ca.no_null_iter(),
194                    options.descending,
195                ))
196            },
197            _ => {
198                // `non_null` excludes nulls already; compare `non_null[..-1]` with `non_null[1..]`.
199                let cmp_len = non_null_len - 1;
200                let s1 = non_null.slice(0, cmp_len);
201                let s2 = non_null.slice(1, cmp_len);
202                let cmp_op = if options.descending {
203                    Series::gt_eq
204                } else {
205                    Series::lt_eq
206                };
207                Ok(cmp_op(&s1, &s2)?.all())
208            },
209        }
210    }
211}
212
213/// Returns whether iterator elements are non-decreasing (`descending == false`) or non-increasing
214/// (`descending == true`) under [`TotalOrd`].
215///
216/// Assumes the iterator `it` yields **only** the non-null values in row order (one item per row). An empty
217/// iterator is considered sorted. Stops at the first pair that violates the ordering.
218fn is_sorted_adjacent_total_ord<T: TotalOrd>(
219    it: impl Iterator<Item = T>,
220    descending: bool,
221) -> bool {
222    let mut it = it;
223    // Sliding window: `prev` is always the previous element; seed with the first value.
224    let Some(mut prev) = it.next() else {
225        return true;
226    };
227    if descending {
228        for v in it {
229            if !prev.tot_ge(&v) {
230                return false;
231            }
232            prev = v;
233        }
234    } else {
235        for v in it {
236            if !prev.tot_le(&v) {
237                return false;
238            }
239            prev = v;
240        }
241    }
242    true
243}
244
245/// Ordinary [`DataType::Categorical`]: lexical order via adjacent decoded strings (`iter_str`), same as
246/// `Series::lt_eq` / `gt_eq`, but without a Boolean series. Caller must pass a contiguous **non-null**
247/// slice.
248#[cfg(feature = "dtype-categorical")]
249fn is_sorted_categorical_lexical_adjacent(s: &Series, options: SortOptions) -> PolarsResult<bool> {
250    polars_ensure!(
251        matches!(s.dtype(), DataType::Categorical(_, _)),
252        ComputeError: "internal error: expected Categorical in lexical `is_sorted` path",
253    );
254
255    with_match_categorical_physical_type!(s.dtype().cat_physical().unwrap(), |$C| {
256        let ca = s.cat::<$C>()?;
257
258        // `ca.null_count() == 0` implies each `phys` row decodes via `iter_str` to `Some(..)`
259        Ok(is_sorted_adjacent_total_ord(
260            ca.iter_str().map(|opt| {
261                opt.expect(
262                    "`iter_str` produced None while categorical null_count reported 0 (`is_sorted`)"
263                )
264            }),
265            options.descending,
266        ))
267    })
268}
269
270/// Booleans ordered as [`false`] < [`true`] (same as inequality comparisons on [`BooleanChunked`]).
271///
272/// Monotone order is equivalent to at most one plateau change: ascending is `F…FT…T`, descending is
273/// `T…TF…F`. Implemented with `first_true_idx` / `first_false_idx` plus a global false/true count
274/// check.
275///
276/// Caller must ensure **`ca` has no nulls** on the flattened series (see `non_null` slice above).
277fn is_sorted_ca_bool(ca: &BooleanChunked, descending: bool) -> bool {
278    let len = ca.len();
279    if len <= 1 {
280        return true;
281    }
282    debug_assert_eq!(
283        ca.null_count(),
284        0,
285        "internal error: `is_sorted_ca_bool` expects a non-null boolean slice"
286    );
287    if descending {
288        let Some(idx) = ca.first_false_idx() else {
289            return true;
290        };
291        !ca.slice(idx as i64, ca.len() - idx).any()
292    } else {
293        let Some(idx) = ca.first_true_idx() else {
294            return true;
295        };
296        ca.slice(idx as i64, ca.len() - idx).all()
297    }
298}
299
300fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
301    vals: &[T],
302    f: Cmp,
303    previous: &mut T,
304) -> bool {
305    let mut sorted = true;
306
307    // Outer loop so we can fail fast
308    // Inner loop will auto vectorize
309    for c in vals.chunks(1024) {
310        // don't early stop or branch
311        // so it autovectorizes
312        for v in c {
313            sorted &= f(previous, v);
314            *previous = *v;
315        }
316        if !sorted {
317            return false;
318        }
319    }
320    sorted
321}
322
323// Assumes nulls last/first is already checked.
324fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
325    if let Ok(vals) = ca.cont_slice() {
326        let mut previous = vals[0];
327        return if options.descending {
328            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
329        } else {
330            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
331        };
332    };
333
334    if ca.null_count() == 0 {
335        let mut previous = if options.descending {
336            T::Native::max_value()
337        } else {
338            T::Native::min_value()
339        };
340        for arr in ca.downcast_iter() {
341            let vals = arr.values();
342
343            let sorted = if options.descending {
344                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
345            } else {
346                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
347            };
348            if !sorted {
349                return false;
350            }
351        }
352        return true;
353    };
354
355    // Slice off nulls and recurse.
356    let null_count = ca.null_count();
357    if options.nulls_last {
358        let ca = ca.slice(0, ca.len() - null_count);
359        is_sorted_ca_num(&ca, options)
360    } else {
361        let ca = ca.slice(null_count as i64, ca.len() - null_count);
362        is_sorted_ca_num(&ca, options)
363    }
364}
365
366impl SeriesMethods for Series {}