polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8#[cfg(feature = "hash")]
9use polars_utils::aliases::PlSeedableRandomStateQuality;
10use polars_utils::total_ord::TotalOrd;
11
12use crate::series::ops::SeriesSealed;
13
14pub trait SeriesMethods: SeriesSealed {
15    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
16    /// with dtype [`IdxType`]
17    fn value_counts(
18        &self,
19        sort: bool,
20        parallel: bool,
21        name: PlSmallStr,
22        normalize: bool,
23    ) -> PolarsResult<DataFrame> {
24        let s = self.as_series();
25        polars_ensure!(
26            s.name() != &name,
27            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
28            column names; change `name` to fix", name,
29        );
30        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
31        let groups = s.group_tuples(parallel, sort)?;
32        let values = unsafe { s.agg_first(&groups) }
33            .with_name(s.name().clone())
34            .into();
35        let counts = groups.group_count().with_name(name.clone());
36
37        let counts = if normalize {
38            let len = s.len() as f64;
39            let counts: Float64Chunked =
40                unary_elementwise_values(&counts, |count| count as f64 / len);
41            counts.into_column()
42        } else {
43            counts.into_column()
44        };
45
46        let height = counts.len();
47        let cols = vec![values, counts];
48        let df = unsafe { DataFrame::new_no_checks(height, cols) };
49        if sort {
50            df.sort(
51                [name],
52                SortMultipleOptions::default()
53                    .with_order_descending(true)
54                    .with_multithreaded(parallel),
55            )
56        } else {
57            Ok(df)
58        }
59    }
60
61    #[cfg(feature = "hash")]
62    fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
63        let s = self.as_series().to_physical_repr();
64        let mut h = vec![];
65        s.0.vec_hash(build_hasher, &mut h).unwrap();
66        UInt64Chunked::from_vec(s.name().clone(), h)
67    }
68
69    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
70        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
71        Ok(())
72    }
73
74    /// Checks if a [`Series`] is sorted. Tries to fail fast.
75    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
76        let s = self.as_series();
77        let null_count = s.null_count();
78
79        // fast paths
80        if (options.descending
81            && (options.nulls_last || null_count == 0)
82            && matches!(s.is_sorted_flag(), IsSorted::Descending))
83            || (!options.descending
84                && (!options.nulls_last || null_count == 0)
85                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
86        {
87            return Ok(true);
88        }
89
90        // for struct types we row-encode and recurse
91        #[cfg(feature = "dtype-struct")]
92        if matches!(s.dtype(), DataType::Struct(_)) {
93            let encoded = _get_rows_encoded_ca(
94                PlSmallStr::EMPTY,
95                &[s.clone().into()],
96                &[options.descending],
97                &[options.nulls_last],
98            )?;
99            return encoded.into_series().is_sorted(options);
100        }
101
102        let s_len = s.len();
103        if null_count == s_len {
104            // All nulls is all equal
105            return Ok(true);
106        }
107        // Check if nulls are in the right location.
108        if null_count > 0 {
109            // The slice triggers a fast null count
110            if options.nulls_last {
111                if s.slice((s_len - null_count) as i64, null_count)
112                    .null_count()
113                    != null_count
114                {
115                    return Ok(false);
116                }
117            } else if s.slice(0, null_count).null_count() != null_count {
118                return Ok(false);
119            }
120        }
121
122        if s.dtype().is_primitive_numeric() {
123            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
124                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
125                return Ok(is_sorted_ca_num::<$T>(ca, options))
126            })
127        }
128
129        let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
130        // TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
131        // Compare adjacent elements with no-copy slices that don't include any nulls
132        let offset = !options.nulls_last as i64 * null_count as i64;
133        let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
134        let cmp_op = if options.descending {
135            Series::gt_eq
136        } else {
137            Series::lt_eq
138        };
139        Ok(cmp_op(&s1, &s2)?.all())
140    }
141}
142
143fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
144    vals: &[T],
145    f: Cmp,
146    previous: &mut T,
147) -> bool {
148    let mut sorted = true;
149
150    // Outer loop so we can fail fast
151    // Inner loop will auto vectorize
152    for c in vals.chunks(1024) {
153        // don't early stop or branch
154        // so it autovectorizes
155        for v in c {
156            sorted &= f(previous, v);
157            *previous = *v;
158        }
159        if !sorted {
160            return false;
161        }
162    }
163    sorted
164}
165
166// Assumes nulls last/first is already checked.
167fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
168    if let Ok(vals) = ca.cont_slice() {
169        let mut previous = vals[0];
170        return if options.descending {
171            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
172        } else {
173            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
174        };
175    };
176
177    if ca.null_count() == 0 {
178        let mut previous = if options.descending {
179            T::Native::max_value()
180        } else {
181            T::Native::min_value()
182        };
183        for arr in ca.downcast_iter() {
184            let vals = arr.values();
185
186            let sorted = if options.descending {
187                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
188            } else {
189                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
190            };
191            if !sorted {
192                return false;
193            }
194        }
195        return true;
196    };
197
198    // Slice off nulls and recurse.
199    let null_count = ca.null_count();
200    if options.nulls_last {
201        let ca = ca.slice(0, ca.len() - null_count);
202        is_sorted_ca_num(&ca, options)
203    } else {
204        let ca = ca.slice(null_count as i64, ca.len() - null_count);
205        is_sorted_ca_num(&ca, options)
206    }
207}
208
// `Series` takes every `SeriesMethods` method from the trait's default bodies; nothing is
// overridden here.
impl SeriesMethods for Series {}