polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8use polars_utils::total_ord::TotalOrd;
9
10use crate::series::ops::SeriesSealed;
11
12pub trait SeriesMethods: SeriesSealed {
13    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
14    /// with dtype [`IdxType`]
15    fn value_counts(
16        &self,
17        sort: bool,
18        parallel: bool,
19        name: PlSmallStr,
20        normalize: bool,
21    ) -> PolarsResult<DataFrame> {
22        let s = self.as_series();
23        polars_ensure!(
24            s.name() != &name,
25            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
26            column names; change `name` to fix", name,
27        );
28        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
29        let groups = s.group_tuples(parallel, sort)?;
30        let values = unsafe { s.agg_first(&groups) }.into();
31        let counts = groups.group_count().with_name(name.clone());
32
33        let counts = if normalize {
34            let len = s.len() as f64;
35            let counts: Float64Chunked =
36                unary_elementwise_values(&counts, |count| count as f64 / len);
37            counts.into_column()
38        } else {
39            counts.into_column()
40        };
41
42        let height = counts.len();
43        let cols = vec![values, counts];
44        let df = unsafe { DataFrame::new_no_checks(height, cols) };
45        if sort {
46            df.sort(
47                [name],
48                SortMultipleOptions::default()
49                    .with_order_descending(true)
50                    .with_multithreaded(parallel),
51            )
52        } else {
53            Ok(df)
54        }
55    }
56
57    #[cfg(feature = "hash")]
58    fn hash(&self, build_hasher: PlRandomState) -> UInt64Chunked {
59        let s = self.as_series().to_physical_repr();
60        match s.dtype() {
61            DataType::List(_) => {
62                let mut ca = s.list().unwrap().clone();
63                crate::chunked_array::hash::hash(&mut ca, build_hasher)
64            },
65            _ => {
66                let mut h = vec![];
67                s.0.vec_hash(build_hasher, &mut h).unwrap();
68                UInt64Chunked::from_vec(s.name().clone(), h)
69            },
70        }
71    }
72
73    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
74        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
75        Ok(())
76    }
77
78    /// Checks if a [`Series`] is sorted. Tries to fail fast.
79    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
80        let s = self.as_series();
81        let null_count = s.null_count();
82
83        // fast paths
84        if (options.descending
85            && (options.nulls_last || null_count == 0)
86            && matches!(s.is_sorted_flag(), IsSorted::Descending))
87            || (!options.descending
88                && (!options.nulls_last || null_count == 0)
89                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
90        {
91            return Ok(true);
92        }
93
94        // for struct types we row-encode and recurse
95        #[cfg(feature = "dtype-struct")]
96        if matches!(s.dtype(), DataType::Struct(_)) {
97            let encoded = _get_rows_encoded_ca(
98                PlSmallStr::EMPTY,
99                &[s.clone().into()],
100                &[options.descending],
101                &[options.nulls_last],
102            )?;
103            return encoded.into_series().is_sorted(options);
104        }
105
106        let s_len = s.len();
107        if null_count == s_len {
108            // All nulls is all equal
109            return Ok(true);
110        }
111        // Check if nulls are in the right location.
112        if null_count > 0 {
113            // The slice triggers a fast null count
114            if options.nulls_last {
115                if s.slice((s_len - null_count) as i64, null_count)
116                    .null_count()
117                    != null_count
118                {
119                    return Ok(false);
120                }
121            } else if s.slice(0, null_count).null_count() != null_count {
122                return Ok(false);
123            }
124        }
125
126        if s.dtype().is_primitive_numeric() {
127            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
128                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
129                return Ok(is_sorted_ca_num::<$T>(ca, options))
130            })
131        }
132
133        let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
134        // TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
135        // Compare adjacent elements with no-copy slices that don't include any nulls
136        let offset = !options.nulls_last as i64 * null_count as i64;
137        let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
138        let cmp_op = if options.descending {
139            Series::gt_eq
140        } else {
141            Series::lt_eq
142        };
143        Ok(cmp_op(&s1, &s2)?.all())
144    }
145}
146
147fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
148    vals: &[T],
149    f: Cmp,
150    previous: &mut T,
151) -> bool {
152    let mut sorted = true;
153
154    // Outer loop so we can fail fast
155    // Inner loop will auto vectorize
156    for c in vals.chunks(1024) {
157        // don't early stop or branch
158        // so it autovectorizes
159        for v in c {
160            sorted &= f(previous, v);
161            *previous = *v;
162        }
163        if !sorted {
164            return false;
165        }
166    }
167    sorted
168}
169
170// Assumes nulls last/first is already checked.
171fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
172    if let Ok(vals) = ca.cont_slice() {
173        let mut previous = vals[0];
174        return if options.descending {
175            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
176        } else {
177            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
178        };
179    };
180
181    if ca.null_count() == 0 {
182        let mut previous = if options.descending {
183            T::Native::max_value()
184        } else {
185            T::Native::min_value()
186        };
187        for arr in ca.downcast_iter() {
188            let vals = arr.values();
189
190            let sorted = if options.descending {
191                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
192            } else {
193                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
194            };
195            if !sorted {
196                return false;
197            }
198        }
199        return true;
200    };
201
202    // Slice off nulls and recurse.
203    let null_count = ca.null_count();
204    if options.nulls_last {
205        let ca = ca.slice(0, ca.len() - null_count);
206        is_sorted_ca_num(&ca, options)
207    } else {
208        let ca = ca.slice(null_count as i64, ca.len() - null_count);
209        is_sorted_ca_num(&ca, options)
210    }
211}
212
213impl SeriesMethods for Series {}