polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8#[cfg(feature = "hash")]
9use polars_utils::aliases::PlSeedableRandomStateQuality;
10use polars_utils::total_ord::TotalOrd;
11
12use crate::series::ops::SeriesSealed;
13
14pub trait SeriesMethods: SeriesSealed {
15    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
16    /// with dtype [`IdxType`]
17    fn value_counts(
18        &self,
19        sort: bool,
20        parallel: bool,
21        name: PlSmallStr,
22        normalize: bool,
23    ) -> PolarsResult<DataFrame> {
24        let s = self.as_series();
25        polars_ensure!(
26            s.name() != &name,
27            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
28            column names; change `name` to fix", name,
29        );
30        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
31        let groups = s.group_tuples(parallel, sort)?;
32        let values = unsafe { s.agg_first(&groups) }
33            .with_name(s.name().clone())
34            .into();
35        let counts = groups.group_count().with_name(name.clone());
36
37        let counts = if normalize {
38            let len = s.len() as f64;
39            let counts: Float64Chunked =
40                unary_elementwise_values(&counts, |count| count as f64 / len);
41            counts.into_column()
42        } else {
43            counts.into_column()
44        };
45
46        let height = counts.len();
47        let cols = vec![values, counts];
48        let df = unsafe { DataFrame::new_no_checks(height, cols) };
49        if sort {
50            df.sort(
51                [name],
52                SortMultipleOptions::default()
53                    .with_order_descending(true)
54                    .with_multithreaded(parallel),
55            )
56        } else {
57            Ok(df)
58        }
59    }
60
61    #[cfg(feature = "hash")]
62    fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
63        let s = self.as_series().to_physical_repr();
64        match s.dtype() {
65            DataType::List(_) => {
66                let mut ca = s.list().unwrap().clone();
67                crate::chunked_array::hash::hash(&mut ca, build_hasher)
68            },
69            _ => {
70                let mut h = vec![];
71                s.0.vec_hash(build_hasher, &mut h).unwrap();
72                UInt64Chunked::from_vec(s.name().clone(), h)
73            },
74        }
75    }
76
77    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
78        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
79        Ok(())
80    }
81
82    /// Checks if a [`Series`] is sorted. Tries to fail fast.
83    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
84        let s = self.as_series();
85        let null_count = s.null_count();
86
87        // fast paths
88        if (options.descending
89            && (options.nulls_last || null_count == 0)
90            && matches!(s.is_sorted_flag(), IsSorted::Descending))
91            || (!options.descending
92                && (!options.nulls_last || null_count == 0)
93                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
94        {
95            return Ok(true);
96        }
97
98        // for struct types we row-encode and recurse
99        #[cfg(feature = "dtype-struct")]
100        if matches!(s.dtype(), DataType::Struct(_)) {
101            let encoded = _get_rows_encoded_ca(
102                PlSmallStr::EMPTY,
103                &[s.clone().into()],
104                &[options.descending],
105                &[options.nulls_last],
106            )?;
107            return encoded.into_series().is_sorted(options);
108        }
109
110        let s_len = s.len();
111        if null_count == s_len {
112            // All nulls is all equal
113            return Ok(true);
114        }
115        // Check if nulls are in the right location.
116        if null_count > 0 {
117            // The slice triggers a fast null count
118            if options.nulls_last {
119                if s.slice((s_len - null_count) as i64, null_count)
120                    .null_count()
121                    != null_count
122                {
123                    return Ok(false);
124                }
125            } else if s.slice(0, null_count).null_count() != null_count {
126                return Ok(false);
127            }
128        }
129
130        if s.dtype().is_primitive_numeric() {
131            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
132                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
133                return Ok(is_sorted_ca_num::<$T>(ca, options))
134            })
135        }
136
137        let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
138        // TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
139        // Compare adjacent elements with no-copy slices that don't include any nulls
140        let offset = !options.nulls_last as i64 * null_count as i64;
141        let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
142        let cmp_op = if options.descending {
143            Series::gt_eq
144        } else {
145            Series::lt_eq
146        };
147        Ok(cmp_op(&s1, &s2)?.all())
148    }
149}
150
151fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
152    vals: &[T],
153    f: Cmp,
154    previous: &mut T,
155) -> bool {
156    let mut sorted = true;
157
158    // Outer loop so we can fail fast
159    // Inner loop will auto vectorize
160    for c in vals.chunks(1024) {
161        // don't early stop or branch
162        // so it autovectorizes
163        for v in c {
164            sorted &= f(previous, v);
165            *previous = *v;
166        }
167        if !sorted {
168            return false;
169        }
170    }
171    sorted
172}
173
174// Assumes nulls last/first is already checked.
175fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
176    if let Ok(vals) = ca.cont_slice() {
177        let mut previous = vals[0];
178        return if options.descending {
179            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
180        } else {
181            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
182        };
183    };
184
185    if ca.null_count() == 0 {
186        let mut previous = if options.descending {
187            T::Native::max_value()
188        } else {
189            T::Native::min_value()
190        };
191        for arr in ca.downcast_iter() {
192            let vals = arr.values();
193
194            let sorted = if options.descending {
195                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
196            } else {
197                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
198            };
199            if !sorted {
200                return false;
201            }
202        }
203        return true;
204    };
205
206    // Slice off nulls and recurse.
207    let null_count = ca.null_count();
208    if options.nulls_last {
209        let ca = ca.slice(0, ca.len() - null_count);
210        is_sorted_ca_num(&ca, options)
211    } else {
212        let ca = ca.slice(null_count as i64, ca.len() - null_count);
213        is_sorted_ca_num(&ca, options)
214    }
215}
216
217impl SeriesMethods for Series {}