polars_ops/series/ops/
various.rs

1use num_traits::Bounded;
2#[cfg(feature = "dtype-struct")]
3use polars_core::chunked_array::ops::row_encode::_get_rows_encoded_ca;
4use polars_core::prelude::arity::unary_elementwise_values;
5use polars_core::prelude::*;
6use polars_core::series::IsSorted;
7use polars_core::with_match_physical_numeric_polars_type;
8use polars_utils::aliases::PlSeedableRandomStateQuality;
9use polars_utils::total_ord::TotalOrd;
10
11use crate::series::ops::SeriesSealed;
12
13pub trait SeriesMethods: SeriesSealed {
14    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
15    /// with dtype [`IdxType`]
16    fn value_counts(
17        &self,
18        sort: bool,
19        parallel: bool,
20        name: PlSmallStr,
21        normalize: bool,
22    ) -> PolarsResult<DataFrame> {
23        let s = self.as_series();
24        polars_ensure!(
25            s.name() != &name,
26            Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate \
27            column names; change `name` to fix", name,
28        );
29        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
30        let groups = s.group_tuples(parallel, sort)?;
31        let values = unsafe { s.agg_first(&groups) }.into();
32        let counts = groups.group_count().with_name(name.clone());
33
34        let counts = if normalize {
35            let len = s.len() as f64;
36            let counts: Float64Chunked =
37                unary_elementwise_values(&counts, |count| count as f64 / len);
38            counts.into_column()
39        } else {
40            counts.into_column()
41        };
42
43        let height = counts.len();
44        let cols = vec![values, counts];
45        let df = unsafe { DataFrame::new_no_checks(height, cols) };
46        if sort {
47            df.sort(
48                [name],
49                SortMultipleOptions::default()
50                    .with_order_descending(true)
51                    .with_multithreaded(parallel),
52            )
53        } else {
54            Ok(df)
55        }
56    }
57
58    #[cfg(feature = "hash")]
59    fn hash(&self, build_hasher: PlSeedableRandomStateQuality) -> UInt64Chunked {
60        let s = self.as_series().to_physical_repr();
61        match s.dtype() {
62            DataType::List(_) => {
63                let mut ca = s.list().unwrap().clone();
64                crate::chunked_array::hash::hash(&mut ca, build_hasher)
65            },
66            _ => {
67                let mut h = vec![];
68                s.0.vec_hash(build_hasher, &mut h).unwrap();
69                UInt64Chunked::from_vec(s.name().clone(), h)
70            },
71        }
72    }
73
74    fn ensure_sorted_arg(&self, operation: &str) -> PolarsResult<()> {
75        polars_ensure!(self.is_sorted(Default::default())?, InvalidOperation: "argument in operation '{}' is not sorted, please sort the 'expr/series/column' first", operation);
76        Ok(())
77    }
78
79    /// Checks if a [`Series`] is sorted. Tries to fail fast.
80    fn is_sorted(&self, options: SortOptions) -> PolarsResult<bool> {
81        let s = self.as_series();
82        let null_count = s.null_count();
83
84        // fast paths
85        if (options.descending
86            && (options.nulls_last || null_count == 0)
87            && matches!(s.is_sorted_flag(), IsSorted::Descending))
88            || (!options.descending
89                && (!options.nulls_last || null_count == 0)
90                && matches!(s.is_sorted_flag(), IsSorted::Ascending))
91        {
92            return Ok(true);
93        }
94
95        // for struct types we row-encode and recurse
96        #[cfg(feature = "dtype-struct")]
97        if matches!(s.dtype(), DataType::Struct(_)) {
98            let encoded = _get_rows_encoded_ca(
99                PlSmallStr::EMPTY,
100                &[s.clone().into()],
101                &[options.descending],
102                &[options.nulls_last],
103            )?;
104            return encoded.into_series().is_sorted(options);
105        }
106
107        let s_len = s.len();
108        if null_count == s_len {
109            // All nulls is all equal
110            return Ok(true);
111        }
112        // Check if nulls are in the right location.
113        if null_count > 0 {
114            // The slice triggers a fast null count
115            if options.nulls_last {
116                if s.slice((s_len - null_count) as i64, null_count)
117                    .null_count()
118                    != null_count
119                {
120                    return Ok(false);
121                }
122            } else if s.slice(0, null_count).null_count() != null_count {
123                return Ok(false);
124            }
125        }
126
127        if s.dtype().is_primitive_numeric() {
128            with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
129                let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
130                return Ok(is_sorted_ca_num::<$T>(ca, options))
131            })
132        }
133
134        let cmp_len = s_len - null_count - 1; // Number of comparisons we might have to do
135        // TODO! Change this, allocation of a full boolean series is too expensive and doesn't fail fast.
136        // Compare adjacent elements with no-copy slices that don't include any nulls
137        let offset = !options.nulls_last as i64 * null_count as i64;
138        let (s1, s2) = (s.slice(offset, cmp_len), s.slice(offset + 1, cmp_len));
139        let cmp_op = if options.descending {
140            Series::gt_eq
141        } else {
142            Series::lt_eq
143        };
144        Ok(cmp_op(&s1, &s2)?.all())
145    }
146}
147
148fn check_cmp<T: NumericNative, Cmp: Fn(&T, &T) -> bool>(
149    vals: &[T],
150    f: Cmp,
151    previous: &mut T,
152) -> bool {
153    let mut sorted = true;
154
155    // Outer loop so we can fail fast
156    // Inner loop will auto vectorize
157    for c in vals.chunks(1024) {
158        // don't early stop or branch
159        // so it autovectorizes
160        for v in c {
161            sorted &= f(previous, v);
162            *previous = *v;
163        }
164        if !sorted {
165            return false;
166        }
167    }
168    sorted
169}
170
171// Assumes nulls last/first is already checked.
172fn is_sorted_ca_num<T: PolarsNumericType>(ca: &ChunkedArray<T>, options: SortOptions) -> bool {
173    if let Ok(vals) = ca.cont_slice() {
174        let mut previous = vals[0];
175        return if options.descending {
176            check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
177        } else {
178            check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
179        };
180    };
181
182    if ca.null_count() == 0 {
183        let mut previous = if options.descending {
184            T::Native::max_value()
185        } else {
186            T::Native::min_value()
187        };
188        for arr in ca.downcast_iter() {
189            let vals = arr.values();
190
191            let sorted = if options.descending {
192                check_cmp(vals, |prev, c| prev.tot_ge(c), &mut previous)
193            } else {
194                check_cmp(vals, |prev, c| prev.tot_le(c), &mut previous)
195            };
196            if !sorted {
197                return false;
198            }
199        }
200        return true;
201    };
202
203    // Slice off nulls and recurse.
204    let null_count = ca.null_count();
205    if options.nulls_last {
206        let ca = ca.slice(0, ca.len() - null_count);
207        is_sorted_ca_num(&ca, options)
208    } else {
209        let ca = ca.slice(null_count as i64, ca.len() - null_count);
210        is_sorted_ca_num(&ca, options)
211    }
212}
213
214impl SeriesMethods for Series {}