use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;

use num_traits::NumCast;
use polars_compute::rolling::QuantileMethod;
use polars_utils::format_pl_smallstr;
use polars_utils::hashing::DirtyHash;
use rayon::prelude::*;

use self::hashing::*;
use crate::POOL;
use crate::prelude::*;
use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};

pub mod aggregations;
pub mod expr;
pub(crate) mod hashing;
mod into_groups;
mod position;

pub use into_groups::*;
pub use position::*;

use crate::chunked_array::ops::row_encode::{
    encode_rows_unordered, encode_rows_vertical_par_unordered,
};

impl DataFrame {
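    /// Group the `DataFrame` using the given key columns.
    ///
    /// Keys of length 1 are broadcast to the common height; every other key must have the
    /// same length as the `DataFrame` (or, for a zero-width frame, as the longest key).
    /// `multithreaded` controls whether the group tuples are computed in parallel and
    /// `sorted` whether the resulting groups are ordered by their first row index.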
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy<'_>> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );

        let common_height = if self.width() > 0 {
            self.height()
        } else {
            by.iter().map(|s| s.len()).max().expect("at least 1 key")
        };
        for by_key in by.iter_mut() {
            if by_key.len() != common_height {
                polars_ensure!(
                    by_key.len() == 1,
                    ShapeMismatch: "series used as keys should have the same length as the DataFrame"
                );
                *by_key = by_key.new_from_index(0, common_height)
            }
        }

        let groups = if by.len() == 1 {
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            #[cfg(not(feature = "object"))]
            {
                unreachable!()
            }
        } else {
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                let groups = if self.is_empty() {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };
                Ok(GroupsType::new_slice(groups, false, true))
            } else {
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

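    /// Group the `DataFrame` using the column names in `by`.
    ///
    /// This is a multithreaded convenience wrapper around
    /// [`DataFrame::group_by_with_series`]; the order of the resulting groups is not defined.
    ///
    /// # Example
    ///
    /// A minimal sketch; `"column_name"` and `"agg_column_name"` are placeholder names:
    ///
    /// ```
    /// use polars_core::prelude::*;
    ///
    /// fn group_by_count(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["column_name"])?
    ///         .select(["agg_column_name"])
    ///         .count()
    /// }
    /// ```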
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

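    /// Group the `DataFrame` using the column names in `by`, with the resulting groups
    /// ordered by their first occurrence (smallest row index).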
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}

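/// The result of a [`DataFrame::group_by`] operation: a reference to the original `DataFrame`,
/// the key columns, the computed group positions, and an optional selection of the columns to
/// aggregate.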
#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    pub df: &'a DataFrame,
    pub(crate) selected_keys: Vec<Column>,
    groups: GroupPositions,
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}

impl<'a> GroupBy<'a> {
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

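    /// Select the column(s) that should be aggregated.
    /// You can select a single column or a slice of columns.
    ///
    /// Note that making a selection with this method is not required. If you skip it, all the
    /// columns except the group keys are selected for aggregation.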
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

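    /// Get a mutable reference to the internal group positions.
    ///
    /// # Safety
    ///
    /// The group positions must stay in bounds of the `DataFrame` held by this `GroupBy`;
    /// the aggregation methods index into the data unchecked.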
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    pub fn take_groups(self) -> GroupPositions {
        self.groups
    }

    pub fn take_groups_mut(&mut self) -> GroupPositions {
        std::mem::take(&mut self.groups)
    }

    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice {
                            groups,
                            overlapping,
                            monotonic: _,
                        } => {
                            if *overlapping && !groups.is_empty() {
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

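    /// Get the key columns with one row per group, taken at the first row index of each group.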
    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_columns_impl(selection.as_slice()),
            None => {
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_columns_impl(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

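    /// Aggregate the grouped `Series` to their mean values; each aggregated column is named
    /// `{column}_mean`.
    ///
    /// # Example
    ///
    /// A minimal sketch; the column names are placeholders:
    ///
    /// ```
    /// use polars_core::prelude::*;
    ///
    /// fn group_by_mean(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     #[allow(deprecated)]
    ///     let out = df
    ///         .group_by(["column_name"])?
    ///         .select(["agg_column_name"])
    ///         .mean()?;
    ///     Ok(out)
    /// }
    /// ```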
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

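    /// Aggregate the grouped `Series` to the given `quantile`, which must lie within
    /// `0.0..=1.0`; each aggregated column is named `{column}_quantile_{quantile:.2}`.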
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be within 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

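    /// Aggregate the groups to a count per group, including null values; each aggregated
    /// column is named `{column}_count`.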
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new(cols)
    }

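    /// Get the row indices per group as a `DataFrame`: the key columns plus a `"groups"`
    /// column holding a list of row indices for each group.
    ///
    /// # Example
    ///
    /// A minimal sketch; `"column_name"` is a placeholder:
    ///
    /// ```
    /// use polars_core::prelude::*;
    ///
    /// fn group_indices(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["column_name"])?.groups()
    /// }
    /// ```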
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn agg_list(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode);
            let mut agg = unsafe { agg_col.agg_list(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_columns_impl(agg.as_slice())?;
                new_cols.extend(cols);
                Ok(unsafe { DataFrame::new_no_checks(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

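    /// Apply a closure over the groups as new sub-`DataFrame`s in parallel and vertically
    /// concatenate the results.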
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.as_single_chunk_par();
        Ok(df)
    }

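    /// Apply a closure over the groups as new sub-`DataFrame`s and vertically concatenate the
    /// results.
    ///
    /// # Example
    ///
    /// A minimal sketch that keeps the first row of every group; `"column_name"` is a
    /// placeholder:
    ///
    /// ```
    /// use polars_core::prelude::*;
    ///
    /// fn first_row_per_group(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["column_name"])?.apply(|group| Ok(group.head(Some(1))))
    /// }
    /// ```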
    pub fn apply<F>(&self, mut f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.as_single_chunk_par();
        Ok(df)
    }

    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = self.groups.slice(offset, length);
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}

unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
    match g {
        GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
        GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
    }
}

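/// The aggregation methods that can be applied to the groups; [`fmt_group_by_column`] derives
/// the output column name from a method and the input column name.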
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    FirstNonNull,
    Last,
    LastNonNull,
    Item { allow_empty: bool },
    Sum,
    Groups,
    NUnique,
    Quantile(f64, QuantileMethod),
    Count { include_nulls: bool },
    Implode,
    Std(u8),
    Var(u8),
}

impl Display for GroupByMethod {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        use GroupByMethod::*;
        let s = match self {
            Min => "min",
            NanMin => "nan_min",
            Max => "max",
            NanMax => "nan_max",
            Median => "median",
            Mean => "mean",
            First => "first",
            FirstNonNull => "first_non_null",
            Last => "last",
            LastNonNull => "last_non_null",
            Item { .. } => "item",
            Sum => "sum",
            Groups => "groups",
            NUnique => "n_unique",
            Quantile(_, _) => "quantile",
            Count { .. } => "count",
            Implode => "list",
            Std(_) => "std",
            Var(_) => "var",
        };
        write!(f, "{s}")
    }
}

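/// Build the output column name for an aggregation, e.g. `Sum` applied to a column named
/// `"temp"` yields `"temp_sum"`, while `Groups` always yields `"groups"`.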
pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
    use GroupByMethod::*;
    match method {
        Min => format_pl_smallstr!("{name}_min"),
        Max => format_pl_smallstr!("{name}_max"),
        NanMin => format_pl_smallstr!("{name}_nan_min"),
        NanMax => format_pl_smallstr!("{name}_nan_max"),
        Median => format_pl_smallstr!("{name}_median"),
        Mean => format_pl_smallstr!("{name}_mean"),
        First => format_pl_smallstr!("{name}_first"),
        FirstNonNull => format_pl_smallstr!("{name}_first_non_null"),
        Last => format_pl_smallstr!("{name}_last"),
        LastNonNull => format_pl_smallstr!("{name}_last_non_null"),
        Item { .. } => format_pl_smallstr!("{name}_item"),
        Sum => format_pl_smallstr!("{name}_sum"),
        Groups => PlSmallStr::from_static("groups"),
        NUnique => format_pl_smallstr!("{name}_n_unique"),
        Count { .. } => format_pl_smallstr!("{name}_count"),
        Implode => format_pl_smallstr!("{name}_agg_list"),
        Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
        Std(_) => format_pl_smallstr!("{name}_agg_std"),
        Var(_) => format_pl_smallstr!("{name}_agg_var"),
    }
}

#[cfg(test)]
mod test {
    use num_traits::FloatConst;

    use crate::prelude::*;

    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new(vec![s0, s1, s2]).unwrap();

        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        #[allow(deprecated)]
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df =
            DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
            "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
            "ham" => ["a", "a", "b", "b", "c"],
            "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        df.apply("foo", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}