1use std::fmt::{Debug, Display, Formatter};
2use std::hash::Hash;
3
4use num_traits::NumCast;
5use polars_compute::rolling::QuantileMethod;
6use polars_utils::format_pl_smallstr;
7use polars_utils::hashing::DirtyHash;
8use rayon::prelude::*;
9
10use self::hashing::*;
11use crate::POOL;
12use crate::prelude::*;
13use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};
14
15pub mod aggregations;
16pub mod expr;
17pub(crate) mod hashing;
18mod into_groups;
19mod position;
20
21pub use into_groups::*;
22pub use position::*;
23
24use crate::chunked_array::ops::row_encode::{
25 encode_rows_unordered, encode_rows_vertical_par_unordered,
26};
27
impl DataFrame {
    /// Group this `DataFrame` by the (already selected) `by` key columns.
    ///
    /// * `multithreaded` — allow parallel hashing / row-encoding of the keys.
    /// * `sorted` — produce the groups in first-occurrence order (used by the
    ///   stable group-by).
    ///
    /// Length-1 key columns are broadcast to the common height; any other
    /// length mismatch raises a `ShapeMismatch` error. At least one key is
    /// required.
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy<'_>> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );

        // For a zero-width frame the keys alone determine the height.
        let common_height = if self.width() > 0 {
            self.height()
        } else {
            by.iter().map(|s| s.len()).max().expect("at least 1 key")
        };
        for by_key in by.iter_mut() {
            if by_key.len() != common_height {
                polars_ensure!(
                    by_key.len() == 1,
                    ShapeMismatch: "series used as keys should have the same length as the DataFrame"
                );
                // Broadcast the scalar (length-1) key to the common height.
                *by_key = by_key.new_from_index(0, common_height)
            }
        }

        let groups = if by.len() == 1 {
            // Single key: group directly on the materialized series.
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            // Object dtypes cannot be row-encoded; fall back to grouping on
            // AnyValue rows (only available with the "object" feature).
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            // The `is_object` check can only be true with the feature enabled.
            #[cfg(not(feature = "object"))]
            {
                unreachable!()
            }
        } else {
            // Multiple keys: drop Null-dtype keys, then group on the
            // row-encoded representation of the remaining keys.
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                // Only Null-dtype keys: all rows fall into a single group.
                let groups = if self.is_empty() {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };
                Ok(GroupsType::new_slice(groups, false, true))
            } else {
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

    /// Group by the columns named in `by` (hash group-by; group order is not
    /// guaranteed).
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

    /// Like [`DataFrame::group_by`], but the groups are returned in a stable
    /// (sorted) order.
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}
135
/// Utility struct produced by a group-by operation: holds the source
/// [`DataFrame`], the key columns and the computed group positions, plus an
/// optional column selection for the eager aggregation methods.
#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    /// The `DataFrame` the groups were computed on.
    pub df: &'a DataFrame,
    /// The key columns that were used to build the groups.
    pub(crate) selected_keys: Vec<Column>,
    /// The positions (index lists or contiguous slices) of every group.
    groups: GroupPositions,
    /// Columns selected for aggregation; `None` means all non-key columns.
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}
194
impl<'a> GroupBy<'a> {
    /// Create a `GroupBy` from pre-computed parts.
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

    /// Select the columns that will be aggregated, replacing any previous
    /// selection.
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    /// Borrow the computed group positions.
    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

    /// Mutably borrow the group positions.
    ///
    /// # Safety
    /// The caller must keep the groups valid for `self.df` (e.g. all stored
    /// indices in bounds); this is not re-validated afterwards.
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    /// Consume `self` and return the group positions.
    pub fn into_groups(self) -> GroupPositions {
        self.groups
    }

    /// Materialize the key columns with one value per group, optionally
    /// restricted to an `(offset, len)` slice of the groups.
    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        // Keeps the sliced groups alive while they are borrowed below.
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            // Take the first index of every group.
                            // SAFETY: these indices were created from this
                            // DataFrame, so they are in bounds.
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice {
                            groups,
                            overlapping,
                            monotonic: _,
                        } => {
                            if *overlapping && !groups.is_empty() {
                                // Overlapping slice groups: return the single
                                // contiguous span covering all groups.
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            // Gather the start row of every slice group.
                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            // SAFETY: group slice starts stem from this
                            // DataFrame, so the indices are in bounds.
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

    /// Materialize the key columns with one value per group.
    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

    /// Return `(keys, aggregation columns)`: the materialized keys plus
    /// either the explicitly selected columns or, when nothing was selected,
    /// all non-key columns of the frame.
    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_columns_impl(selection.as_slice()),
            None => {
                // Default: every column that is not a group-by key.
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_columns_impl(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

    /// Aggregate the grouped columns to their mean; output columns get a
    /// `_mean` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            // SAFETY: `self.groups` was computed on `self.df`, so it is valid
            // for this column.
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their sum; output columns get a
    /// `_sum` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their minimum; output columns get a
    /// `_min` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their maximum; output columns get a
    /// `_max` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to the first value per group; output
    /// columns get a `_first` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to the last value per group; output
    /// columns get a `_last` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to the number of unique values per
    /// group; output columns get a `_n_unique` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to the given quantile (must be within
    /// `0.0..=1.0`); output columns get a `_quantile_<q>` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be within 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their median; output columns get a
    /// `_median` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their variance with the given delta
    /// degrees of freedom; output columns get an `_agg_var` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Aggregate the grouped columns to their standard deviation with the
    /// given delta degrees of freedom; output columns get an `_agg_std`
    /// suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Count the rows per group (nulls included); output columns get a
    /// `_count` suffix.
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            // The count only depends on the group sizes, not on the column
            // values, so it is taken straight from the groups.
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new(cols)
    }

    /// Return the keys plus a `"groups"` list column holding the row indices
    /// of every group.
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new(cols)
    }

    /// Implode every group of the selected columns into a list; output
    /// columns get an `_agg_list` suffix.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn agg_list(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode);
            // SAFETY: groups were computed on `self.df`.
            let mut agg = unsafe { agg_col.agg_list(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    /// Build the frame that `apply`/`par_apply` operate on: the keys plus the
    /// selected columns, or the whole frame when nothing was selected.
    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_columns_impl(agg.as_slice())?;
                new_cols.extend(cols);
                // SAFETY: all columns stem from `self.df` and share its
                // height, so the no-checks constructor invariants hold.
                Ok(unsafe { DataFrame::new_no_checks(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

    /// Apply `f` to every group's sub-`DataFrame` in parallel and vertically
    /// concatenate the results.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                // SAFETY: the group indicator stems from groups computed on
                // this frame, so its rows are in bounds.
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.as_single_chunk_par();
        Ok(df)
    }

    /// Apply `f` to every group's sub-`DataFrame` and vertically concatenate
    /// the results.
    pub fn apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        self.apply_sliced(None, f)
    }

    /// Like [`GroupBy::apply`], but the concatenated result is sliced to
    /// `(offset, len)`; the loop stops early once enough rows have been
    /// produced.
    pub fn apply_sliced<F>(&self, slice: Option<(i64, usize)>, mut f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        // Upper bound of rows needed before the final slice can be taken.
        let max_height = if let Some((offset, len)) = slice {
            offset.try_into().unwrap_or(usize::MAX).saturating_add(len)
        } else {
            usize::MAX
        };
        let mut height = 0;
        let mut dfs = Vec::with_capacity(self.get_groups().len());
        for g in self.get_groups().iter() {
            // SAFETY: the group indicator stems from groups computed on this
            // frame, so its rows are in bounds.
            let sub_df = unsafe { take_df(&df, g) };
            let df = f(sub_df)?;
            height += df.height();
            dfs.push(df);

            // Enough rows for the requested slice; skip remaining groups.
            if height >= max_height {
                break;
            }
        }

        let mut df = accumulate_dataframes_vertical(dfs)?;
        if let Some((offset, len)) = slice {
            df = df.slice(offset, len);
        }
        Ok(df)
    }

    /// Restrict this `GroupBy` to an `(offset, length)` slice of its groups,
    /// slicing the keys accordingly. `None` is a no-op.
    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = self.groups.slice(offset, length);
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}
873
874unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
875 match g {
876 GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
877 GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
878 }
879}
880
/// The aggregation method applied in a group-by.
///
/// Used to derive output column names (see [`fmt_group_by_column`]) and a
/// display string for the aggregation.
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    FirstNonNull,
    Last,
    LastNonNull,
    // `allow_empty`: presumably whether an empty group is accepted — confirm
    // at the implementation site.
    Item { allow_empty: bool },
    Sum,
    /// The group row indices themselves (named `"groups"` in output).
    Groups,
    NUnique,
    /// Quantile value in `0.0..=1.0` plus the interpolation method.
    Quantile(f64, QuantileMethod),
    /// Row count; `include_nulls` controls whether null values are counted.
    Count { include_nulls: bool },
    Implode,
    /// Standard deviation with the given delta degrees of freedom (ddof).
    Std(u8),
    /// Variance with the given delta degrees of freedom (ddof).
    Var(u8),
}
903
904impl Display for GroupByMethod {
905 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
906 use GroupByMethod::*;
907 let s = match self {
908 Min => "min",
909 NanMin => "nan_min",
910 Max => "max",
911 NanMax => "nan_max",
912 Median => "median",
913 Mean => "mean",
914 First => "first",
915 FirstNonNull => "first_non_null",
916 Last => "last",
917 LastNonNull => "last_non_null",
918 Item { .. } => "item",
919 Sum => "sum",
920 Groups => "groups",
921 NUnique => "n_unique",
922 Quantile(_, _) => "quantile",
923 Count { .. } => "count",
924 Implode => "list",
925 Std(_) => "std",
926 Var(_) => "var",
927 };
928 write!(f, "{s}")
929 }
930}
931
932pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
934 use GroupByMethod::*;
935 match method {
936 Min => format_pl_smallstr!("{name}_min"),
937 Max => format_pl_smallstr!("{name}_max"),
938 NanMin => format_pl_smallstr!("{name}_nan_min"),
939 NanMax => format_pl_smallstr!("{name}_nan_max"),
940 Median => format_pl_smallstr!("{name}_median"),
941 Mean => format_pl_smallstr!("{name}_mean"),
942 First => format_pl_smallstr!("{name}_first"),
943 FirstNonNull => format_pl_smallstr!("{name}_first_non_null"),
944 Last => format_pl_smallstr!("{name}_last"),
945 LastNonNull => format_pl_smallstr!("{name}_last_non_null"),
946 Item { .. } => format_pl_smallstr!("{name}_item"),
947 Sum => format_pl_smallstr!("{name}_sum"),
948 Groups => PlSmallStr::from_static("groups"),
949 NUnique => format_pl_smallstr!("{name}_n_unique"),
950 Count { .. } => format_pl_smallstr!("{name}_count"),
951 Implode => format_pl_smallstr!("{name}_agg_list"),
952 Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
953 Std(_) => format_pl_smallstr!("{name}_agg_std"),
954 Var(_) => format_pl_smallstr!("{name}_agg_var"),
955 }
956}
957
#[cfg(test)]
mod test {
    use num_traits::FloatConst;

    use crate::prelude::*;

    // Basic eager group-by: count / mean / sum on a stable group-by, plus
    // n_unique via a hash group-by.
    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new(vec![s0, s1, s2]).unwrap();

        // Stable order: groups appear in first-occurrence order of "date".
        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        // Mean over two selected columns.
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        // Multiple keys still produce the suffixed aggregate column.
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        #[allow(deprecated)]
        // n_unique with no selection aggregates all non-key columns.
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    // Group-by with 12 key columns of various dtypes (exercises the
    // multi-key row-encoding path).
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df =
            DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    // Same as above, but the 13 key columns are built dynamically.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        // Every key column must survive the aggregation unchanged.
        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    // Float keys group correctly (equal floats hash to the same group).
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
            "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    // Mixed categorical + string keys in a stable group-by.
    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
            "ham" => ["a", "a", "b", "b", "c"],
            "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        // Cast one key column to a categorical dtype.
        df.apply("foo", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    // Null values in the aggregated column are skipped by mean.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    // Grouped variance / standard deviation with ddof = 1.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    // A null key value must form its own group without panicking.
    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}