1use std::fmt::{Debug, Display, Formatter};
2use std::hash::Hash;
3
4use num_traits::NumCast;
5use polars_compute::rolling::QuantileMethod;
6use polars_utils::format_pl_smallstr;
7use polars_utils::hashing::DirtyHash;
8use rayon::prelude::*;
9
10use self::hashing::*;
11use crate::POOL;
12use crate::prelude::*;
13use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};
14
15pub mod aggregations;
16pub(crate) mod hashing;
17mod into_groups;
18mod position;
19
20pub use into_groups::*;
21pub use position::*;
22
23use crate::chunked_array::ops::row_encode::{
24 encode_rows_unordered, encode_rows_vertical_par_unordered,
25};
26
impl DataFrame {
    /// Group the `DataFrame` by one or more already-materialized key columns.
    ///
    /// Length-1 keys are broadcast to the common height; every other key must
    /// match it exactly. Returns a [`GroupBy`] holding the computed group
    /// positions.
    ///
    /// # Errors
    /// - `ComputeError` when `by` is empty.
    /// - `ShapeMismatch` when a key is neither length 1 nor the common height.
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy<'_>> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );

        // With zero columns the frame height carries no information, so fall
        // back to the longest key as the height keys are broadcast to.
        let common_height = if self.width() > 0 {
            self.height()
        } else {
            by.iter().map(|s| s.len()).max().expect("at least 1 key")
        };
        for by_key in by.iter_mut() {
            if by_key.len() != common_height {
                polars_ensure!(
                    by_key.len() == 1,
                    ShapeMismatch: "series used as keys should have the same length as the DataFrame"
                );
                // Broadcast a unit-length key to the common height.
                *by_key = by_key.new_from_index(0, common_height)
            }
        }

        let groups = if by.len() == 1 {
            // Single key: group directly on the series.
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            // Object dtypes cannot be row-encoded; group via AnyValue rows.
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(self.height(), by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            #[cfg(not(feature = "object"))]
            {
                // `is_object()` can only return true when the "object"
                // feature is compiled in, so this branch is dead code.
                unreachable!()
            }
        } else {
            // Null-typed keys carry no grouping information; drop them.
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                // All keys were null-typed: every row falls into one group.
                let groups = if self.height() == 0 {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };

                Ok(GroupsType::new_slice(groups, false, true))
            } else {
                // Multiple keys: row-encode them into a single series and
                // group on that combined encoding.
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

    /// Group by the columns named in `by` (multithreaded, `sorted = false`,
    /// so group order is not guaranteed).
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

    /// Like [`DataFrame::group_by`], but passes `sorted = true` so the group
    /// order is stable across calls.
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy<'_>>
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let selected_keys = self.select_to_vec(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}
135
/// The result of a group-by operation: a view over the source frame together
/// with the computed group positions for the key columns.
#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    /// The frame the groups were computed on.
    pub df: &'a DataFrame,
    /// The (already broadcast) key columns used to build the groups.
    pub(crate) selected_keys: Vec<Column>,
    /// Row positions of each group within `df`.
    groups: GroupPositions,
    /// Columns selected for aggregation; `None` means "all non-key columns".
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}
194
impl<'a> GroupBy<'a> {
    /// Create a `GroupBy` from a frame, its key columns and precomputed group
    /// positions. `selected_agg` optionally pre-selects the columns that
    /// later aggregations operate on.
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

    /// Select the column(s) that should be aggregated. Replaces any previous
    /// selection.
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    /// Borrow the computed group positions.
    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

    /// Mutably borrow the group positions.
    ///
    /// # Safety
    /// The positions are fed unchecked into `take_*_unchecked` kernels, so
    /// the caller must keep every index valid for `df`; invalid positions
    /// lead to undefined behavior in later aggregations.
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    /// Consume `self` and return the group positions.
    pub fn into_groups(self) -> GroupPositions {
        self.groups
    }

    /// Materialize the key columns, one value per group (taken from each
    /// group's first row). `slice` optionally restricts the groups to
    /// `(offset, len)` before materializing.
    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        // Keeps the sliced groups alive while `groups` borrows from it.
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            // Gather the first row index of every group.
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice {
                            groups,
                            overlapping,
                            monotonic: _,
                        } => {
                            if *overlapping && !groups.is_empty() {
                                // Overlapping windows: return one contiguous
                                // slice spanning from the first group's start
                                // to the last group's end.
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            // NOTE(review): unlike the Idx branch, the sorted
                            // flag is propagated unconditionally here —
                            // presumably slice groups are always in order;
                            // confirm (`monotonic` is ignored above).
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

    /// Materialize all key columns (one row per group).
    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

    /// Split into (materialized keys, columns to aggregate): the explicit
    /// selection when set, otherwise every non-key column of `df`.
    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_to_vec(selection),
            None => {
                // No explicit selection: aggregate everything except the keys.
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .columns()
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_to_vec(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

    /// Aggregate the selected columns to their per-group mean, appended as
    /// `{name}_mean` columns next to the keys.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }

        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group sum (`{name}_sum`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group minimum
    /// (`{name}_min`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group maximum
    /// (`{name}_max`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to each group's first value
    /// (`{name}_first`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to each group's last value
    /// (`{name}_last`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group unique-value count
    /// (`{name}_n_unique`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to a per-group quantile
    /// (`{name}_quantile_{q:.2}`).
    ///
    /// # Errors
    /// `ComputeError` when `quantile` lies outside `[0.0, 1.0]`.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be within 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group median
    /// (`{name}_median`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group variance with the
    /// given delta degrees of freedom (`{name}_agg_var`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Aggregate the selected columns to their per-group standard deviation
    /// with the given delta degrees of freedom (`{name}_agg_std`).
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new_infer_height(cols)
    }

    /// Count the rows per group, emitting a `{name}_count` column for every
    /// selected column. Nulls are included in the count (the count is taken
    /// from the group sizes, not from the column values).
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new_infer_height(cols)
    }

    /// Return the keys plus a list column named `"groups"` holding the row
    /// indices of every group.
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new_infer_height(cols)
    }

    /// Build the frame handed to `apply` closures: the keys plus the selected
    /// aggregation columns, or the whole frame when nothing (or an empty
    /// list) was selected.
    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_to_vec(agg.as_slice())?;
                new_cols.extend(cols);
                Ok(unsafe { DataFrame::new_unchecked(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

    /// Apply `f` to every group sub-frame in parallel and vertically
    /// concatenate the results.
    ///
    /// # Errors
    /// `ComputeError` on an empty frame; otherwise whatever `f` returns.
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                // SAFETY-relevant: group indices were computed from this
                // frame, so the unchecked take stays in bounds.
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.rechunk_mut_par();
        Ok(df)
    }

    /// Apply `f` to every group sub-frame sequentially and vertically
    /// concatenate the results. Convenience wrapper over [`GroupBy::apply_sliced`]
    /// without a slice or schema.
    pub fn apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        self.apply_sliced(None, f, None)
    }

    /// Apply `f` to every group sub-frame, stopping early once enough rows
    /// have been produced to satisfy `slice`, then apply the slice to the
    /// concatenated result.
    ///
    /// On an empty frame, returns an empty frame with `schema` when given,
    /// otherwise errors.
    pub fn apply_sliced<F>(
        &self,
        slice: Option<(i64, usize)>,
        mut f: F,
        schema: Option<&SchemaRef>,
    ) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        if self.df.height() == 0 {
            if let Some(schema) = schema {
                return Ok(DataFrame::empty_with_arc_schema(schema.clone()));
            }

            polars_bail!(ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        }

        let df = self.prepare_apply()?;
        // Upper bound of rows needed before slicing; a negative offset fails
        // the conversion and disables the early-stop (bound = usize::MAX).
        let max_height = if let Some((offset, len)) = slice {
            offset.try_into().unwrap_or(usize::MAX).saturating_add(len)
        } else {
            usize::MAX
        };
        let mut height = 0;
        let mut dfs = Vec::with_capacity(self.get_groups().len());
        for g in self.get_groups().iter() {
            let sub_df = unsafe { take_df(&df, g) };
            let df = f(sub_df)?;
            height += df.height();
            dfs.push(df);

            // Enough rows accumulated to cover the requested slice.
            if height >= max_height {
                break;
            }
        }

        let mut df = accumulate_dataframes_vertical(dfs)?;
        if let Some((offset, len)) = slice {
            df = df.slice(offset, len);
        }
        Ok(df)
    }

    /// Restrict this `GroupBy` to a `(offset, length)` window of its groups,
    /// re-materializing the keys to match. `None` is a no-op.
    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = self.groups.slice(offset, length);
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}
851
/// Extract the rows of `df` belonging to a single group.
///
/// # Safety
/// For the `Idx` variant the indices are passed to `take_slice_unchecked`
/// without validation, so the caller must guarantee they are in bounds for
/// `df`.
unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
    match g {
        GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
        // Slice groups are contiguous, so a plain slice suffices.
        GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
    }
}
858
/// The aggregation applied in a group-by; also determines the suffix that
/// [`fmt_group_by_column`] appends when naming output columns.
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    FirstNonNull,
    Last,
    LastNonNull,
    Item { allow_empty: bool },
    Sum,
    Groups,
    NUnique,
    /// Quantile value in `[0.0, 1.0]` plus the interpolation method.
    Quantile(f64, QuantileMethod),
    Count { include_nulls: bool },
    Implode { maintain_order: bool },
    /// Standard deviation with the given delta degrees of freedom.
    Std(u8),
    /// Variance with the given delta degrees of freedom.
    Var(u8),
    ArgMin,
    ArgMax,
}
883
884impl Display for GroupByMethod {
885 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
886 use GroupByMethod::*;
887 let s = match self {
888 Min => "min",
889 NanMin => "nan_min",
890 Max => "max",
891 NanMax => "nan_max",
892 Median => "median",
893 Mean => "mean",
894 First => "first",
895 FirstNonNull => "first_non_null",
896 Last => "last",
897 LastNonNull => "last_non_null",
898 Item { .. } => "item",
899 Sum => "sum",
900 Groups => "groups",
901 NUnique => "n_unique",
902 Quantile(_, _) => "quantile",
903 Count { .. } => "count",
904 Implode { .. } => "implode",
905 Std(_) => "std",
906 Var(_) => "var",
907 ArgMin => "arg_min",
908 ArgMax => "arg_max",
909 };
910 write!(f, "{s}")
911 }
912}
913
914pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
916 use GroupByMethod::*;
917 match method {
918 Min => format_pl_smallstr!("{name}_min"),
919 Max => format_pl_smallstr!("{name}_max"),
920 NanMin => format_pl_smallstr!("{name}_nan_min"),
921 NanMax => format_pl_smallstr!("{name}_nan_max"),
922 Median => format_pl_smallstr!("{name}_median"),
923 Mean => format_pl_smallstr!("{name}_mean"),
924 First => format_pl_smallstr!("{name}_first"),
925 FirstNonNull => format_pl_smallstr!("{name}_first_non_null"),
926 Last => format_pl_smallstr!("{name}_last"),
927 LastNonNull => format_pl_smallstr!("{name}_last_non_null"),
928 Item { .. } => format_pl_smallstr!("{name}_item"),
929 Sum => format_pl_smallstr!("{name}_sum"),
930 Groups => PlSmallStr::from_static("groups"),
931 NUnique => format_pl_smallstr!("{name}_n_unique"),
932 Count { .. } => format_pl_smallstr!("{name}_count"),
933 Implode { .. } => format_pl_smallstr!("{name}_agg_list"),
934 Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
935 Std(_) => format_pl_smallstr!("{name}_agg_std"),
936 Var(_) => format_pl_smallstr!("{name}_agg_var"),
937 ArgMin => format_pl_smallstr!("{name}_arg_min"),
938 ArgMax => format_pl_smallstr!("{name}_arg_max"),
939 }
940}
941
#[cfg(test)]
mod test {
    // Eager group-by smoke tests covering string/float/categorical keys,
    // null handling, many-key grouping, and the deprecated aggregations.
    use num_traits::FloatConst;

    use crate::prelude::*;

    // count / mean / sum on a string-keyed frame; also checks n_unique width.
    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new_infer_height(vec![s0, s1, s2]).unwrap();

        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        // n_unique with no selection aggregates both non-key columns.
        #[allow(deprecated)]
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    // Grouping by twelve key columns exercises the row-encoding path.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df = DataFrame::new_infer_height(vec![
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
        ])
        .unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    // Same as above but with thirteen dynamically-built key columns.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new_infer_height(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    // Float keys must group equal values together.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
                    "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    // Mixed categorical + string keys on the multi-key path.
    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
                    "ham" => ["a", "a", "b", "b", "c"],
                    "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        df.apply("foo", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    // Null values in the aggregated column must be skipped by mean.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    // var/std with ddof = 1 (sample statistics).
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    // A categorical key containing nulls must not panic during group-by.
    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::from_categories(Categories::global()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}