use std::fmt::{Debug, Display, Formatter};
use std::hash::Hash;

use num_traits::NumCast;
use polars_compute::rolling::QuantileMethod;
use polars_utils::format_pl_smallstr;
use polars_utils::hashing::DirtyHash;
use rayon::prelude::*;

use self::hashing::*;
use crate::POOL;
use crate::prelude::*;
use crate::utils::{_set_partition_size, accumulate_dataframes_vertical};

pub mod aggregations;
pub mod expr;
pub(crate) mod hashing;
mod into_groups;
mod perfect;
mod position;

pub use into_groups::*;
pub use position::*;

use crate::chunked_array::ops::row_encode::{
    encode_rows_unordered, encode_rows_vertical_par_unordered,
};

impl DataFrame {
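    /// Group this `DataFrame` by one or more pre-selected key columns.
    /// Length-1 keys are broadcast to the frame's height; `sorted` requests
    /// that the groups come back in sorted order.
    ///
    /// A minimal usage sketch (the column name is a placeholder):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn manual_group_by(df: &DataFrame) -> PolarsResult<GroupBy<'_>> {
    ///     let keys = df.select_columns(["key_column"])?;
    ///     df.group_by_with_series(keys, true, false)
    /// }
    /// ```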
    pub fn group_by_with_series(
        &self,
        mut by: Vec<Column>,
        multithreaded: bool,
        sorted: bool,
    ) -> PolarsResult<GroupBy> {
        polars_ensure!(
            !by.is_empty(),
            ComputeError: "at least one key is required in a group_by operation"
        );
        let minimal_by_len = by.iter().map(|s| s.len()).min().expect("at least 1 key");
        let df_height = self.height();

        if (minimal_by_len != df_height) && (self.width() > 0) {
            polars_ensure!(
                minimal_by_len == 1,
                ShapeMismatch: "series used as keys should have the same length as the DataFrame"
            );
            for by_key in by.iter_mut() {
                if by_key.len() == minimal_by_len {
                    *by_key = by_key.new_from_index(0, df_height)
                }
            }
        };

        let groups = if by.len() == 1 {
            let column = &by[0];
            column
                .as_materialized_series()
                .group_tuples(multithreaded, sorted)
        } else if by.iter().any(|s| s.dtype().is_object()) {
            #[cfg(feature = "object")]
            {
                let mut df = DataFrame::new(by.clone()).unwrap();
                let n = df.height();
                let rows = df.to_av_rows();
                let iter = (0..n).map(|i| rows.get(i));
                Ok(group_by(iter, sorted))
            }
            #[cfg(not(feature = "object"))]
            {
                unreachable!()
            }
        } else {
            let by = by
                .iter()
                .filter(|s| !s.dtype().is_null())
                .cloned()
                .collect::<Vec<_>>();
            if by.is_empty() {
                let groups = if self.is_empty() {
                    vec![]
                } else {
                    vec![[0, self.height() as IdxSize]]
                };
                Ok(GroupsType::Slice {
                    groups,
                    rolling: false,
                })
            } else {
                let rows = if multithreaded {
                    encode_rows_vertical_par_unordered(&by)
                } else {
                    encode_rows_unordered(&by)
                }?
                .into_series();
                rows.group_tuples(multithreaded, sorted)
            }
        };
        Ok(GroupBy::new(self, by, groups?.into_sliceable(), None))
    }

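    /// Group the `DataFrame` by the given column names and return a [`GroupBy`]
    /// helper on which an aggregation can be selected.
    ///
    /// A minimal sketch (column names are placeholders):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn counts_per_key(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["key_column"])?
    ///         .select(["value_column"])
    ///         .count()
    /// }
    /// ```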
    pub fn group_by<I, S>(&self, by: I) -> PolarsResult<GroupBy>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, false)
    }

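    /// The stable variant of [`DataFrame::group_by`]: groups are returned in
    /// the order in which they first occur in the data.
    ///
    /// Sketch, reusing the "date"/"temp" columns from the tests below:
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn counts_in_first_seen_order(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by_stable(["date"])?.select(["temp"]).count()
    /// }
    /// ```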
    pub fn group_by_stable<I, S>(&self, by: I) -> PolarsResult<GroupBy>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let selected_keys = self.select_columns(by)?;
        self.group_by_with_series(selected_keys, true, true)
    }
}

#[derive(Debug, Clone)]
pub struct GroupBy<'a> {
    pub df: &'a DataFrame,
    pub(crate) selected_keys: Vec<Column>,
    groups: GroupPositions,
    pub(crate) selected_agg: Option<Vec<PlSmallStr>>,
}

impl<'a> GroupBy<'a> {
    pub fn new(
        df: &'a DataFrame,
        by: Vec<Column>,
        groups: GroupPositions,
        selected_agg: Option<Vec<PlSmallStr>>,
    ) -> Self {
        GroupBy {
            df,
            selected_keys: by,
            groups,
            selected_agg,
        }
    }

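    /// Restrict the aggregation to the given columns. If `select` is never
    /// called, all non-key columns are aggregated.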
    #[must_use]
    pub fn select<I: IntoIterator<Item = S>, S: Into<PlSmallStr>>(mut self, selection: I) -> Self {
        self.selected_agg = Some(selection.into_iter().map(|s| s.into()).collect());
        self
    }

    pub fn get_groups(&self) -> &GroupPositions {
        &self.groups
    }

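    /// Get a mutable reference to the group positions.
    ///
    /// # Safety
    ///
    /// The caller must keep the group indices valid for the underlying
    /// `DataFrame`; out-of-bounds indices lead to undefined behaviour in the
    /// unchecked take/slice paths that later consume them.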
    pub unsafe fn get_groups_mut(&mut self) -> &mut GroupPositions {
        &mut self.groups
    }

    pub fn take_groups(self) -> GroupPositions {
        self.groups
    }

    pub fn take_groups_mut(&mut self) -> GroupPositions {
        std::mem::take(&mut self.groups)
    }

    pub fn keys_sliced(&self, slice: Option<(i64, usize)>) -> Vec<Column> {
        #[allow(unused_assignments)]
        let mut groups_owned = None;

        let groups = if let Some((offset, len)) = slice {
            groups_owned = Some(self.groups.slice(offset, len));
            groups_owned.as_deref().unwrap()
        } else {
            &self.groups
        };
        POOL.install(|| {
            self.selected_keys
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| {
                    match groups {
                        GroupsType::Idx(groups) => {
                            let mut out = unsafe { s.take_slice_unchecked(groups.first()) };
                            if groups.sorted {
                                out.set_sorted_flag(s.is_sorted_flag());
                            };
                            out
                        },
                        GroupsType::Slice { groups, rolling } => {
                            if *rolling && !groups.is_empty() {
                                let offset = groups[0][0];
                                let [upper_offset, upper_len] = groups[groups.len() - 1];
                                return s.slice(
                                    offset as i64,
                                    ((upper_offset + upper_len) - offset) as usize,
                                );
                            }

                            let indices = groups
                                .iter()
                                .map(|&[first, _len]| first)
                                .collect_ca(PlSmallStr::EMPTY);
                            let mut out = unsafe { s.take_unchecked(&indices) };
                            out.set_sorted_flag(s.is_sorted_flag());
                            out
                        },
                    }
                })
                .map(Column::from)
                .collect()
        })
    }

    pub fn keys(&self) -> Vec<Column> {
        self.keys_sliced(None)
    }

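    /// Collect the key columns plus the columns to aggregate: the explicit
    /// `selected_agg` selection if one was made, otherwise every non-key
    /// column of the `DataFrame`.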
    fn prepare_agg(&self) -> PolarsResult<(Vec<Column>, Vec<Column>)> {
        let keys = self.keys();

        let agg_col = match &self.selected_agg {
            Some(selection) => self.df.select_columns_impl(selection.as_slice()),
            None => {
                let by: Vec<_> = self.selected_keys.iter().map(|s| s.name()).collect();
                let selection = self
                    .df
                    .iter()
                    .map(|s| s.name())
                    .filter(|a| !by.contains(a))
                    .cloned()
                    .collect::<Vec<_>>();

                self.df.select_columns_impl(selection.as_slice())
            },
        }?;

        Ok((keys, agg_col))
    }

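    /// Aggregate the grouped columns by taking the mean per group; the output
    /// column is named `{name}_mean` (see [`fmt_group_by_column`]).
    ///
    /// Sketch, reusing the "date"/"temp" columns from the tests below:
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// #[allow(deprecated)]
    /// fn mean_temp_per_date(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by_stable(["date"])?.select(["temp"]).mean()
    /// }
    /// ```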
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn mean(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Mean);
            let mut agg = unsafe { agg_col.agg_mean(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn sum(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Sum);
            let mut agg = unsafe { agg_col.agg_sum(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn min(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Min);
            let mut agg = unsafe { agg_col.agg_min(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn max(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Max);
            let mut agg = unsafe { agg_col.agg_max(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn first(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::First);
            let mut agg = unsafe { agg_col.agg_first(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn last(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Last);
            let mut agg = unsafe { agg_col.agg_last(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn n_unique(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::NUnique);
            let mut agg = unsafe { agg_col.agg_n_unique(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

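    /// Aggregate the grouped columns with the given quantile; `quantile` must
    /// lie within `[0.0, 1.0]`.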
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn quantile(&self, quantile: f64, method: QuantileMethod) -> PolarsResult<DataFrame> {
        polars_ensure!(
            (0.0..=1.0).contains(&quantile),
            ComputeError: "`quantile` should be between 0.0 and 1.0"
        );
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Quantile(quantile, method),
            );
            let mut agg = unsafe { agg_col.agg_quantile(&self.groups, quantile, method) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn median(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Median);
            let mut agg = unsafe { agg_col.agg_median(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn var(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Var(ddof));
            let mut agg = unsafe { agg_col.agg_var(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn std(&self, ddof: u8) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Std(ddof));
            let mut agg = unsafe { agg_col.agg_std(&self.groups, ddof) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

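    /// Count the rows per group (nulls included); the output column is named
    /// `{name}_count`.
    ///
    /// Sketch (the key name is a placeholder):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn rows_per_group(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // Without `select`, every non-key column gets a `{name}_count` column.
    ///     df.group_by(["key_column"])?.count()
    /// }
    /// ```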
    pub fn count(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;

        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(
                agg_col.name().as_str(),
                GroupByMethod::Count {
                    include_nulls: true,
                },
            );
            let mut ca = self.groups.group_count();
            ca.rename(new_name);
            cols.push(ca.into_column());
        }
        DataFrame::new(cols)
    }

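    /// Return the group keys together with a `"groups"` list column that holds
    /// the row indices belonging to each group.
    ///
    /// Sketch (the key name is a placeholder):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn group_indices(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["key_column"])?.groups()
    /// }
    /// ```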
    pub fn groups(&self) -> PolarsResult<DataFrame> {
        let mut cols = self.keys();
        let mut column = self.groups.as_list_chunked();
        let new_name = fmt_group_by_column("", GroupByMethod::Groups);
        column.rename(new_name);
        cols.push(column.into_column());
        DataFrame::new(cols)
    }

    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn agg_list(&self) -> PolarsResult<DataFrame> {
        let (mut cols, agg_cols) = self.prepare_agg()?;
        for agg_col in agg_cols {
            let new_name = fmt_group_by_column(agg_col.name().as_str(), GroupByMethod::Implode);
            let mut agg = unsafe { agg_col.agg_list(&self.groups) };
            agg.rename(new_name);
            cols.push(agg);
        }
        DataFrame::new(cols)
    }

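    /// Build the `DataFrame` that `apply`/`par_apply` split into per-group
    /// sub-frames: the key columns plus the selected columns, or the full
    /// `DataFrame` when nothing was selected.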
    fn prepare_apply(&self) -> PolarsResult<DataFrame> {
        polars_ensure!(self.df.height() > 0, ComputeError: "cannot group_by + apply on empty 'DataFrame'");
        if let Some(agg) = &self.selected_agg {
            if agg.is_empty() {
                Ok(self.df.clone())
            } else {
                let mut new_cols = Vec::with_capacity(self.selected_keys.len() + agg.len());
                new_cols.extend_from_slice(&self.selected_keys);
                let cols = self.df.select_columns_impl(agg.as_slice())?;
                new_cols.extend(cols);
                Ok(unsafe { DataFrame::new_no_checks(self.df.height(), new_cols) })
            }
        } else {
            Ok(self.df.clone())
        }
    }

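    /// The parallel counterpart of [`GroupBy::apply`]: run the closure over the
    /// groups in parallel and vertically concatenate the results.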
    #[deprecated(since = "0.24.1", note = "use polars.lazy aggregations")]
    pub fn par_apply<F>(&self, f: F) -> PolarsResult<DataFrame>
    where
        F: Fn(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .par_iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.as_single_chunk_par();
        Ok(df)
    }

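    /// Apply a closure to each group as its own `DataFrame` and vertically
    /// concatenate the results.
    ///
    /// Sketch that keeps only the first row of every group (the key name is a
    /// placeholder):
    ///
    /// ```ignore
    /// use polars_core::prelude::*;
    ///
    /// fn first_row_per_group(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     df.group_by(["key_column"])?.apply(|sub| Ok(sub.head(Some(1))))
    /// }
    /// ```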
    pub fn apply<F>(&self, mut f: F) -> PolarsResult<DataFrame>
    where
        F: FnMut(DataFrame) -> PolarsResult<DataFrame> + Send + Sync,
    {
        let df = self.prepare_apply()?;
        let dfs = self
            .get_groups()
            .iter()
            .map(|g| {
                let sub_df = unsafe { take_df(&df, g) };
                f(sub_df)
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        let mut df = accumulate_dataframes_vertical(dfs)?;
        df.as_single_chunk_par();
        Ok(df)
    }

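    /// Restrict this `GroupBy` to a slice `(offset, length)` of its groups;
    /// `None` returns it unchanged.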
    pub fn sliced(mut self, slice: Option<(i64, usize)>) -> Self {
        match slice {
            None => self,
            Some((offset, length)) => {
                self.groups = (self.groups.slice(offset, length)).clone();
                self.selected_keys = self.keys_sliced(slice);
                self
            },
        }
    }
}

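/// Materialize a single group as its own `DataFrame`.
///
/// # Safety
///
/// The group indices/offsets must be in bounds for `df`.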
unsafe fn take_df(df: &DataFrame, g: GroupsIndicator) -> DataFrame {
    match g {
        GroupsIndicator::Idx(idx) => df.take_slice_unchecked(idx.1),
        GroupsIndicator::Slice([first, len]) => df.slice(first as i64, len as usize),
    }
}

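/// The aggregation applied per group. The variant also determines the suffix
/// of the output column name (see [`fmt_group_by_column`]).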
#[derive(Copy, Clone, Debug)]
pub enum GroupByMethod {
    Min,
    NanMin,
    Max,
    NanMax,
    Median,
    Mean,
    First,
    Last,
    Sum,
    Groups,
    NUnique,
    Quantile(f64, QuantileMethod),
    Count { include_nulls: bool },
    Implode,
    Std(u8),
    Var(u8),
}

impl Display for GroupByMethod {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        use GroupByMethod::*;
        let s = match self {
            Min => "min",
            NanMin => "nan_min",
            Max => "max",
            NanMax => "nan_max",
            Median => "median",
            Mean => "mean",
            First => "first",
            Last => "last",
            Sum => "sum",
            Groups => "groups",
            NUnique => "n_unique",
            Quantile(_, _) => "quantile",
            Count { .. } => "count",
            Implode => "list",
            Std(_) => "std",
            Var(_) => "var",
        };
        write!(f, "{s}")
    }
}

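/// Derive the output column name for an aggregated column, e.g. a column named
/// `"temp"` aggregated with [`GroupByMethod::Sum`] becomes `"temp_sum"`.
///
/// Sketch (assuming the function is in scope; it is public in this module):
///
/// ```ignore
/// use polars_core::prelude::*;
///
/// let name = fmt_group_by_column("temp", GroupByMethod::Sum);
/// assert_eq!(name.as_str(), "temp_sum");
/// ```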
pub fn fmt_group_by_column(name: &str, method: GroupByMethod) -> PlSmallStr {
    use GroupByMethod::*;
    match method {
        Min => format_pl_smallstr!("{name}_min"),
        Max => format_pl_smallstr!("{name}_max"),
        NanMin => format_pl_smallstr!("{name}_nan_min"),
        NanMax => format_pl_smallstr!("{name}_nan_max"),
        Median => format_pl_smallstr!("{name}_median"),
        Mean => format_pl_smallstr!("{name}_mean"),
        First => format_pl_smallstr!("{name}_first"),
        Last => format_pl_smallstr!("{name}_last"),
        Sum => format_pl_smallstr!("{name}_sum"),
        Groups => PlSmallStr::from_static("groups"),
        NUnique => format_pl_smallstr!("{name}_n_unique"),
        Count { .. } => format_pl_smallstr!("{name}_count"),
        Implode => format_pl_smallstr!("{name}_agg_list"),
        Quantile(quantile, _interpol) => format_pl_smallstr!("{name}_quantile_{quantile:.2}"),
        Std(_) => format_pl_smallstr!("{name}_agg_std"),
        Var(_) => format_pl_smallstr!("{name}_agg_var"),
    }
}

#[cfg(test)]
mod test {
    use num_traits::FloatConst;

    use crate::prelude::*;

    #[test]
    #[cfg(feature = "dtype-date")]
    #[cfg_attr(miri, ignore)]
    fn test_group_by() -> PolarsResult<()> {
        let s0 = Column::new(
            PlSmallStr::from_static("date"),
            &[
                "2020-08-21",
                "2020-08-21",
                "2020-08-22",
                "2020-08-23",
                "2020-08-22",
            ],
        );
        let s1 = Column::new(PlSmallStr::from_static("temp"), [20, 10, 7, 9, 1]);
        let s2 = Column::new(PlSmallStr::from_static("rain"), [0.2, 0.1, 0.3, 0.1, 0.01]);
        let df = DataFrame::new(vec![s0, s1, s2]).unwrap();

        let out = df.group_by_stable(["date"])?.select(["temp"]).count()?;
        assert_eq!(
            out.column("temp_count")?,
            &Column::new(PlSmallStr::from_static("temp_count"), [2 as IdxSize, 2, 1])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date"])?
            .select(["temp", "rain"])
            .mean()?;
        assert_eq!(
            out.column("temp_mean")?,
            &Column::new(PlSmallStr::from_static("temp_mean"), [15.0f64, 4.0, 9.0])
        );

        #[allow(deprecated)]
        let out = df
            .group_by_stable(["date", "temp"])?
            .select(["rain"])
            .mean()?;
        assert!(out.column("rain_mean").is_ok());

        #[allow(deprecated)]
        let out = df.group_by_stable(["date"])?.select(["temp"]).sum()?;
        assert_eq!(
            out.column("temp_sum")?,
            &Column::new(PlSmallStr::from_static("temp_sum"), [30, 8, 9])
        );

        #[allow(deprecated)]
        let gb = df.group_by(["date"]).unwrap().n_unique().unwrap();
        assert_eq!(gb.width(), 3);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_static_group_by_by_12_columns() {
        let s0 = Column::new("G1".into(), ["A", "A", "B", "B", "C"].as_ref());
        let s1 = Column::new("N".into(), [1, 2, 2, 4, 2].as_ref());
        let s2 = Column::new("G2".into(), ["k", "l", "m", "m", "l"].as_ref());
        let s3 = Column::new("G3".into(), ["a", "b", "c", "c", "d"].as_ref());
        let s4 = Column::new("G4".into(), ["1", "2", "3", "3", "4"].as_ref());
        let s5 = Column::new("G5".into(), ["X", "Y", "Z", "Z", "W"].as_ref());
        let s6 = Column::new("G6".into(), [false, true, true, true, false].as_ref());
        let s7 = Column::new("G7".into(), ["r", "x", "q", "q", "o"].as_ref());
        let s8 = Column::new("G8".into(), ["R", "X", "Q", "Q", "O"].as_ref());
        let s9 = Column::new("G9".into(), [1, 2, 3, 3, 4].as_ref());
        let s10 = Column::new("G10".into(), [".", "!", "?", "?", "/"].as_ref());
        let s11 = Column::new("G11".into(), ["(", ")", "@", "@", "$"].as_ref());
        let s12 = Column::new("G12".into(), ["-", "_", ";", ";", ","].as_ref());

        let df =
            DataFrame::new(vec![s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12]).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by([
                "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12",
            ])
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(1), Some(2), Some(2), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_dynamic_group_by_by_13_columns() {
        let series_content = ["A", "A", "B", "B", "C"];

        let series_names = [
            "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", "G11", "G12", "G13",
        ];

        let mut columns = Vec::with_capacity(14);

        for series_name in series_names {
            let group_columns = Column::new(series_name.into(), series_content.as_ref());
            columns.push(group_columns);
        }

        let agg_series = Column::new("N".into(), [1, 2, 3, 3, 4].as_ref());
        columns.push(agg_series);

        let df = DataFrame::new(columns).unwrap();

        #[allow(deprecated)]
        let adf = df
            .group_by(series_names)
            .unwrap()
            .select(["N"])
            .sum()
            .unwrap();

        for series_name in &series_names {
            assert_eq!(
                Vec::from(&adf.column(series_name).unwrap().str().unwrap().sort(false)),
                &[Some("A"), Some("B"), Some("C")]
            );
        }

        assert_eq!(
            Vec::from(&adf.column("N_sum").unwrap().i32().unwrap().sort(false)),
            &[Some(3), Some(4), Some(6)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_floats() {
        let df = df! {"flt" => [1., 1., 2., 2., 3.],
            "val" => [1, 1, 1, 1, 1]
        }
        .unwrap();
        #[allow(deprecated)]
        let res = df.group_by(["flt"]).unwrap().sum().unwrap();
        let res = res.sort(["flt"], SortMultipleOptions::default()).unwrap();
        assert_eq!(
            Vec::from(res.column("val_sum").unwrap().i32().unwrap()),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_categorical() {
        let mut df = df! {"foo" => ["a", "a", "b", "b", "c"],
            "ham" => ["a", "a", "b", "b", "c"],
            "bar" => [1, 1, 1, 1, 1]
        }
        .unwrap();

        df.apply("foo", |s| {
            s.cast(&DataType::Categorical(None, Default::default()))
                .unwrap()
        })
        .unwrap();

        #[allow(deprecated)]
        let res = df
            .group_by_stable(["foo", "ham"])
            .unwrap()
            .select(["bar"])
            .sum()
            .unwrap();

        assert_eq!(
            Vec::from(
                res.column("bar_sum")
                    .unwrap()
                    .as_materialized_series()
                    .i32()
                    .unwrap()
            ),
            &[Some(2), Some(2), Some(1)]
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_null_handling() -> PolarsResult<()> {
        let df = df!(
            "a" => ["a", "a", "a", "b", "b"],
            "b" => [Some(1), Some(2), None, None, Some(1)]
        )?;
        #[allow(deprecated)]
        let out = df.group_by_stable(["a"])?.mean()?;

        assert_eq!(
            Vec::from(out.column("b_mean")?.as_materialized_series().f64()?),
            &[Some(1.5), Some(1.0)]
        );
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_group_by_var() -> PolarsResult<()> {
        let df = df![
            "g" => ["foo", "foo", "bar"],
            "flt" => [1.0, 2.0, 3.0],
            "int" => [1, 2, 3]
        ]?;

        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).var(1)?;

        assert_eq!(out.column("int_agg_var")?.f64()?.get(0), Some(0.5));
        #[allow(deprecated)]
        let out = df.group_by_stable(["g"])?.select(["int"]).std(1)?;
        let val = out.column("int_agg_std")?.f64()?.get(0).unwrap();
        let expected = f64::FRAC_1_SQRT_2();
        assert!((val - expected).abs() < 0.000001);
        Ok(())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    #[cfg(feature = "dtype-categorical")]
    fn test_group_by_null_group() -> PolarsResult<()> {
        let mut df = df![
            "g" => [Some("foo"), Some("foo"), Some("bar"), None, None],
            "flt" => [1.0, 2.0, 3.0, 1.0, 1.0],
            "int" => [1, 2, 3, 1, 1]
        ]?;

        df.try_apply("g", |s| {
            s.cast(&DataType::Categorical(None, Default::default()))
        })?;

        #[allow(deprecated)]
        let _ = df.group_by(["g"])?.sum()?;
        Ok(())
    }
}