use polars_error::{polars_ensure, polars_err, PolarsResult};
use polars_utils::aliases::PlHashSet;
use super::Column;
use crate::datatypes::AnyValue;
use crate::frame::DataFrame;
use crate::prelude::PlSmallStr;
fn check_hstack(
col: &Column,
names: &mut PlHashSet<PlSmallStr>,
height: usize,
is_empty: bool,
) -> PolarsResult<()> {
polars_ensure!(
col.len() == height || is_empty,
ShapeMismatch: "unable to hstack Series of length {} and DataFrame of height {}",
col.len(), height,
);
polars_ensure!(
names.insert(col.name().clone()),
Duplicate: "unable to hstack, column with name {:?} already exists", col.name().as_str(),
);
Ok(())
}
impl DataFrame {
    /// Appends clones of `columns` to this frame without any validation.
    ///
    /// # Safety
    /// No length or name checks are performed here. The caller must uphold the conditions
    /// that [`DataFrame::hstack_mut`] verifies before delegating to this method: every
    /// column's length matches the frame's height (or the frame is empty), and no column
    /// name collides with an existing column or with another column in `columns`.
    pub unsafe fn hstack_mut_unchecked(&mut self, columns: &[Column]) -> &mut Self {
        self.columns.extend(columns.iter().cloned());
        self
    }

    /// Appends `columns` to this frame after validating them.
    ///
    /// All columns are validated before the frame is touched, so `self` is left
    /// unmodified when an error is returned.
    ///
    /// # Errors
    /// - `ShapeMismatch` if a column's length differs from `self.height()` while
    ///   `self.is_empty()` is `false`.
    /// - `Duplicate` if a column's name already exists in the frame or appears more than
    ///   once in `columns`.
    pub fn hstack_mut(&mut self, columns: &[Column]) -> PolarsResult<&mut Self> {
        let height = self.height();
        let is_empty = self.is_empty();

        {
            // Seed the set with the existing names; `check_hstack` adds each new one.
            let mut seen = self
                .columns
                .iter()
                .map(|existing| existing.name().clone())
                .collect::<PlHashSet<_>>();

            columns
                .iter()
                .try_for_each(|col| check_hstack(col, &mut seen, height, is_empty))?;
        }

        // SAFETY: every column passed `check_hstack` above (length and name uniqueness).
        let stacked = unsafe { self.hstack_mut_unchecked(columns) };
        Ok(stacked)
    }
}
pub fn concat_df_horizontal(dfs: &[DataFrame], check_duplicates: bool) -> PolarsResult<DataFrame> {
let max_len = dfs
.iter()
.map(|df| df.height())
.max()
.ok_or_else(|| polars_err!(ComputeError: "cannot concat empty dataframes"))?;
let owned_df;
let dfs = if !dfs.iter().all(|df| df.height() == max_len) {
owned_df = dfs
.iter()
.cloned()
.map(|mut df| {
if df.height() != max_len {
let diff = max_len - df.height();
df.columns.iter_mut().for_each(|s| {
let s = s.into_materialized_series();
*s = s.extend_constant(AnyValue::Null, diff).unwrap()
});
}
df
})
.collect::<Vec<_>>();
owned_df.as_slice()
} else {
dfs
};
let mut first_df = dfs[0].clone();
let height = first_df.height();
let is_empty = first_df.is_empty();
let mut names = if check_duplicates {
first_df
.columns
.iter()
.map(|s| s.name().clone())
.collect::<PlHashSet<_>>()
} else {
Default::default()
};
for df in &dfs[1..] {
let cols = df.get_columns();
if check_duplicates {
for col in cols {
check_hstack(col, &mut names, height, is_empty)?;
}
}
unsafe { first_df.hstack_mut_unchecked(cols) };
}
Ok(first_df)
}