polars_core/frame/
mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
/// Strategy describing which row is kept among duplicated rows; see the
/// individual variants.
///
/// The `IntoStaticStr` derive combined with `serialize_all = "snake_case"`
/// renders the variant names in snake_case.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations. This is the default variant.
    #[default]
    Any,
}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68    F: for<'a> FnMut(&'a T) -> &'a str,
69{
70    // Always unique.
71    if items.len() <= 1 {
72        return Ok(());
73    }
74
75    if items.len() <= 4 {
76        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
77        for i in 0..items.len() - 1 {
78            let name = get_name(&items[i]);
79            for other in items.iter().skip(i + 1) {
80                if name == get_name(other) {
81                    polars_bail!(duplicate = name);
82                }
83            }
84        }
85    } else {
86        let mut names = PlHashSet::with_capacity(items.len());
87        for item in items {
88            let name = get_name(item);
89            if !names.insert(name) {
90                polars_bail!(duplicate = name);
91            }
92        }
93    }
94    Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*;      if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
/// ## Wrapping a `Vec<Column>`
///
/// A `DataFrame` is built upon a `Vec<Column>` where all the `Column`s have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138///                                       "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148/// The `Index<usize>` is implemented for the `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153///              "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165///              "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
#[derive(Clone)]
pub struct DataFrame {
    /// Number of rows. Stored separately from the columns, so a frame without
    /// columns can still carry a height.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A lazily initialized, cached schema. This might not give correct results if the
    /// DataFrame was modified in place between caching the schema and reading it;
    /// `clear_schema` resets the cache.
    cached_schema: OnceLock<SchemaRef>,
}
181
182impl DataFrame {
183    pub fn clear_schema(&mut self) {
184        self.cached_schema = OnceLock::new();
185    }
186
187    #[inline]
188    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189        self.columns.iter()
190    }
191
192    #[inline]
193    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194        self.columns.iter().map(Column::as_materialized_series)
195    }
196
197    #[inline]
198    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199        self.columns.par_iter().map(Column::as_materialized_series)
200    }
201
202    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203    ///
204    /// # Implementation
205    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
206    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208    ///
209    /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
210    /// However, this function will yield a smaller number. This is because this function returns
211    /// the visible size of the buffer, not its total capacity.
212    ///
213    /// FFI buffers are included in this estimation.
214    pub fn estimated_size(&self) -> usize {
215        self.columns.iter().map(Column::estimated_size).sum()
216    }
217
218    // Reduce monomorphization.
219    fn try_apply_columns(
220        &self,
221        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222    ) -> PolarsResult<Vec<Column>> {
223        self.columns.iter().map(func).collect()
224    }
225    // Reduce monomorphization.
226    pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227        self.columns.iter().map(func).collect()
228    }
229    // Reduce monomorphization.
230    fn try_apply_columns_par(
231        &self,
232        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233    ) -> PolarsResult<Vec<Column>> {
234        POOL.install(|| self.columns.par_iter().map(func).collect())
235    }
236    // Reduce monomorphization.
237    pub fn _apply_columns_par(
238        &self,
239        func: &(dyn Fn(&Column) -> Column + Send + Sync),
240    ) -> Vec<Column> {
241        POOL.install(|| self.columns.par_iter().map(func).collect())
242    }
243
244    /// Get the index of the column.
245    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246        self.get_column_index(name)
247            .ok_or_else(|| polars_err!(col_not_found = name))
248    }
249
250    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251        polars_ensure!(
252            self.columns.iter().all(|s| s.name().as_str() != name),
253            Duplicate: "column with name {:?} is already present in the DataFrame", name
254        );
255        Ok(())
256    }
257
258    /// Reserve additional slots into the chunks of the series.
259    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260        for s in &mut self.columns {
261            if let Column::Series(s) = s {
262                // SAFETY:
263                // do not modify the data, simply resize.
264                unsafe { s.chunks_mut().reserve(additional) }
265            }
266        }
267    }
268
269    /// Create a DataFrame from a Vector of Series.
270    ///
271    /// Errors if a column names are not unique, or if heights are not all equal.
272    ///
273    /// # Example
274    ///
275    /// ```
276    /// # use polars_core::prelude::*;
277    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279    ///
280    /// let df = DataFrame::new(vec![s0, s1])?;
281    /// # Ok::<(), PolarsError>(())
282    /// ```
283    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284        DataFrame::validate_columns_slice(&columns)
285            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287    }
288
289    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290        for col in &columns {
291            polars_ensure!(
292                col.len() == height,
293                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294                columns[0].name(), height, col.name(), col.len()
295            );
296        }
297
298        Ok(DataFrame {
299            height,
300            columns,
301            cached_schema: OnceLock::new(),
302        })
303    }
304
305    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306    /// columns to match the other columns.
307    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308        // The length of the longest non-unit length column determines the
309        // broadcast length. If all columns are unit-length the broadcast length
310        // is one.
311        let broadcast_len = columns
312            .iter()
313            .map(|s| s.len())
314            .filter(|l| *l != 1)
315            .max()
316            .unwrap_or(1);
317        Self::new_with_broadcast_len(columns, broadcast_len)
318    }
319
320    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321    /// columns to broadcast_len.
322    pub fn new_with_broadcast_len(
323        columns: Vec<Column>,
324        broadcast_len: usize,
325    ) -> PolarsResult<Self> {
326        ensure_names_unique(&columns, |s| s.name().as_str())?;
327        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328    }
329
    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
    /// columns to `broadcast_len`.
    ///
    /// Returns a `ShapeMismatch` error if a column has a length that is neither
    /// `broadcast_len` nor 1. With no columns at all the resulting height is 0.
    ///
    /// # Safety
    /// Does not check that the column names are unique (which they must be).
    pub unsafe fn new_with_broadcast_no_namecheck(
        mut columns: Vec<Column>,
        broadcast_len: usize,
    ) -> PolarsResult<Self> {
        for col in &mut columns {
            // Length not equal to the broadcast len, needs broadcast or is an error.
            let len = col.len();
            if len != broadcast_len {
                if len != 1 {
                    let name = col.name().to_owned();
                    // For a friendlier message, name a column that does have the
                    // broadcast length, if there is one.
                    let extra_info =
                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
                            format!(" (matching column '{}')", c.name())
                        } else {
                            String::new()
                        };
                    polars_bail!(
                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
                    );
                }
                // Unit-length column: repeat its single value `broadcast_len` times.
                *col = col.new_from_index(0, broadcast_len);
            }
        }

        // A frame without columns gets height 0 regardless of `broadcast_len`.
        let length = if columns.is_empty() { 0 } else { broadcast_len };

        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
    }
363
364    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
365    ///
366    /// # Example
367    ///
368    /// ```rust
369    /// use polars_core::prelude::DataFrame;
370    /// static EMPTY: DataFrame = DataFrame::empty();
371    /// ```
372    pub const fn empty() -> Self {
373        Self::empty_with_height(0)
374    }
375
376    /// Creates an empty `DataFrame` with a specific `height`.
377    pub const fn empty_with_height(height: usize) -> Self {
378        DataFrame {
379            height,
380            columns: vec![],
381            cached_schema: OnceLock::new(),
382        }
383    }
384
385    /// Create an empty `DataFrame` with empty columns as per the `schema`.
386    pub fn empty_with_schema(schema: &Schema) -> Self {
387        let cols = schema
388            .iter()
389            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390            .collect();
391        unsafe { DataFrame::new_no_checks(0, cols) }
392    }
393
394    /// Create an empty `DataFrame` with empty columns as per the `schema`.
395    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396        let cols = schema
397            .iter_values()
398            .map(|fld| {
399                Column::from(Series::new_empty(
400                    fld.name.clone(),
401                    &(DataType::from_arrow_field(fld)),
402                ))
403            })
404            .collect();
405        unsafe { DataFrame::new_no_checks(0, cols) }
406    }
407
408    /// Create a new `DataFrame` with the given schema, only containing nulls.
409    pub fn full_null(schema: &Schema, height: usize) -> Self {
410        let columns = schema
411            .iter_fields()
412            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413            .collect();
414        unsafe { DataFrame::new_no_checks(height, columns) }
415    }
416
417    /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
418    ///
419    /// # Example
420    ///
421    /// ```rust
422    /// # use polars_core::prelude::*;
423    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
424    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
425    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
426    ///
427    /// assert_eq!(df.pop(), Some(s2));
428    /// assert_eq!(df.pop(), Some(s1));
429    /// assert_eq!(df.pop(), None);
430    /// assert!(df.is_empty());
431    /// # Ok::<(), PolarsError>(())
432    /// ```
433    pub fn pop(&mut self) -> Option<Column> {
434        self.clear_schema();
435
436        self.columns.pop()
437    }
438
439    /// Add a new column at index 0 that counts the rows.
440    ///
441    /// # Example
442    ///
443    /// ```
444    /// # use polars_core::prelude::*;
445    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446    /// assert_eq!(df1.shape(), (4, 1));
447    ///
448    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449    /// assert_eq!(df2.shape(), (4, 2));
450    /// println!("{}", df2);
451    ///
452    /// # Ok::<(), PolarsError>(())
453    /// ```
454    ///
455    /// Output:
456    ///
457    /// ```text
458    ///  shape: (4, 2)
459    ///  +-----+----------+
460    ///  | Id  | Name     |
461    ///  | --- | ---      |
462    ///  | u32 | str      |
463    ///  +=====+==========+
464    ///  | 0   | James    |
465    ///  +-----+----------+
466    ///  | 1   | Mary     |
467    ///  +-----+----------+
468    ///  | 2   | John     |
469    ///  +-----+----------+
470    ///  | 3   | Patricia |
471    ///  +-----+----------+
472    /// ```
473    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474        let mut columns = Vec::with_capacity(self.columns.len() + 1);
475        let offset = offset.unwrap_or(0);
476
477        let col = Column::new_row_index(name, offset, self.height())?;
478        columns.push(col);
479        columns.extend_from_slice(&self.columns);
480        DataFrame::new(columns)
481    }
482
483    /// Add a row index column in place.
484    ///
485    /// # Safety
486    /// The caller should ensure the DataFrame does not already contain a column with the given name.
487    ///
488    /// # Panics
489    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
490    pub unsafe fn with_row_index_mut(
491        &mut self,
492        name: PlSmallStr,
493        offset: Option<IdxSize>,
494    ) -> &mut Self {
495        // TODO: Make this function unsafe
496        debug_assert!(
497            self.columns.iter().all(|c| c.name() != &name),
498            "with_row_index_mut(): column with name {} already exists",
499            &name
500        );
501
502        let offset = offset.unwrap_or(0);
503        let col = Column::new_row_index(name, offset, self.height()).unwrap();
504
505        self.clear_schema();
506        self.columns.insert(0, col);
507        self
508    }
509
510    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
511    /// `Series`.
512    ///
513    /// Calculates the height from the first column or `0` if no columns are given.
514    ///
515    /// # Safety
516    ///
517    /// It is the callers responsibility to uphold the contract of all `Series`
518    /// having an equal length and a unique name, if not this may panic down the line.
519    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
520        let height = columns.first().map_or(0, Column::len);
521        unsafe { Self::new_no_checks(height, columns) }
522    }
523
524    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
525    /// `Series`.
526    ///
527    /// It is advised to use [DataFrame::new] in favor of this method.
528    ///
529    /// # Safety
530    ///
531    /// It is the callers responsibility to uphold the contract of all `Series`
532    /// having an equal length and a unique name, if not this may panic down the line.
533    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
534        if cfg!(debug_assertions) {
535            DataFrame::validate_columns_slice(&columns).unwrap();
536        }
537
538        unsafe { Self::_new_no_checks_impl(height, columns) }
539    }
540
541    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
542    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
543    /// constructed with this method is generally highly unsafe and should not be long-lived.
544    #[allow(clippy::missing_safety_doc)]
545    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
546        DataFrame {
547            height,
548            columns,
549            cached_schema: OnceLock::new(),
550        }
551    }
552
553    /// Shrink the capacity of this DataFrame to fit its length.
554    pub fn shrink_to_fit(&mut self) {
555        // Don't parallelize this. Memory overhead
556        for s in &mut self.columns {
557            s.shrink_to_fit();
558        }
559    }
560
561    /// Aggregate all the chunks in the DataFrame to a single chunk.
562    pub fn as_single_chunk(&mut self) -> &mut Self {
563        // Don't parallelize this. Memory overhead
564        for s in &mut self.columns {
565            *s = s.rechunk();
566        }
567        self
568    }
569
570    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
571    /// This may lead to more peak memory consumption.
572    pub fn as_single_chunk_par(&mut self) -> &mut Self {
573        if self.columns.iter().any(|c| c.n_chunks() > 1) {
574            self.columns = self._apply_columns_par(&|s| s.rechunk());
575        }
576        self
577    }
578
579    /// Rechunks all columns to only have a single chunk.
580    pub fn rechunk_mut(&mut self) {
581        // SAFETY: We never adjust the length or names of the columns.
582        let columns = unsafe { self.get_columns_mut() };
583
584        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
585            *col = col.rechunk();
586        }
587    }
588
    /// Replaces, in place, every `Column::Series` column of binary or string type by a
    /// version whose arrays went through `deshare()`.
    ///
    /// Columns of other variants or other types are left untouched.
    // NOTE(review): `deshare` presumably makes each view array stop sharing its data
    // buffers with other arrays; confirm against the array implementation.
    pub fn _deshare_views_mut(&mut self) {
        // SAFETY: We never adjust the length or names of the columns.
        unsafe {
            let columns = self.get_columns_mut();
            for col in columns {
                // Only `Series`-backed columns are processed.
                let Column::Series(s) = col else { continue };

                if let Ok(ca) = s.binary() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                } else if let Ok(ca) = s.str() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                }
            }
        }
    }
606
607    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
608    pub fn rechunk_to_record_batch(
609        self,
610        compat_level: CompatLevel,
611    ) -> RecordBatchT<Box<dyn Array>> {
612        let height = self.height();
613
614        let (schema, arrays) = self
615            .columns
616            .into_iter()
617            .map(|col| {
618                let mut series = col.take_materialized_series();
619                // Rechunk to one chunk if necessary
620                if series.n_chunks() > 1 {
621                    series = series.rechunk();
622                }
623                (
624                    series.field().to_arrow(compat_level),
625                    series.to_arrow(0, compat_level),
626                )
627            })
628            .collect();
629
630        RecordBatchT::new(height, Arc::new(schema), arrays)
631    }
632
    /// Returns true if the chunks of the columns do not align and re-chunking should be done.
    ///
    /// Chunks are considered misaligned when the `Series`-backed columns differ in their
    /// number of chunks, when the first column has more chunks than the frame has rows
    /// (except for an empty frame with a single chunk), or when the chunk lengths of a
    /// column differ from those of the first column.
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns: nothing to align.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                // (the lower bound of `size_hint` is used as the number of chunks).
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare each remaining column's chunk lengths position by position
                // with those of the first column.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
673
674    /// Ensure all the chunks in the [`DataFrame`] are aligned.
675    pub fn align_chunks_par(&mut self) -> &mut Self {
676        if self.should_rechunk() {
677            self.as_single_chunk_par()
678        } else {
679            self
680        }
681    }
682
683    pub fn align_chunks(&mut self) -> &mut Self {
684        if self.should_rechunk() {
685            self.as_single_chunk()
686        } else {
687            self
688        }
689    }
690
691    /// Get the [`DataFrame`] schema.
692    ///
693    /// # Example
694    ///
695    /// ```rust
696    /// # use polars_core::prelude::*;
697    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
698    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
699    ///
700    /// let f1: Field = Field::new("Thing".into(), DataType::String);
701    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
702    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
703    ///
704    /// assert_eq!(&**df.schema(), &sc);
705    /// # Ok::<(), PolarsError>(())
706    /// ```
707    pub fn schema(&self) -> &SchemaRef {
708        let out = self.cached_schema.get_or_init(|| {
709            Arc::new(
710                self.columns
711                    .iter()
712                    .map(|x| (x.name().clone(), x.dtype().clone()))
713                    .collect(),
714            )
715        });
716
717        debug_assert_eq!(out.len(), self.width());
718
719        out
720    }
721
722    /// Get a reference to the [`DataFrame`] columns.
723    ///
724    /// # Example
725    ///
726    /// ```rust
727    /// # use polars_core::prelude::*;
728    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
729    ///                         "Symbol" => ["A", "C", "G", "T"])?;
730    /// let columns: &[Column] = df.get_columns();
731    ///
732    /// assert_eq!(columns[0].name(), "Name");
733    /// assert_eq!(columns[1].name(), "Symbol");
734    /// # Ok::<(), PolarsError>(())
735    /// ```
736    #[inline]
737    pub fn get_columns(&self) -> &[Column] {
738        &self.columns
739    }
740
741    #[inline]
742    /// Get mutable access to the underlying columns.
743    ///
744    /// # Safety
745    ///
746    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
747    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
748    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
749    /// calling [`DataFrame::clear_schema`].
750    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
751        &mut self.columns
752    }
753
754    #[inline]
755    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
756    pub fn clear_columns(&mut self) {
757        unsafe { self.get_columns_mut() }.clear();
758        self.clear_schema();
759    }
760
761    #[inline]
762    /// Extend the columns without checking for name collisions or height.
763    ///
764    /// # Safety
765    ///
766    /// The caller needs to ensure that:
767    /// - Column names are unique within the resulting [`DataFrame`].
768    /// - The length of each appended column matches the height of the [`DataFrame`]. For
769    ///   `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
770    ///   with [`DataFrame::set_height`].
771    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
772        unsafe { self.get_columns_mut() }.extend(iter);
773        self.clear_schema();
774    }
775
776    /// Take ownership of the underlying columns vec.
777    pub fn take_columns(self) -> Vec<Column> {
778        self.columns
779    }
780
781    /// Iterator over the columns as [`Series`].
782    ///
783    /// # Example
784    ///
785    /// ```rust
786    /// # use polars_core::prelude::*;
787    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
788    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
789    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
790    ///
791    /// let mut iterator = df.iter();
792    ///
793    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
794    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
795    /// assert_eq!(iterator.next(), None);
796    /// # Ok::<(), PolarsError>(())
797    /// ```
798    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
799        self.materialized_column_iter()
800    }
801
802    /// # Example
803    ///
804    /// ```rust
805    /// # use polars_core::prelude::*;
806    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
807    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
808    ///
809    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
810    /// # Ok::<(), PolarsError>(())
811    /// ```
812    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
813        self.columns.iter().map(|s| s.name()).collect()
814    }
815
816    /// Get the [`Vec<PlSmallStr>`] representing the column names.
817    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
818        self.columns.iter().map(|s| s.name().clone()).collect()
819    }
820
821    pub fn get_column_names_str(&self) -> Vec<&str> {
822        self.columns.iter().map(|s| s.name().as_str()).collect()
823    }
824
825    /// Set the column names.
826    /// # Example
827    ///
828    /// ```rust
829    /// # use polars_core::prelude::*;
830    /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
831    /// df.set_column_names(["Set"])?;
832    ///
833    /// assert_eq!(df.get_column_names(), &["Set"]);
834    /// # Ok::<(), PolarsError>(())
835    /// ```
836    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
837    where
838        I: IntoIterator<Item = S>,
839        S: Into<PlSmallStr>,
840    {
841        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
842        self._set_column_names_impl(names.as_slice())
843    }
844
845    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
846        polars_ensure!(
847            names.len() == self.width(),
848            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
849            names.len(), self.width()
850        );
851        ensure_names_unique(names, |s| s.as_str())?;
852
853        let columns = mem::take(&mut self.columns);
854        self.columns = columns
855            .into_iter()
856            .zip(names)
857            .map(|(s, name)| {
858                let mut s = s;
859                s.rename(name.clone());
860                s
861            })
862            .collect();
863        self.clear_schema();
864        Ok(())
865    }
866
867    /// Get the data types of the columns in the [`DataFrame`].
868    ///
869    /// # Example
870    ///
871    /// ```rust
872    /// # use polars_core::prelude::*;
873    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
874    ///                                "Fraction" => [0.965, 0.035])?;
875    ///
876    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
877    /// # Ok::<(), PolarsError>(())
878    /// ```
879    pub fn dtypes(&self) -> Vec<DataType> {
880        self.columns.iter().map(|s| s.dtype().clone()).collect()
881    }
882
883    pub(crate) fn first_series_column(&self) -> Option<&Series> {
884        self.columns.iter().find_map(|col| col.as_series())
885    }
886
887    /// The number of chunks for the first column.
888    pub fn first_col_n_chunks(&self) -> usize {
889        match self.first_series_column() {
890            None if self.columns.is_empty() => 0,
891            None => 1,
892            Some(s) => s.n_chunks(),
893        }
894    }
895
896    /// The highest number of chunks for any column.
897    pub fn max_n_chunks(&self) -> usize {
898        self.columns
899            .iter()
900            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
901            .max()
902            .unwrap_or(0)
903    }
904
905    /// Get a reference to the schema fields of the [`DataFrame`].
906    ///
907    /// # Example
908    ///
909    /// ```rust
910    /// # use polars_core::prelude::*;
911    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
912    ///                            "Fraction" => [0.708, 0.292])?;
913    ///
914    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
915    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
916    ///
917    /// assert_eq!(earth.fields(), &[f1, f2]);
918    /// # Ok::<(), PolarsError>(())
919    /// ```
920    pub fn fields(&self) -> Vec<Field> {
921        self.columns
922            .iter()
923            .map(|s| s.field().into_owned())
924            .collect()
925    }
926
927    /// Get (height, width) of the [`DataFrame`].
928    ///
929    /// # Example
930    ///
931    /// ```rust
932    /// # use polars_core::prelude::*;
933    /// let df0: DataFrame = DataFrame::default();
934    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
935    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
936    ///                          "2" => [1, 2, 3, 4, 5])?;
937    ///
938    /// assert_eq!(df0.shape(), (0 ,0));
939    /// assert_eq!(df1.shape(), (5, 1));
940    /// assert_eq!(df2.shape(), (5, 2));
941    /// # Ok::<(), PolarsError>(())
942    /// ```
943    pub fn shape(&self) -> (usize, usize) {
944        (self.height, self.columns.len())
945    }
946
947    /// Get the width of the [`DataFrame`] which is the number of columns.
948    ///
949    /// # Example
950    ///
951    /// ```rust
952    /// # use polars_core::prelude::*;
953    /// let df0: DataFrame = DataFrame::default();
954    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
955    /// let df2: DataFrame = df!("Series 1" => [0; 0],
956    ///                          "Series 2" => [0; 0])?;
957    ///
958    /// assert_eq!(df0.width(), 0);
959    /// assert_eq!(df1.width(), 1);
960    /// assert_eq!(df2.width(), 2);
961    /// # Ok::<(), PolarsError>(())
962    /// ```
963    pub fn width(&self) -> usize {
964        self.columns.len()
965    }
966
967    /// Get the height of the [`DataFrame`] which is the number of rows.
968    ///
969    /// # Example
970    ///
971    /// ```rust
972    /// # use polars_core::prelude::*;
973    /// let df0: DataFrame = DataFrame::default();
974    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
975    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
976    ///
977    /// assert_eq!(df0.height(), 0);
978    /// assert_eq!(df1.height(), 2);
979    /// assert_eq!(df2.height(), 5);
980    /// # Ok::<(), PolarsError>(())
981    /// ```
982    pub fn height(&self) -> usize {
983        self.height
984    }
985
986    /// Returns the size as number of rows * number of columns
987    pub fn size(&self) -> usize {
988        let s = self.shape();
989        s.0 * s.1
990    }
991
992    /// Returns `true` if the [`DataFrame`] contains no rows.
993    ///
994    /// # Example
995    ///
996    /// ```rust
997    /// # use polars_core::prelude::*;
998    /// let df1: DataFrame = DataFrame::default();
999    /// assert!(df1.is_empty());
1000    ///
1001    /// let df2: DataFrame = df!("First name" => ["Forever"],
1002    ///                          "Last name" => ["Alone"])?;
1003    /// assert!(!df2.is_empty());
1004    /// # Ok::<(), PolarsError>(())
1005    /// ```
1006    pub fn is_empty(&self) -> bool {
1007        matches!(self.shape(), (0, _) | (_, 0))
1008    }
1009
1010    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1011    ///
1012    /// # Safety
1013    ///
1014    /// This needs to be equal to the length of all the columns.
1015    pub unsafe fn set_height(&mut self, height: usize) {
1016        self.height = height;
1017    }
1018
1019    /// Add multiple [`Series`] to a [`DataFrame`].
1020    /// The added `Series` are required to have the same length.
1021    ///
1022    /// # Example
1023    ///
1024    /// ```rust
1025    /// # use polars_core::prelude::*;
1026    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1027    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1028    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1029    ///
1030    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1031    /// assert_eq!(df2.shape(), (3, 3));
1032    /// println!("{}", df2);
1033    /// # Ok::<(), PolarsError>(())
1034    /// ```
1035    ///
1036    /// Output:
1037    ///
1038    /// ```text
1039    /// shape: (3, 3)
1040    /// +---------+--------+----------+
1041    /// | Element | Proton | Electron |
1042    /// | ---     | ---    | ---      |
1043    /// | str     | i32    | i32      |
1044    /// +=========+========+==========+
1045    /// | Copper  | 29     | 29       |
1046    /// +---------+--------+----------+
1047    /// | Silver  | 47     | 47       |
1048    /// +---------+--------+----------+
1049    /// | Gold    | 79     | 79       |
1050    /// +---------+--------+----------+
1051    /// ```
1052    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1053        let mut new_cols = self.columns.clone();
1054        new_cols.extend_from_slice(columns);
1055        DataFrame::new(new_cols)
1056    }
1057
1058    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1059    ///
1060    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1061    ///
1062    /// # Example
1063    ///
1064    /// ```rust
1065    /// # use polars_core::prelude::*;
1066    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1067    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1068    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1069    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1070    ///
1071    /// let df3: DataFrame = df1.vstack(&df2)?;
1072    ///
1073    /// assert_eq!(df3.shape(), (5, 2));
1074    /// println!("{}", df3);
1075    /// # Ok::<(), PolarsError>(())
1076    /// ```
1077    ///
1078    /// Output:
1079    ///
1080    /// ```text
1081    /// shape: (5, 2)
1082    /// +-----------+-------------------+
1083    /// | Element   | Melting Point (K) |
1084    /// | ---       | ---               |
1085    /// | str       | f64               |
1086    /// +===========+===================+
1087    /// | Copper    | 1357.77           |
1088    /// +-----------+-------------------+
1089    /// | Silver    | 1234.93           |
1090    /// +-----------+-------------------+
1091    /// | Gold      | 1337.33           |
1092    /// +-----------+-------------------+
1093    /// | Platinum  | 2041.4            |
1094    /// +-----------+-------------------+
1095    /// | Palladium | 1828.05           |
1096    /// +-----------+-------------------+
1097    /// ```
1098    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1099        let mut df = self.clone();
1100        df.vstack_mut(other)?;
1101        Ok(df)
1102    }
1103
1104    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1105    ///
1106    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1107    ///
1108    /// # Example
1109    ///
1110    /// ```rust
1111    /// # use polars_core::prelude::*;
1112    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1113    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1114    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1115    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1116    ///
1117    /// df1.vstack_mut(&df2)?;
1118    ///
1119    /// assert_eq!(df1.shape(), (5, 2));
1120    /// println!("{}", df1);
1121    /// # Ok::<(), PolarsError>(())
1122    /// ```
1123    ///
1124    /// Output:
1125    ///
1126    /// ```text
1127    /// shape: (5, 2)
1128    /// +-----------+-------------------+
1129    /// | Element   | Melting Point (K) |
1130    /// | ---       | ---               |
1131    /// | str       | f64               |
1132    /// +===========+===================+
1133    /// | Copper    | 1357.77           |
1134    /// +-----------+-------------------+
1135    /// | Silver    | 1234.93           |
1136    /// +-----------+-------------------+
1137    /// | Gold      | 1337.33           |
1138    /// +-----------+-------------------+
1139    /// | Platinum  | 2041.4            |
1140    /// +-----------+-------------------+
1141    /// | Palladium | 1828.05           |
1142    /// +-----------+-------------------+
1143    /// ```
1144    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1145        if self.width() != other.width() {
1146            polars_ensure!(
1147                self.width() == 0,
1148                ShapeMismatch:
1149                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1150                self.width(), other.width(),
1151            );
1152            self.columns.clone_from(&other.columns);
1153            self.height = other.height;
1154            return Ok(self);
1155        }
1156
1157        self.columns
1158            .iter_mut()
1159            .zip(other.columns.iter())
1160            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1161                ensure_can_extend(&*left, right)?;
1162                left.append(right).map_err(|e| {
1163                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1164                })?;
1165                Ok(())
1166            })?;
1167        self.height += other.height;
1168        Ok(self)
1169    }
1170
1171    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1172        if self.width() != other.width() {
1173            polars_ensure!(
1174                self.width() == 0,
1175                ShapeMismatch:
1176                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1177                self.width(), other.width(),
1178            );
1179            self.columns = other.columns;
1180            self.height = other.height;
1181            return Ok(self);
1182        }
1183
1184        self.columns
1185            .iter_mut()
1186            .zip(other.columns.into_iter())
1187            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1188                ensure_can_extend(&*left, &right)?;
1189                let right_name = right.name().clone();
1190                left.append_owned(right).map_err(|e| {
1191                    e.context(format!("failed to vstack column '{right_name}'").into())
1192                })?;
1193                Ok(())
1194            })?;
1195        self.height += other.height;
1196        Ok(self)
1197    }
1198
1199    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1200    ///
1201    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1202    ///
1203    /// # Panics
1204    /// Panics if the schema's don't match.
1205    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1206        self.columns
1207            .iter_mut()
1208            .zip(other.columns.iter())
1209            .for_each(|(left, right)| {
1210                left.append(right)
1211                    .map_err(|e| {
1212                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1213                    })
1214                    .expect("should not fail");
1215            });
1216        self.height += other.height;
1217    }
1218
1219    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1220    ///
1221    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1222    ///
1223    /// # Panics
1224    /// Panics if the schema's don't match.
1225    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1226        self.columns
1227            .iter_mut()
1228            .zip(other.columns)
1229            .for_each(|(left, right)| {
1230                left.append_owned(right).expect("should not fail");
1231            });
1232        self.height += other.height;
1233    }
1234
1235    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1236    ///
1237    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1238    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1239    ///
1240    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1241    /// and thus will yield faster queries.
1242    ///
1243    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1244    /// online operations where you add `n` rows and rerun a query.
1245    ///
1246    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1247    /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
1248    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1249    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1250        polars_ensure!(
1251            self.width() == other.width(),
1252            ShapeMismatch:
1253            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1254            self.width(), other.width(),
1255        );
1256
1257        self.columns
1258            .iter_mut()
1259            .zip(other.columns.iter())
1260            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1261                ensure_can_extend(&*left, right)?;
1262                left.extend(right).map_err(|e| {
1263                    e.context(format!("failed to extend column '{}'", right.name()).into())
1264                })?;
1265                Ok(())
1266            })?;
1267        self.height += other.height;
1268        self.clear_schema();
1269        Ok(())
1270    }
1271
1272    /// Remove a column by name and return the column removed.
1273    ///
1274    /// # Example
1275    ///
1276    /// ```rust
1277    /// # use polars_core::prelude::*;
1278    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1279    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1280    ///
1281    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1282    /// assert!(s1.is_err());
1283    ///
1284    /// let s2: Column = df.drop_in_place("Animal")?;
1285    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1286    /// # Ok::<(), PolarsError>(())
1287    /// ```
1288    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1289        let idx = self.check_name_to_idx(name)?;
1290        self.clear_schema();
1291        Ok(self.columns.remove(idx))
1292    }
1293
1294    /// Return a new [`DataFrame`] where all null values are dropped.
1295    ///
1296    /// # Example
1297    ///
1298    /// ```no_run
1299    /// # use polars_core::prelude::*;
1300    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1301    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1302    /// assert_eq!(df1.shape(), (3, 2));
1303    ///
1304    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1305    /// assert_eq!(df2.shape(), (1, 2));
1306    /// println!("{}", df2);
1307    /// # Ok::<(), PolarsError>(())
1308    /// ```
1309    ///
1310    /// Output:
1311    ///
1312    /// ```text
1313    /// shape: (1, 2)
1314    /// +---------+---------------------+
1315    /// | Country | Tax revenue (% GDP) |
1316    /// | ---     | ---                 |
1317    /// | str     | f64                 |
1318    /// +=========+=====================+
1319    /// | Malta   | 32.7                |
1320    /// +---------+---------------------+
1321    /// ```
1322    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1323    where
1324        for<'a> &'a S: Into<PlSmallStr>,
1325    {
1326        if let Some(v) = subset {
1327            let v = self.select_columns(v)?;
1328            self._drop_nulls_impl(v.as_slice())
1329        } else {
1330            self._drop_nulls_impl(self.columns.as_slice())
1331        }
1332    }
1333
1334    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1335        // fast path for no nulls in df
1336        if subset.iter().all(|s| !s.has_nulls()) {
1337            return Ok(self.clone());
1338        }
1339
1340        let mut iter = subset.iter();
1341
1342        let mask = iter
1343            .next()
1344            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1345        let mut mask = mask.is_not_null();
1346
1347        for c in iter {
1348            mask = mask & c.is_not_null();
1349        }
1350        self.filter(&mask)
1351    }
1352
1353    /// Drop a column by name.
1354    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1355    /// the current one in place.
1356    ///
1357    /// # Example
1358    ///
1359    /// ```rust
1360    /// # use polars_core::prelude::*;
1361    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1362    /// let df2: DataFrame = df1.drop("Ray type")?;
1363    ///
1364    /// assert!(df2.is_empty());
1365    /// # Ok::<(), PolarsError>(())
1366    /// ```
1367    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1368        let idx = self.check_name_to_idx(name)?;
1369        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1370
1371        self.columns.iter().enumerate().for_each(|(i, s)| {
1372            if i != idx {
1373                new_cols.push(s.clone())
1374            }
1375        });
1376
1377        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1378    }
1379
1380    /// Drop columns that are in `names`.
1381    pub fn drop_many<I, S>(&self, names: I) -> Self
1382    where
1383        I: IntoIterator<Item = S>,
1384        S: Into<PlSmallStr>,
1385    {
1386        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1387        self.drop_many_amortized(&names)
1388    }
1389
1390    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1391    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1392        if names.is_empty() {
1393            return self.clone();
1394        }
1395        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1396        self.columns.iter().for_each(|s| {
1397            if !names.contains(s.name()) {
1398                new_cols.push(s.clone())
1399            }
1400        });
1401
1402        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1403    }
1404
1405    /// Insert a new column at a given index without checking for duplicates.
1406    /// This can leave the [`DataFrame`] at an invalid state
1407    fn insert_column_no_name_check(
1408        &mut self,
1409        index: usize,
1410        column: Column,
1411    ) -> PolarsResult<&mut Self> {
1412        polars_ensure!(
1413            self.width() == 0 || column.len() == self.height(),
1414            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1415            column.len(), self.height(),
1416        );
1417
1418        if self.width() == 0 {
1419            self.height = column.len();
1420        }
1421
1422        self.columns.insert(index, column);
1423        self.clear_schema();
1424        Ok(self)
1425    }
1426
1427    /// Insert a new column at a given index.
1428    pub fn insert_column<S: IntoColumn>(
1429        &mut self,
1430        index: usize,
1431        column: S,
1432    ) -> PolarsResult<&mut Self> {
1433        let column = column.into_column();
1434        self.check_already_present(column.name().as_str())?;
1435        self.insert_column_no_name_check(index, column)
1436    }
1437
1438    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1439        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1440            self.replace_column(idx, column)?;
1441        } else {
1442            if self.width() == 0 {
1443                self.height = column.len();
1444            }
1445
1446            self.columns.push(column);
1447            self.clear_schema();
1448        }
1449        Ok(())
1450    }
1451
1452    /// Add a new column to this [`DataFrame`] or replace an existing one.
1453    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1454        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1455            let height = df.height();
1456            if column.len() == 1 && height > 1 {
1457                column = column.new_from_index(0, height);
1458            }
1459
1460            if column.len() == height || df.get_columns().is_empty() {
1461                df.add_column_by_search(column)?;
1462                Ok(df)
1463            }
1464            // special case for literals
1465            else if height == 0 && column.len() == 1 {
1466                let s = column.clear();
1467                df.add_column_by_search(s)?;
1468                Ok(df)
1469            } else {
1470                polars_bail!(
1471                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1472                    column.len(), height,
1473                );
1474            }
1475        }
1476        let column = column.into_column();
1477        inner(self, column)
1478    }
1479
1480    /// Adds a column to the [`DataFrame`] without doing any checks
1481    /// on length or duplicates.
1482    ///
1483    /// # Safety
1484    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1485    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1486        debug_assert!(self.width() == 0 || self.height() == column.len());
1487        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1488
1489        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1490        // properly for `width` == 0.
1491        if self.width() == 0 {
1492            unsafe { self.set_height(column.len()) };
1493        }
1494        unsafe { self.get_columns_mut() }.push(column);
1495        self.clear_schema();
1496
1497        self
1498    }
1499
1500    // Note: Schema can be both input or output_schema
1501    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1502        let name = c.name();
1503        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1504            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1505                // Given schema is output_schema and we can push.
1506                if idx == self.columns.len() {
1507                    if self.width() == 0 {
1508                        self.height = c.len();
1509                    }
1510
1511                    self.columns.push(c);
1512                    self.clear_schema();
1513                }
1514                // Schema is incorrect fallback to search
1515                else {
1516                    debug_assert!(false);
1517                    self.add_column_by_search(c)?;
1518                }
1519            } else {
1520                self.replace_column(idx, c)?;
1521            }
1522        } else {
1523            if self.width() == 0 {
1524                self.height = c.len();
1525            }
1526
1527            self.columns.push(c);
1528            self.clear_schema();
1529        }
1530
1531        Ok(())
1532    }
1533
1534    // Note: Schema can be both input or output_schema
1535    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1536        for (i, s) in series.into_iter().enumerate() {
1537            // we need to branch here
1538            // because users can add multiple columns with the same name
1539            if i == 0 || schema.get(s.name().as_str()).is_some() {
1540                self.with_column_and_schema(s.into_column(), schema)?;
1541            } else {
1542                self.with_column(s.clone().into_column())?;
1543            }
1544        }
1545        Ok(())
1546    }
1547
1548    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1549        for (i, s) in columns.into_iter().enumerate() {
1550            // we need to branch here
1551            // because users can add multiple columns with the same name
1552            if i == 0 || schema.get(s.name().as_str()).is_some() {
1553                self.with_column_and_schema(s, schema)?;
1554            } else {
1555                self.with_column(s.clone())?;
1556            }
1557        }
1558
1559        Ok(())
1560    }
1561
1562    /// Add a new column to this [`DataFrame`] or replace an existing one.
1563    /// Uses an existing schema to amortize lookups.
1564    /// If the schema is incorrect, we will fallback to linear search.
1565    ///
1566    /// Note: Schema can be both input or output_schema
1567    pub fn with_column_and_schema<C: IntoColumn>(
1568        &mut self,
1569        column: C,
1570        schema: &Schema,
1571    ) -> PolarsResult<&mut Self> {
1572        let mut column = column.into_column();
1573
1574        let height = self.height();
1575        if column.len() == 1 && height > 1 {
1576            column = column.new_from_index(0, height);
1577        }
1578
1579        if column.len() == height || self.columns.is_empty() {
1580            self.add_column_by_schema(column, schema)?;
1581            Ok(self)
1582        }
1583        // special case for literals
1584        else if height == 0 && column.len() == 1 {
1585            let s = column.clear();
1586            self.add_column_by_schema(s, schema)?;
1587            Ok(self)
1588        } else {
1589            polars_bail!(
1590                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1591                column.len(), height,
1592            );
1593        }
1594    }
1595
1596    /// Get a row in the [`DataFrame`]. Beware this is slow.
1597    ///
1598    /// # Example
1599    ///
1600    /// ```
1601    /// # use polars_core::prelude::*;
1602    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1603    ///     df.get(idx)
1604    /// }
1605    /// ```
1606    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1607        match self.columns.first() {
1608            Some(s) => {
1609                if s.len() <= idx {
1610                    return None;
1611                }
1612            },
1613            None => return None,
1614        }
1615        // SAFETY: we just checked bounds
1616        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1617    }
1618
1619    /// Select a [`Series`] by index.
1620    ///
1621    /// # Example
1622    ///
1623    /// ```rust
1624    /// # use polars_core::prelude::*;
1625    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1626    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1627    ///
1628    /// let s1: Option<&Column> = df.select_at_idx(0);
1629    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1630    ///
1631    /// assert_eq!(s1, Some(&s2));
1632    /// # Ok::<(), PolarsError>(())
1633    /// ```
1634    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1635        self.columns.get(idx)
1636    }
1637
1638    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1639    ///
1640    /// # Examples
1641    ///
1642    /// ```rust
1643    /// # use polars_core::prelude::*;
1644    /// let df = df! {
1645    ///     "0" => [0, 0, 0],
1646    ///     "1" => [1, 1, 1],
1647    ///     "2" => [2, 2, 2]
1648    /// }?;
1649    ///
1650    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1651    /// assert!(df.equals(&df.select_by_range(..)?));
1652    /// # Ok::<(), PolarsError>(())
1653    /// ```
1654    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1655    where
1656        R: ops::RangeBounds<usize>,
1657    {
1658        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1659        // because it is the nightly feature. We should change here if this function were stable.
1660        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1661        where
1662            R: ops::RangeBounds<usize>,
1663        {
1664            let len = bounds.end;
1665
1666            let start: ops::Bound<&usize> = range.start_bound();
1667            let start = match start {
1668                ops::Bound::Included(&start) => start,
1669                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1670                    panic!("attempted to index slice from after maximum usize");
1671                }),
1672                ops::Bound::Unbounded => 0,
1673            };
1674
1675            let end: ops::Bound<&usize> = range.end_bound();
1676            let end = match end {
1677                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1678                    panic!("attempted to index slice up to maximum usize");
1679                }),
1680                ops::Bound::Excluded(&end) => end,
1681                ops::Bound::Unbounded => len,
1682            };
1683
1684            if start > end {
1685                panic!("slice index starts at {start} but ends at {end}");
1686            }
1687            if end > len {
1688                panic!("range end index {end} out of range for slice of length {len}",);
1689            }
1690
1691            ops::Range { start, end }
1692        }
1693
1694        let colnames = self.get_column_names_owned();
1695        let range = get_range(range, ..colnames.len());
1696
1697        self._select_impl(&colnames[range])
1698    }
1699
1700    /// Get column index of a [`Series`] by name.
1701    /// # Example
1702    ///
1703    /// ```rust
1704    /// # use polars_core::prelude::*;
1705    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1706    ///                         "Health" => [100, 200, 500],
1707    ///                         "Mana" => [250, 100, 0],
1708    ///                         "Strength" => [30, 150, 300])?;
1709    ///
1710    /// assert_eq!(df.get_column_index("Name"), Some(0));
1711    /// assert_eq!(df.get_column_index("Health"), Some(1));
1712    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1713    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1714    /// assert_eq!(df.get_column_index("Haste"), None);
1715    /// # Ok::<(), PolarsError>(())
1716    /// ```
1717    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1718        let schema = self.schema();
1719        if let Some(idx) = schema.index_of(name) {
1720            if self
1721                .get_columns()
1722                .get(idx)
1723                .is_some_and(|c| c.name() == name)
1724            {
1725                return Some(idx);
1726            }
1727        }
1728
1729        self.columns.iter().position(|s| s.name().as_str() == name)
1730    }
1731
1732    /// Get column index of a [`Series`] by name.
1733    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1734        self.get_column_index(name)
1735            .ok_or_else(|| polars_err!(col_not_found = name))
1736    }
1737
1738    /// Select a single column by name.
1739    ///
1740    /// # Example
1741    ///
1742    /// ```rust
1743    /// # use polars_core::prelude::*;
1744    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1745    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1746    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1747    ///
1748    /// assert_eq!(df.column("Password")?, &s1);
1749    /// # Ok::<(), PolarsError>(())
1750    /// ```
1751    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1752        let idx = self.try_get_column_index(name)?;
1753        Ok(self.select_at_idx(idx).unwrap())
1754    }
1755
1756    /// Selected multiple columns by name.
1757    ///
1758    /// # Example
1759    ///
1760    /// ```rust
1761    /// # use polars_core::prelude::*;
1762    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1763    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1764    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1765    ///
1766    /// assert_eq!(&df[0], sv[0]);
1767    /// assert_eq!(&df[1], sv[1]);
1768    /// # Ok::<(), PolarsError>(())
1769    /// ```
1770    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1771    where
1772        I: IntoIterator<Item = S>,
1773        S: AsRef<str>,
1774    {
1775        names
1776            .into_iter()
1777            .map(|name| self.column(name.as_ref()))
1778            .collect()
1779    }
1780
1781    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1782    ///
1783    /// # Examples
1784    ///
1785    /// ```
1786    /// # use polars_core::prelude::*;
1787    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1788    ///     df.select(["foo", "bar"])
1789    /// }
1790    /// ```
1791    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1792    where
1793        I: IntoIterator<Item = S>,
1794        S: Into<PlSmallStr>,
1795    {
1796        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1797        self._select_impl(cols.as_slice())
1798    }
1799
1800    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1801        ensure_names_unique(cols, |s| s.as_str())?;
1802        self._select_impl_unchecked(cols)
1803    }
1804
1805    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1806        let selected = self.select_columns_impl(cols)?;
1807        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1808    }
1809
1810    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1811    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1812    where
1813        I: IntoIterator<Item = S>,
1814        S: Into<PlSmallStr>,
1815    {
1816        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1817        self._select_with_schema_impl(&cols, schema, true)
1818    }
1819
1820    /// Select with a known schema without checking for duplicates in `selection`.
1821    /// The schema names must match the column names of this DataFrame.
1822    pub fn select_with_schema_unchecked<I, S>(
1823        &self,
1824        selection: I,
1825        schema: &Schema,
1826    ) -> PolarsResult<Self>
1827    where
1828        I: IntoIterator<Item = S>,
1829        S: Into<PlSmallStr>,
1830    {
1831        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1832        self._select_with_schema_impl(&cols, schema, false)
1833    }
1834
1835    /// * The schema names must match the column names of this DataFrame.
1836    pub fn _select_with_schema_impl(
1837        &self,
1838        cols: &[PlSmallStr],
1839        schema: &Schema,
1840        check_duplicates: bool,
1841    ) -> PolarsResult<Self> {
1842        if check_duplicates {
1843            ensure_names_unique(cols, |s| s.as_str())?;
1844        }
1845
1846        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1847        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1848    }
1849
1850    /// A non generic implementation to reduce compiler bloat.
1851    fn select_columns_impl_with_schema(
1852        &self,
1853        cols: &[PlSmallStr],
1854        schema: &Schema,
1855    ) -> PolarsResult<Vec<Column>> {
1856        if cfg!(debug_assertions) {
1857            ensure_matching_schema_names(schema, self.schema())?;
1858        }
1859
1860        cols.iter()
1861            .map(|name| {
1862                let index = schema.try_get_full(name.as_str())?.0;
1863                Ok(self.columns[index].clone())
1864            })
1865            .collect()
1866    }
1867
1868    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1869    where
1870        I: IntoIterator<Item = S>,
1871        S: Into<PlSmallStr>,
1872    {
1873        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1874        self.select_physical_impl(&cols)
1875    }
1876
1877    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1878        ensure_names_unique(cols, |s| s.as_str())?;
1879        let selected = self.select_columns_physical_impl(cols)?;
1880        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1881    }
1882
1883    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1884    ///
1885    /// # Example
1886    ///
1887    /// ```rust
1888    /// # use polars_core::prelude::*;
1889    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1890    ///                         "Carbon" => [1, 2, 3],
1891    ///                         "Hydrogen" => [4, 6, 8])?;
1892    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1893    ///
1894    /// assert_eq!(df["Carbon"], sv[0]);
1895    /// assert_eq!(df["Hydrogen"], sv[1]);
1896    /// # Ok::<(), PolarsError>(())
1897    /// ```
1898    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1899        let cols = selection.into_vec();
1900        self.select_columns_impl(&cols)
1901    }
1902
1903    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1904        self.columns
1905            .iter()
1906            .enumerate()
1907            .map(|(i, s)| (s.name().as_str(), i))
1908            .collect()
1909    }
1910
1911    /// A non generic implementation to reduce compiler bloat.
1912    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1913        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1914            let name_to_idx = self._names_to_idx_map();
1915            cols.iter()
1916                .map(|name| {
1917                    let idx = *name_to_idx
1918                        .get(name.as_str())
1919                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1920                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1921                })
1922                .collect::<PolarsResult<Vec<_>>>()?
1923        } else {
1924            cols.iter()
1925                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1926                .collect::<PolarsResult<Vec<_>>>()?
1927        };
1928
1929        Ok(selected)
1930    }
1931
1932    /// A non generic implementation to reduce compiler bloat.
1933    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1934        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1935            // we hash, because there are user that having millions of columns.
1936            // # https://github.com/pola-rs/polars/issues/1023
1937            let name_to_idx = self._names_to_idx_map();
1938
1939            cols.iter()
1940                .map(|name| {
1941                    let idx = *name_to_idx
1942                        .get(name.as_str())
1943                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1944                    Ok(self.select_at_idx(idx).unwrap().clone())
1945                })
1946                .collect::<PolarsResult<Vec<_>>>()?
1947        } else {
1948            cols.iter()
1949                .map(|c| self.column(c.as_str()).cloned())
1950                .collect::<PolarsResult<Vec<_>>>()?
1951        };
1952
1953        Ok(selected)
1954    }
1955
1956    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1957        // If there is a filtered column just see how many columns there are left.
1958        if let Some(fst) = filtered.first() {
1959            return fst.len();
1960        }
1961
1962        // Otherwise, count the number of values that would be filtered and return that height.
1963        let num_trues = mask.num_trues();
1964        if mask.len() == self.height() {
1965            num_trues
1966        } else {
1967            // This is for broadcasting masks
1968            debug_assert!(num_trues == 0 || num_trues == 1);
1969            self.height() * num_trues
1970        }
1971    }
1972
1973    /// Take the [`DataFrame`] rows by a boolean mask.
1974    ///
1975    /// # Example
1976    ///
1977    /// ```
1978    /// # use polars_core::prelude::*;
1979    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1980    ///     let mask = df.column("sepal_width")?.is_not_null();
1981    ///     df.filter(&mask)
1982    /// }
1983    /// ```
1984    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1985        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1986        let height = self.filter_height(&new_col, mask);
1987
1988        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1989    }
1990
1991    /// Same as `filter` but does not parallelize.
1992    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1993        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1994        let height = self.filter_height(&new_col, mask);
1995
1996        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1997    }
1998
1999    /// Take [`DataFrame`] rows by index values.
2000    ///
2001    /// # Example
2002    ///
2003    /// ```
2004    /// # use polars_core::prelude::*;
2005    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2006    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2007    ///     df.take(&idx)
2008    /// }
2009    /// ```
2010    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2011        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2012
2013        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2014    }
2015
2016    /// # Safety
2017    /// The indices must be in-bounds.
2018    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2019        self.take_unchecked_impl(idx, true)
2020    }
2021
2022    /// # Safety
2023    /// The indices must be in-bounds.
2024    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2025        let cols = if allow_threads {
2026            POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2027        } else {
2028            self._apply_columns(&|s| s.take_unchecked(idx))
2029        };
2030        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2031    }
2032
2033    /// # Safety
2034    /// The indices must be in-bounds.
2035    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2036        self.take_slice_unchecked_impl(idx, true)
2037    }
2038
2039    /// # Safety
2040    /// The indices must be in-bounds.
2041    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2042        let cols = if allow_threads {
2043            POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2044        } else {
2045            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2046        };
2047        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2048    }
2049
2050    /// Rename a column in the [`DataFrame`].
2051    ///
2052    /// # Example
2053    ///
2054    /// ```
2055    /// # use polars_core::prelude::*;
2056    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2057    ///     let original_name = "foo";
2058    ///     let new_name = "bar";
2059    ///     df.rename(original_name, new_name.into())
2060    /// }
2061    /// ```
2062    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2063        if column == name.as_str() {
2064            return Ok(self);
2065        }
2066        polars_ensure!(
2067            !self.schema().contains(&name),
2068            Duplicate: "column rename attempted with already existing name \"{name}\""
2069        );
2070
2071        self.get_column_index(column)
2072            .and_then(|idx| self.columns.get_mut(idx))
2073            .ok_or_else(|| polars_err!(col_not_found = column))
2074            .map(|c| c.rename(name))?;
2075        Ok(self)
2076    }
2077
2078    /// Sort [`DataFrame`] in place.
2079    ///
2080    /// See [`DataFrame::sort`] for more instruction.
2081    pub fn sort_in_place(
2082        &mut self,
2083        by: impl IntoVec<PlSmallStr>,
2084        sort_options: SortMultipleOptions,
2085    ) -> PolarsResult<&mut Self> {
2086        let by_column = self.select_columns(by)?;
2087        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2088        Ok(self)
2089    }
2090
2091    #[doc(hidden)]
2092    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2093    pub fn sort_impl(
2094        &self,
2095        by_column: Vec<Column>,
2096        mut sort_options: SortMultipleOptions,
2097        slice: Option<(i64, usize)>,
2098    ) -> PolarsResult<Self> {
2099        if by_column.is_empty() {
2100            // If no columns selected, any order (including original order) is correct.
2101            return if let Some((offset, len)) = slice {
2102                Ok(self.slice(offset, len))
2103            } else {
2104                Ok(self.clone())
2105            };
2106        }
2107
2108        // note that the by_column argument also contains evaluated expression from
2109        // polars-lazy that may not even be present in this dataframe. therefore
2110        // when we try to set the first columns as sorted, we ignore the error as
2111        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
2112        let first_descending = sort_options.descending[0];
2113        let first_by_column = by_column[0].name().to_string();
2114
2115        let set_sorted = |df: &mut DataFrame| {
2116            // Mark the first sort column as sorted; if the column does not exist it
2117            // is ok, because we sorted by an expression not present in the dataframe
2118            let _ = df.apply(&first_by_column, |s| {
2119                let mut s = s.clone();
2120                if first_descending {
2121                    s.set_sorted_flag(IsSorted::Descending)
2122                } else {
2123                    s.set_sorted_flag(IsSorted::Ascending)
2124                }
2125                s
2126            });
2127        };
2128        if self.is_empty() {
2129            let mut out = self.clone();
2130            set_sorted(&mut out);
2131            return Ok(out);
2132        }
2133
2134        if let Some((0, k)) = slice {
2135            if k < self.len() {
2136                return self.bottom_k_impl(k, by_column, sort_options);
2137            }
2138        }
2139        // Check if the required column is already sorted; if so we can exit early
2140        // We can do so when there is only one column to sort by, for multiple columns
2141        // it will be complicated to do so
2142        #[cfg(feature = "dtype-categorical")]
2143        let is_not_categorical_enum =
2144            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2145                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2146
2147        #[cfg(not(feature = "dtype-categorical"))]
2148        #[allow(non_upper_case_globals)]
2149        const is_not_categorical_enum: bool = true;
2150
2151        if by_column.len() == 1 && is_not_categorical_enum {
2152            let required_sorting = if sort_options.descending[0] {
2153                IsSorted::Descending
2154            } else {
2155                IsSorted::Ascending
2156            };
2157            // If null count is 0 then nulls_last doesnt matter
2158            // Safe to get value at last position since the dataframe is not empty (taken care above)
2159            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2160                && ((by_column[0].null_count() == 0)
2161                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2162                        == sort_options.nulls_last[0]);
2163
2164            if no_sorting_required {
2165                return if let Some((offset, len)) = slice {
2166                    Ok(self.slice(offset, len))
2167                } else {
2168                    Ok(self.clone())
2169                };
2170            }
2171        }
2172
2173        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2174
2175        // a lot of indirection in both sorting and take
2176        let mut df = self.clone();
2177        let df = df.as_single_chunk_par();
2178        let mut take = match (by_column.len(), has_nested) {
2179            (1, false) => {
2180                let s = &by_column[0];
2181                let options = SortOptions {
2182                    descending: sort_options.descending[0],
2183                    nulls_last: sort_options.nulls_last[0],
2184                    multithreaded: sort_options.multithreaded,
2185                    maintain_order: sort_options.maintain_order,
2186                    limit: sort_options.limit,
2187                };
2188                // fast path for a frame with a single series
2189                // no need to compute the sort indices and then take by these indices
2190                // simply sort and return as frame
2191                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2192                    let mut out = s.sort_with(options)?;
2193                    if let Some((offset, len)) = slice {
2194                        out = out.slice(offset, len);
2195                    }
2196                    return Ok(out.into_frame());
2197                }
2198                s.arg_sort(options)
2199            },
2200            _ => {
2201                if sort_options.nulls_last.iter().all(|&x| x)
2202                    || has_nested
2203                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2204                {
2205                    argsort_multiple_row_fmt(
2206                        &by_column,
2207                        sort_options.descending,
2208                        sort_options.nulls_last,
2209                        sort_options.multithreaded,
2210                    )?
2211                } else {
2212                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2213                    first
2214                        .as_materialized_series()
2215                        .arg_sort_multiple(&other, &sort_options)?
2216                }
2217            },
2218        };
2219
2220        if let Some((offset, len)) = slice {
2221            take = take.slice(offset, len);
2222        }
2223
2224        // SAFETY:
2225        // the created indices are in bounds
2226        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2227        set_sorted(&mut df);
2228        Ok(df)
2229    }
2230
2231    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2232    ///
2233    /// This dataframe does not necessarily have a specified schema and may be changed at any
2234    /// point. It is primarily used for debugging.
2235    pub fn _to_metadata(&self) -> DataFrame {
2236        let num_columns = self.columns.len();
2237
2238        let mut column_names =
2239            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2240        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2241        let mut sorted_asc_ca =
2242            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2243        let mut sorted_dsc_ca =
2244            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2245        let mut fast_explode_list_ca =
2246            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2247        let mut materialized_at_ca =
2248            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2249
2250        for col in &self.columns {
2251            let flags = col.get_flags();
2252
2253            let (repr, materialized_at) = match col {
2254                Column::Series(s) => ("series", s.materialized_at()),
2255                Column::Partitioned(_) => ("partitioned", None),
2256                Column::Scalar(_) => ("scalar", None),
2257            };
2258            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2259            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2260            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2261
2262            column_names.append_value(col.name().clone());
2263            repr_ca.append_value(repr);
2264            sorted_asc_ca.append_value(sorted_asc);
2265            sorted_dsc_ca.append_value(sorted_dsc);
2266            fast_explode_list_ca.append_value(fast_explode_list);
2267            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2268        }
2269
2270        unsafe {
2271            DataFrame::new_no_checks(
2272                self.width(),
2273                vec![
2274                    column_names.finish().into_column(),
2275                    repr_ca.finish().into_column(),
2276                    sorted_asc_ca.finish().into_column(),
2277                    sorted_dsc_ca.finish().into_column(),
2278                    fast_explode_list_ca.finish().into_column(),
2279                    materialized_at_ca.finish().into_column(),
2280                ],
2281            )
2282        }
2283    }
2284
2285    /// Return a sorted clone of this [`DataFrame`].
2286    ///
2287    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2288    /// # Example
2289    ///
2290    /// Sort by a single column with default options:
2291    /// ```
2292    /// # use polars_core::prelude::*;
2293    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2294    ///     df.sort(["sepal_width"], Default::default())
2295    /// }
2296    /// ```
2297    /// Sort by a single column with specific order:
2298    /// ```
2299    /// # use polars_core::prelude::*;
2300    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2301    ///     df.sort(
2302    ///         ["sepal_width"],
2303    ///         SortMultipleOptions::new()
2304    ///             .with_order_descending(descending)
2305    ///     )
2306    /// }
2307    /// ```
2308    /// Sort by multiple columns with specifying order for each column:
2309    /// ```
2310    /// # use polars_core::prelude::*;
2311    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2312    ///     df.sort(
2313    ///         ["sepal_width", "sepal_length"],
2314    ///         SortMultipleOptions::new()
2315    ///             .with_order_descending_multi([false, true])
2316    ///     )
2317    /// }
2318    /// ```
2319    /// See [`SortMultipleOptions`] for more options.
2320    ///
2321    /// Also see [`DataFrame::sort_in_place`].
2322    pub fn sort(
2323        &self,
2324        by: impl IntoVec<PlSmallStr>,
2325        sort_options: SortMultipleOptions,
2326    ) -> PolarsResult<Self> {
2327        let mut df = self.clone();
2328        df.sort_in_place(by, sort_options)?;
2329        Ok(df)
2330    }
2331
2332    /// Replace a column with a [`Series`].
2333    ///
2334    /// # Example
2335    ///
2336    /// ```rust
2337    /// # use polars_core::prelude::*;
2338    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2339    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
2340    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2341    ///
2342    /// assert!(df.replace("Nation", s.clone()).is_err());
2343    /// assert!(df.replace("Country", s).is_ok());
2344    /// # Ok::<(), PolarsError>(())
2345    /// ```
2346    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2347        self.apply(column, |_| new_col.into_series())
2348    }
2349
2350    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2351    /// is that now the value of `column: &str` determines the name of the column and not the name
2352    /// of the `Series` passed to this method.
2353    pub fn replace_or_add<S: IntoSeries>(
2354        &mut self,
2355        column: PlSmallStr,
2356        new_col: S,
2357    ) -> PolarsResult<&mut Self> {
2358        let mut new_col = new_col.into_series();
2359        new_col.rename(column);
2360        self.with_column(new_col)
2361    }
2362
2363    /// Replace column at index `idx` with a [`Series`].
2364    ///
2365    /// # Example
2366    ///
2367    /// ```ignored
2368    /// # use polars_core::prelude::*;
2369    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2370    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2371    /// let mut df = DataFrame::new(vec![s0, s1])?;
2372    ///
2373    /// // Add 32 to get lowercase ascii values
2374    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2375    /// # Ok::<(), PolarsError>(())
2376    /// ```
2377    pub fn replace_column<C: IntoColumn>(
2378        &mut self,
2379        index: usize,
2380        new_column: C,
2381    ) -> PolarsResult<&mut Self> {
2382        polars_ensure!(
2383            index < self.width(),
2384            ShapeMismatch:
2385            "unable to replace at index {}, the DataFrame has only {} columns",
2386            index, self.width(),
2387        );
2388        let mut new_column = new_column.into_column();
2389        polars_ensure!(
2390            new_column.len() == self.height(),
2391            ShapeMismatch:
2392            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2393            new_column.len(), self.height(),
2394        );
2395        let old_col = &mut self.columns[index];
2396        mem::swap(old_col, &mut new_column);
2397        self.clear_schema();
2398        Ok(self)
2399    }
2400
2401    /// Apply a closure to a column. This is the recommended way to do in place modification.
2402    ///
2403    /// # Example
2404    ///
2405    /// ```rust
2406    /// # use polars_core::prelude::*;
2407    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2408    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2409    /// let mut df = DataFrame::new(vec![s0, s1])?;
2410    ///
2411    /// fn str_to_len(str_val: &Column) -> Column {
2412    ///     str_val.str()
2413    ///         .unwrap()
2414    ///         .into_iter()
2415    ///         .map(|opt_name: Option<&str>| {
2416    ///             opt_name.map(|name: &str| name.len() as u32)
2417    ///          })
2418    ///         .collect::<UInt32Chunked>()
2419    ///         .into_column()
2420    /// }
2421    ///
2422    /// // Replace the names column by the length of the names.
2423    /// df.apply("names", str_to_len);
2424    /// # Ok::<(), PolarsError>(())
2425    /// ```
2426    /// Results in:
2427    ///
2428    /// ```text
2429    /// +--------+-------+
2430    /// | foo    |       |
2431    /// | ---    | names |
2432    /// | str    | u32   |
2433    /// +========+=======+
2434    /// | "ham"  | 4     |
2435    /// +--------+-------+
2436    /// | "spam" | 6     |
2437    /// +--------+-------+
2438    /// | "egg"  | 3     |
2439    /// +--------+-------+
2440    /// ```
2441    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2442    where
2443        F: FnOnce(&Column) -> C,
2444        C: IntoColumn,
2445    {
2446        let idx = self.check_name_to_idx(name)?;
2447        self.apply_at_idx(idx, f)
2448    }
2449
2450    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2451    /// modification.
2452    ///
2453    /// # Example
2454    ///
2455    /// ```rust
2456    /// # use polars_core::prelude::*;
2457    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2458    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2459    /// let mut df = DataFrame::new(vec![s0, s1])?;
2460    ///
2461    /// // Add 32 to get lowercase ascii values
2462    /// df.apply_at_idx(1, |s| s + 32);
2463    /// # Ok::<(), PolarsError>(())
2464    /// ```
2465    /// Results in:
2466    ///
2467    /// ```text
2468    /// +--------+-------+
2469    /// | foo    | ascii |
2470    /// | ---    | ---   |
2471    /// | str    | i32   |
2472    /// +========+=======+
2473    /// | "ham"  | 102   |
2474    /// +--------+-------+
2475    /// | "spam" | 111   |
2476    /// +--------+-------+
2477    /// | "egg"  | 111   |
2478    /// +--------+-------+
2479    /// ```
2480    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2481    where
2482        F: FnOnce(&Column) -> C,
2483        C: IntoColumn,
2484    {
2485        let df_height = self.height();
2486        let width = self.width();
2487        let col = self.columns.get_mut(idx).ok_or_else(|| {
2488            polars_err!(
2489                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2490                idx, width
2491            )
2492        })?;
2493        let name = col.name().clone();
2494        let new_col = f(col).into_column();
2495        match new_col.len() {
2496            1 => {
2497                let new_col = new_col.new_from_index(0, df_height);
2498                let _ = mem::replace(col, new_col);
2499            },
2500            len if (len == df_height) => {
2501                let _ = mem::replace(col, new_col);
2502            },
2503            len => polars_bail!(
2504                ShapeMismatch:
2505                "resulting Series has length {} while the DataFrame has height {}",
2506                len, df_height
2507            ),
2508        }
2509
2510        // make sure the name remains the same after applying the closure
2511        unsafe {
2512            let col = self.columns.get_unchecked_mut(idx);
2513            col.rename(name);
2514        }
2515        Ok(self)
2516    }
2517
2518    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2519    /// modification.
2520    ///
2521    /// # Example
2522    ///
2523    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
2524    ///
2525    /// ```rust
2526    /// # use polars_core::prelude::*;
2527    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2528    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2529    /// let mut df = DataFrame::new(vec![s0, s1])?;
2530    ///
2531    /// let idx = vec![0, 1, 4];
2532    ///
2533    /// df.try_apply("foo", |c| {
2534    ///     c.str()?
2535    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2536    /// });
2537    /// # Ok::<(), PolarsError>(())
2538    /// ```
2539    /// Results in:
2540    ///
2541    /// ```text
2542    /// +---------------------+--------+
2543    /// | foo                 | values |
2544    /// | ---                 | ---    |
2545    /// | str                 | i32    |
2546    /// +=====================+========+
2547    /// | "ham-is-modified"   | 1      |
2548    /// +---------------------+--------+
2549    /// | "spam-is-modified"  | 2      |
2550    /// +---------------------+--------+
2551    /// | "egg"               | 3      |
2552    /// +---------------------+--------+
2553    /// | "bacon"             | 4      |
2554    /// +---------------------+--------+
2555    /// | "quack-is-modified" | 5      |
2556    /// +---------------------+--------+
2557    /// ```
2558    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2559    where
2560        F: FnOnce(&Column) -> PolarsResult<C>,
2561        C: IntoColumn,
2562    {
2563        let width = self.width();
2564        let col = self.columns.get_mut(idx).ok_or_else(|| {
2565            polars_err!(
2566                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2567                idx, width
2568            )
2569        })?;
2570        let name = col.name().clone();
2571
2572        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2573
2574        // make sure the name remains the same after applying the closure
2575        unsafe {
2576            let col = self.columns.get_unchecked_mut(idx);
2577            col.rename(name);
2578        }
2579        Ok(self)
2580    }
2581
2582    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2583    /// modification.
2584    ///
2585    /// # Example
2586    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2588    ///
2589    /// ```rust
2590    /// # use polars_core::prelude::*;
2591    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2592    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2593    /// let mut df = DataFrame::new(vec![s0, s1])?;
2594    ///
2595    /// // create a mask
2596    /// let values = df.column("values")?.as_materialized_series();
2597    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2598    ///
2599    /// df.try_apply("foo", |c| {
2600    ///     c.str()?
2601    ///     .set(&mask, Some("not_within_bounds"))
2602    /// });
2603    /// # Ok::<(), PolarsError>(())
2604    /// ```
2605    /// Results in:
2606    ///
2607    /// ```text
2608    /// +---------------------+--------+
2609    /// | foo                 | values |
2610    /// | ---                 | ---    |
2611    /// | str                 | i32    |
2612    /// +=====================+========+
2613    /// | "not_within_bounds" | 1      |
2614    /// +---------------------+--------+
2615    /// | "spam"              | 2      |
2616    /// +---------------------+--------+
2617    /// | "egg"               | 3      |
2618    /// +---------------------+--------+
2619    /// | "bacon"             | 4      |
2620    /// +---------------------+--------+
2621    /// | "not_within_bounds" | 5      |
2622    /// +---------------------+--------+
2623    /// ```
2624    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2625    where
2626        F: FnOnce(&Series) -> PolarsResult<C>,
2627        C: IntoColumn,
2628    {
2629        let idx = self.try_get_column_index(column)?;
2630        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2631    }
2632
2633    /// Slice the [`DataFrame`] along the rows.
2634    ///
2635    /// # Example
2636    ///
2637    /// ```rust
2638    /// # use polars_core::prelude::*;
2639    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2640    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2641    /// let sl: DataFrame = df.slice(2, 3);
2642    ///
2643    /// assert_eq!(sl.shape(), (3, 2));
2644    /// println!("{}", sl);
2645    /// # Ok::<(), PolarsError>(())
2646    /// ```
2647    /// Output:
2648    /// ```text
2649    /// shape: (3, 2)
2650    /// +-------+-------+
2651    /// | Fruit | Color |
2652    /// | ---   | ---   |
2653    /// | str   | str   |
2654    /// +=======+=======+
2655    /// | Grape | White |
2656    /// +-------+-------+
2657    /// | Fig   | White |
2658    /// +-------+-------+
2659    /// | Fig   | Red   |
2660    /// +-------+-------+
2661    /// ```
2662    #[must_use]
2663    pub fn slice(&self, offset: i64, length: usize) -> Self {
2664        if offset == 0 && length == self.height() {
2665            return self.clone();
2666        }
2667        if length == 0 {
2668            return self.clear();
2669        }
2670        let col = self
2671            .columns
2672            .iter()
2673            .map(|s| s.slice(offset, length))
2674            .collect::<Vec<_>>();
2675
2676        let height = if let Some(fst) = col.first() {
2677            fst.len()
2678        } else {
2679            let (_, length) = slice_offsets(offset, length, self.height());
2680            length
2681        };
2682
2683        unsafe { DataFrame::new_no_checks(height, col) }
2684    }
2685
2686    /// Split [`DataFrame`] at the given `offset`.
2687    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2688        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2689
2690        let (idx, _) = slice_offsets(offset, 0, self.height());
2691
2692        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2693        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2694        (a, b)
2695    }
2696
2697    pub fn clear(&self) -> Self {
2698        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2699        unsafe { DataFrame::new_no_checks(0, col) }
2700    }
2701
2702    #[must_use]
2703    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2704        if offset == 0 && length == self.height() {
2705            return self.clone();
2706        }
2707        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2708        unsafe { DataFrame::new_no_checks(length, columns) }
2709    }
2710
2711    #[must_use]
2712    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2713        if offset == 0 && length == self.height() {
2714            return self.clone();
2715        }
2716        // @scalar-opt
2717        let columns = self._apply_columns(&|s| {
2718            let mut out = s.slice(offset, length);
2719            out.shrink_to_fit();
2720            out
2721        });
2722        unsafe { DataFrame::new_no_checks(length, columns) }
2723    }
2724
2725    /// Get the head of the [`DataFrame`].
2726    ///
2727    /// # Example
2728    ///
2729    /// ```rust
2730    /// # use polars_core::prelude::*;
2731    /// let countries: DataFrame =
2732    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2733    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2734    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2735    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2736    /// assert_eq!(countries.shape(), (5, 4));
2737    ///
2738    /// println!("{}", countries.head(Some(3)));
2739    /// # Ok::<(), PolarsError>(())
2740    /// ```
2741    ///
2742    /// Output:
2743    ///
2744    /// ```text
2745    /// shape: (3, 4)
2746    /// +--------------------+---------------+---------------+------------+
2747    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2748    /// | ---                | ---           | ---           | ---        |
2749    /// | i32                | str           | str           | str        |
2750    /// +====================+===============+===============+============+
2751    /// | 1                  | North America | United States | Washington |
2752    /// +--------------------+---------------+---------------+------------+
2753    /// | 2                  | Asia          | China         | Beijing    |
2754    /// +--------------------+---------------+---------------+------------+
2755    /// | 3                  | Asia          | Japan         | Tokyo      |
2756    /// +--------------------+---------------+---------------+------------+
2757    /// ```
2758    #[must_use]
2759    pub fn head(&self, length: Option<usize>) -> Self {
2760        let col = self
2761            .columns
2762            .iter()
2763            .map(|c| c.head(length))
2764            .collect::<Vec<_>>();
2765
2766        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2767        let height = usize::min(height, self.height());
2768        unsafe { DataFrame::new_no_checks(height, col) }
2769    }
2770
2771    /// Get the tail of the [`DataFrame`].
2772    ///
2773    /// # Example
2774    ///
2775    /// ```rust
2776    /// # use polars_core::prelude::*;
2777    /// let countries: DataFrame =
2778    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2779    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2780    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2781    /// assert_eq!(countries.shape(), (5, 3));
2782    ///
2783    /// println!("{}", countries.tail(Some(2)));
2784    /// # Ok::<(), PolarsError>(())
2785    /// ```
2786    ///
2787    /// Output:
2788    ///
2789    /// ```text
2790    /// shape: (2, 3)
2791    /// +-------------+--------------------+---------+
2792    /// | Rank (2021) | Apple Price (€/kg) | Country |
2793    /// | ---         | ---                | ---     |
2794    /// | i32         | f64                | str     |
2795    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
2799    /// +-------------+--------------------+---------+
2800    /// ```
2801    #[must_use]
2802    pub fn tail(&self, length: Option<usize>) -> Self {
2803        let col = self
2804            .columns
2805            .iter()
2806            .map(|c| c.tail(length))
2807            .collect::<Vec<_>>();
2808
2809        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2810        let height = usize::min(height, self.height());
2811        unsafe { DataFrame::new_no_checks(height, col) }
2812    }
2813
2814    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2815    ///
2816    /// # Panics
2817    ///
2818    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2819    ///
2820    /// This responsibility is left to the caller as we don't want to take mutable references here,
2821    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2822    /// as well.
2823    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2824        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2825        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2826        // as we must allocate arrow strings/binaries.
2827        let must_convert = compat_level.0 == 0;
2828        let parallel = parallel
2829            && must_convert
2830            && self.columns.len() > 1
2831            && self
2832                .columns
2833                .iter()
2834                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2835
2836        RecordBatchIter {
2837            columns: &self.columns,
2838            schema: Arc::new(
2839                self.columns
2840                    .iter()
2841                    .map(|c| c.field().to_arrow(compat_level))
2842                    .collect(),
2843            ),
2844            idx: 0,
2845            n_chunks: self.first_col_n_chunks(),
2846            compat_level,
2847            parallel,
2848        }
2849    }
2850
2851    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2852    ///
2853    /// # Panics
2854    ///
2855    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2856    ///
2857    /// This responsibility is left to the caller as we don't want to take mutable references here,
2858    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2859    /// as well.
2860    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2861        PhysRecordBatchIter {
2862            schema: Arc::new(
2863                self.get_columns()
2864                    .iter()
2865                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2866                    .collect(),
2867            ),
2868            arr_iters: self
2869                .materialized_column_iter()
2870                .map(|s| s.chunks().iter())
2871                .collect(),
2872        }
2873    }
2874
2875    /// Get a [`DataFrame`] with all the columns in reversed order.
2876    #[must_use]
2877    pub fn reverse(&self) -> Self {
2878        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2879        unsafe { DataFrame::new_no_checks(self.height(), col) }
2880    }
2881
2882    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2883    /// with `Nones`.
2884    ///
2885    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2886    #[must_use]
2887    pub fn shift(&self, periods: i64) -> Self {
2888        let col = self._apply_columns_par(&|s| s.shift(periods));
2889        unsafe { DataFrame::new_no_checks(self.height(), col) }
2890    }
2891
2892    /// Replace None values with one of the following strategies:
2893    /// * Forward fill (replace None with the previous value)
2894    /// * Backward fill (replace None with the next value)
2895    /// * Mean fill (replace None with the mean of the whole array)
2896    /// * Min fill (replace None with the minimum of the whole array)
2897    /// * Max fill (replace None with the maximum of the whole array)
2898    ///
2899    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2900    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2901        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2902
2903        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2904    }
2905
2906    /// Pipe different functions/ closure operations that work on a DataFrame together.
2907    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2908    where
2909        F: Fn(DataFrame) -> PolarsResult<B>,
2910    {
2911        f(self)
2912    }
2913
2914    /// Pipe different functions/ closure operations that work on a DataFrame together.
2915    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2916    where
2917        F: Fn(&mut DataFrame) -> PolarsResult<B>,
2918    {
2919        f(self)
2920    }
2921
2922    /// Pipe different functions/ closure operations that work on a DataFrame together.
2923    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2924    where
2925        F: Fn(DataFrame, Args) -> PolarsResult<B>,
2926    {
2927        f(self, args)
2928    }
2929
2930    /// Drop duplicate rows from a [`DataFrame`].
2931    /// *This fails when there is a column of type List in DataFrame*
2932    ///
2933    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2934    ///
2935    /// # Example
2936    ///
2937    /// ```no_run
2938    /// # use polars_core::prelude::*;
2939    /// let df = df! {
2940    ///               "flt" => [1., 1., 2., 2., 3., 3.],
2941    ///               "int" => [1, 1, 2, 2, 3, 3, ],
2942    ///               "str" => ["a", "a", "b", "b", "c", "c"]
2943    ///           }?;
2944    ///
2945    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2946    /// # Ok::<(), PolarsError>(())
2947    /// ```
2948    /// Returns
2949    ///
2950    /// ```text
2951    /// +-----+-----+-----+
2952    /// | flt | int | str |
2953    /// | --- | --- | --- |
2954    /// | f64 | i32 | str |
2955    /// +=====+=====+=====+
2956    /// | 1   | 1   | "a" |
2957    /// +-----+-----+-----+
2958    /// | 2   | 2   | "b" |
2959    /// +-----+-----+-----+
2960    /// | 3   | 3   | "c" |
2961    /// +-----+-----+-----+
2962    /// ```
2963    #[cfg(feature = "algorithm_group_by")]
2964    pub fn unique_stable(
2965        &self,
2966        subset: Option<&[String]>,
2967        keep: UniqueKeepStrategy,
2968        slice: Option<(i64, usize)>,
2969    ) -> PolarsResult<DataFrame> {
2970        self.unique_impl(
2971            true,
2972            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2973            keep,
2974            slice,
2975        )
2976    }
2977
2978    /// Unstable distinct. See [`DataFrame::unique_stable`].
2979    #[cfg(feature = "algorithm_group_by")]
2980    pub fn unique<I, S>(
2981        &self,
2982        subset: Option<&[String]>,
2983        keep: UniqueKeepStrategy,
2984        slice: Option<(i64, usize)>,
2985    ) -> PolarsResult<DataFrame> {
2986        self.unique_impl(
2987            false,
2988            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2989            keep,
2990            slice,
2991        )
2992    }
2993
2994    #[cfg(feature = "algorithm_group_by")]
2995    pub fn unique_impl(
2996        &self,
2997        maintain_order: bool,
2998        subset: Option<Vec<PlSmallStr>>,
2999        keep: UniqueKeepStrategy,
3000        slice: Option<(i64, usize)>,
3001    ) -> PolarsResult<Self> {
3002        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3003        let mut df = self.clone();
3004        // take on multiple chunks is terrible
3005        df.as_single_chunk_par();
3006
3007        let columns = match (keep, maintain_order) {
3008            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3009                let gb = df.group_by_stable(names)?;
3010                let groups = gb.get_groups();
3011                let (offset, len) = slice.unwrap_or((0, groups.len()));
3012                let groups = groups.slice(offset, len);
3013                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3014            },
3015            (UniqueKeepStrategy::Last, true) => {
3016                // maintain order by last values, so the sorted groups are not correct as they
3017                // are sorted by the first value
3018                let gb = df.group_by(names)?;
3019                let groups = gb.get_groups();
3020
3021                let func = |g: GroupsIndicator| match g {
3022                    GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3023                    GroupsIndicator::Slice([first, len]) => first + len - 1,
3024                };
3025
3026                let last_idx: NoNull<IdxCa> = match slice {
3027                    None => groups.iter().map(func).collect(),
3028                    Some((offset, len)) => {
3029                        let (offset, len) = slice_offsets(offset, len, groups.len());
3030                        groups.iter().skip(offset).take(len).map(func).collect()
3031                    },
3032                };
3033
3034                let last_idx = last_idx.sort(false);
3035                return Ok(unsafe { df.take_unchecked(&last_idx) });
3036            },
3037            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3038                let gb = df.group_by(names)?;
3039                let groups = gb.get_groups();
3040                let (offset, len) = slice.unwrap_or((0, groups.len()));
3041                let groups = groups.slice(offset, len);
3042                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3043            },
3044            (UniqueKeepStrategy::Last, false) => {
3045                let gb = df.group_by(names)?;
3046                let groups = gb.get_groups();
3047                let (offset, len) = slice.unwrap_or((0, groups.len()));
3048                let groups = groups.slice(offset, len);
3049                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3050            },
3051            (UniqueKeepStrategy::None, _) => {
3052                let df_part = df.select(names)?;
3053                let mask = df_part.is_unique()?;
3054                let mask = match slice {
3055                    None => mask,
3056                    Some((offset, len)) => mask.slice(offset, len),
3057                };
3058                return df.filter(&mask);
3059            },
3060        };
3061
3062        let height = Self::infer_height(&columns);
3063        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3064    }
3065
3066    /// Get a mask of all the unique rows in the [`DataFrame`].
3067    ///
3068    /// # Example
3069    ///
3070    /// ```no_run
3071    /// # use polars_core::prelude::*;
3072    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3073    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3074    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3075    ///
3076    /// assert!(ca.all());
3077    /// # Ok::<(), PolarsError>(())
3078    /// ```
3079    #[cfg(feature = "algorithm_group_by")]
3080    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3081        let gb = self.group_by(self.get_column_names_owned())?;
3082        let groups = gb.get_groups();
3083        Ok(is_unique_helper(
3084            groups,
3085            self.height() as IdxSize,
3086            true,
3087            false,
3088        ))
3089    }
3090
3091    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3092    ///
3093    /// # Example
3094    ///
3095    /// ```no_run
3096    /// # use polars_core::prelude::*;
3097    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3098    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3099    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3100    ///
3101    /// assert!(!ca.all());
3102    /// # Ok::<(), PolarsError>(())
3103    /// ```
3104    #[cfg(feature = "algorithm_group_by")]
3105    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3106        let gb = self.group_by(self.get_column_names_owned())?;
3107        let groups = gb.get_groups();
3108        Ok(is_unique_helper(
3109            groups,
3110            self.height() as IdxSize,
3111            false,
3112            true,
3113        ))
3114    }
3115
3116    /// Create a new [`DataFrame`] that shows the null counts per column.
3117    #[must_use]
3118    pub fn null_count(&self) -> Self {
3119        let cols = self
3120            .columns
3121            .iter()
3122            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3123            .collect();
3124        unsafe { Self::new_no_checks(1, cols) }
3125    }
3126
3127    /// Hash and combine the row values
3128    #[cfg(feature = "row_hash")]
3129    pub fn hash_rows(
3130        &mut self,
3131        hasher_builder: Option<PlSeedableRandomStateQuality>,
3132    ) -> PolarsResult<UInt64Chunked> {
3133        let dfs = split_df(self, POOL.current_num_threads(), false);
3134        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3135
3136        let mut iter = cas.into_iter();
3137        let mut acc_ca = iter.next().unwrap();
3138        for ca in iter {
3139            acc_ca.append(&ca)?;
3140        }
3141        Ok(acc_ca.rechunk().into_owned())
3142    }
3143
3144    /// Get the supertype of the columns in this DataFrame
3145    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3146        self.columns
3147            .iter()
3148            .map(|s| Ok(s.dtype().clone()))
3149            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3150    }
3151
3152    /// Take by index values given by the slice `idx`.
3153    /// # Warning
3154    /// Be careful with allowing threads when calling this in a large hot loop
3155    /// every thread split may be on rayon stack and lead to SO
3156    #[doc(hidden)]
3157    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3158        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3159    }
3160
3161    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3162    /// if the index value in `idx` are sorted. This will maintain sorted flags.
3163    ///
3164    /// # Warning
3165    /// Be careful with allowing threads when calling this in a large hot loop
3166    /// every thread split may be on rayon stack and lead to SO
3167    #[doc(hidden)]
3168    pub unsafe fn _take_unchecked_slice_sorted(
3169        &self,
3170        idx: &[IdxSize],
3171        allow_threads: bool,
3172        sorted: IsSorted,
3173    ) -> Self {
3174        #[cfg(debug_assertions)]
3175        {
3176            if idx.len() > 2 {
3177                match sorted {
3178                    IsSorted::Ascending => {
3179                        assert!(idx[0] <= idx[idx.len() - 1]);
3180                    },
3181                    IsSorted::Descending => {
3182                        assert!(idx[0] >= idx[idx.len() - 1]);
3183                    },
3184                    _ => {},
3185                }
3186            }
3187        }
3188        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3189        ca.set_sorted_flag(sorted);
3190        self.take_unchecked_impl(&ca, allow_threads)
3191    }
3192
    /// Split the frame into one [`DataFrame`] per group of the key columns `cols`.
    ///
    /// * `cols` - names of the key columns to group on.
    /// * `stable` - forwarded to `group_by_with_series`.
    /// * `include_key` - when `false`, the key columns are dropped from the partitions.
    /// * `parallel` - forwarded to `group_by_with_series`; also selects whether the partitions
    ///   are built with a parallel iterator inside `POOL` or sequentially.
    ///
    /// Fails if selecting the key columns or grouping fails.
    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
    #[doc(hidden)]
    pub fn _partition_by_impl(
        &self,
        cols: &[PlSmallStr],
        stable: bool,
        include_key: bool,
        parallel: bool,
    ) -> PolarsResult<Vec<DataFrame>> {
        let selected_keys = self.select_columns(cols.iter().cloned())?;
        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
        let groups = groups.take_groups();

        // drop key columns prior to calculation if requested
        let df = if include_key {
            self.clone()
        } else {
            self.drop_many(cols.iter().cloned())
        };

        if parallel {
            // Parallelize over the groups only: every gather below is called with
            // `allow_threads = false`.
            // there is a lot of parallelization in take and this may easily SO
            POOL.install(|| {
                match groups.as_ref() {
                    GroupsType::Idx(idx) => {
                        // Rechunk as the gather may rechunk for every group #17562.
                        let mut df = df.clone();
                        df.as_single_chunk_par();
                        Ok(idx
                            .into_par_iter()
                            .map(|(_, group)| {
                                // groups are in bounds
                                unsafe {
                                    df._take_unchecked_slice_sorted(
                                        group,
                                        false,
                                        IsSorted::Ascending,
                                    )
                                }
                            })
                            .collect())
                    },
                    // Slice groups are `[first, len]` windows, so a plain slice suffices.
                    GroupsType::Slice { groups, .. } => Ok(groups
                        .into_par_iter()
                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
                        .collect()),
                }
            })
        } else {
            // Sequential variant of the branch above.
            match groups.as_ref() {
                GroupsType::Idx(idx) => {
                    // Rechunk as the gather may rechunk for every group #17562.
                    let mut df = df.clone();
                    df.as_single_chunk();
                    Ok(idx
                        .into_iter()
                        .map(|(_, group)| {
                            // groups are in bounds
                            unsafe {
                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
                            }
                        })
                        .collect())
                },
                // Slice groups are `[first, len]` windows, so a plain slice suffices.
                GroupsType::Slice { groups, .. } => Ok(groups
                    .iter()
                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
                    .collect()),
            }
        }
    }
3265
3266    /// Split into multiple DataFrames partitioned by groups
3267    #[cfg(feature = "partition_by")]
3268    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3269    where
3270        I: IntoIterator<Item = S>,
3271        S: Into<PlSmallStr>,
3272    {
3273        let cols = cols
3274            .into_iter()
3275            .map(Into::into)
3276            .collect::<Vec<PlSmallStr>>();
3277        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3278    }
3279
3280    /// Split into multiple DataFrames partitioned by groups
3281    /// Order of the groups are maintained.
3282    #[cfg(feature = "partition_by")]
3283    pub fn partition_by_stable<I, S>(
3284        &self,
3285        cols: I,
3286        include_key: bool,
3287    ) -> PolarsResult<Vec<DataFrame>>
3288    where
3289        I: IntoIterator<Item = S>,
3290        S: Into<PlSmallStr>,
3291    {
3292        let cols = cols
3293            .into_iter()
3294            .map(Into::into)
3295            .collect::<Vec<PlSmallStr>>();
3296        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3297    }
3298
3299    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3300    /// inserted as columns.
3301    #[cfg(feature = "dtype-struct")]
3302    pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3303        let cols = cols.into_vec();
3304        self.unnest_impl(cols.into_iter().collect())
3305    }
3306
3307    #[cfg(feature = "dtype-struct")]
3308    fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3309        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3310        let mut count = 0;
3311        for s in &self.columns {
3312            if cols.contains(s.name()) {
3313                let ca = s.struct_()?.clone();
3314                new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3315                count += 1;
3316            } else {
3317                new_cols.push(s.clone())
3318            }
3319        }
3320        if count != cols.len() {
3321            // one or more columns not found
3322            // the code below will return an error with the missing name
3323            let schema = self.schema();
3324            for col in cols {
3325                let _ = schema
3326                    .get(col.as_str())
3327                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3328            }
3329        }
3330        DataFrame::new(new_cols)
3331    }
3332
3333    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3334        cols.first().map_or(0, Column::len)
3335    }
3336
3337    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3338        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3339        // append_chunk or something like this. It is just quite difficult to make that safe.
3340        let df = DataFrame::from(rb);
3341        polars_ensure!(
3342            self.schema() == df.schema(),
3343            SchemaMismatch: "cannot append record batch with different schema",
3344        );
3345        self.vstack_mut_owned_unchecked(df);
3346        Ok(())
3347    }
3348}
3349
3350pub struct RecordBatchIter<'a> {
3351    columns: &'a Vec<Column>,
3352    schema: ArrowSchemaRef,
3353    idx: usize,
3354    n_chunks: usize,
3355    compat_level: CompatLevel,
3356    parallel: bool,
3357}
3358
3359impl Iterator for RecordBatchIter<'_> {
3360    type Item = RecordBatch;
3361
3362    fn next(&mut self) -> Option<Self::Item> {
3363        if self.idx >= self.n_chunks {
3364            return None;
3365        }
3366
3367        // Create a batch of the columns with the same chunk no.
3368        let batch_cols: Vec<ArrayRef> = if self.parallel {
3369            let iter = self
3370                .columns
3371                .par_iter()
3372                .map(Column::as_materialized_series)
3373                .map(|s| s.to_arrow(self.idx, self.compat_level));
3374            POOL.install(|| iter.collect())
3375        } else {
3376            self.columns
3377                .iter()
3378                .map(Column::as_materialized_series)
3379                .map(|s| s.to_arrow(self.idx, self.compat_level))
3380                .collect()
3381        };
3382        self.idx += 1;
3383
3384        let length = batch_cols.first().map_or(0, |arr| arr.len());
3385        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3386    }
3387
3388    fn size_hint(&self) -> (usize, Option<usize>) {
3389        let n = self.n_chunks - self.idx;
3390        (n, Some(n))
3391    }
3392}
3393
3394pub struct PhysRecordBatchIter<'a> {
3395    schema: ArrowSchemaRef,
3396    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3397}
3398
3399impl Iterator for PhysRecordBatchIter<'_> {
3400    type Item = RecordBatch;
3401
3402    fn next(&mut self) -> Option<Self::Item> {
3403        let arrs = self
3404            .arr_iters
3405            .iter_mut()
3406            .map(|phys_iter| phys_iter.next().cloned())
3407            .collect::<Option<Vec<_>>>()?;
3408
3409        let length = arrs.first().map_or(0, |arr| arr.len());
3410        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3411    }
3412
3413    fn size_hint(&self) -> (usize, Option<usize>) {
3414        if let Some(iter) = self.arr_iters.first() {
3415            iter.size_hint()
3416        } else {
3417            (0, None)
3418        }
3419    }
3420}
3421
3422impl Default for DataFrame {
3423    fn default() -> Self {
3424        DataFrame::empty()
3425    }
3426}
3427
3428impl From<DataFrame> for Vec<Column> {
3429    fn from(df: DataFrame) -> Self {
3430        df.columns
3431    }
3432}
3433
3434// utility to test if we can vstack/extend the columns
3435fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3436    polars_ensure!(
3437        left.name() == right.name(),
3438        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3439        left.name(), right.name(),
3440    );
3441    Ok(())
3442}
3443
3444#[cfg(test)]
3445mod test {
3446    use super::*;
3447
3448    fn create_frame() -> DataFrame {
3449        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3450        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3451        DataFrame::new(vec![s0, s1]).unwrap()
3452    }
3453
3454    #[test]
3455    #[cfg_attr(miri, ignore)]
3456    fn test_recordbatch_iterator() {
3457        let df = df!(
3458            "foo" => [1, 2, 3, 4, 5]
3459        )
3460        .unwrap();
3461        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3462        assert_eq!(5, iter.next().unwrap().len());
3463        assert!(iter.next().is_none());
3464    }
3465
3466    #[test]
3467    #[cfg_attr(miri, ignore)]
3468    fn test_select() {
3469        let df = create_frame();
3470        assert_eq!(
3471            df.column("days")
3472                .unwrap()
3473                .as_series()
3474                .unwrap()
3475                .equal(1)
3476                .unwrap()
3477                .sum(),
3478            Some(1)
3479        );
3480    }
3481
3482    #[test]
3483    #[cfg_attr(miri, ignore)]
3484    fn test_filter_broadcast_on_string_col() {
3485        let col_name = "some_col";
3486        let v = vec!["test".to_string()];
3487        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3488        let mut df = DataFrame::new(vec![s0]).unwrap();
3489
3490        df = df
3491            .filter(
3492                &df.column(col_name)
3493                    .unwrap()
3494                    .as_materialized_series()
3495                    .equal("")
3496                    .unwrap(),
3497            )
3498            .unwrap();
3499        assert_eq!(
3500            df.column(col_name)
3501                .unwrap()
3502                .as_materialized_series()
3503                .n_chunks(),
3504            1
3505        );
3506    }
3507
3508    #[test]
3509    #[cfg_attr(miri, ignore)]
3510    fn test_filter_broadcast_on_list_col() {
3511        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3512        let ll: ListChunked = [&s1].iter().copied().collect();
3513
3514        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3515        let new = ll.filter(&mask).unwrap();
3516
3517        assert_eq!(new.chunks.len(), 1);
3518        assert_eq!(new.len(), 0);
3519    }
3520
3521    #[test]
3522    fn slice() {
3523        let df = create_frame();
3524        let sliced_df = df.slice(0, 2);
3525        assert_eq!(sliced_df.shape(), (2, 2));
3526    }
3527
3528    #[test]
3529    fn rechunk_false() {
3530        let df = create_frame();
3531        assert!(!df.should_rechunk())
3532    }
3533
3534    #[test]
3535    fn rechunk_true() -> PolarsResult<()> {
3536        let mut base = df!(
3537            "a" => [1, 2, 3],
3538            "b" => [1, 2, 3]
3539        )?;
3540
3541        // Create a series with multiple chunks
3542        let mut s = Series::new("foo".into(), 0..2);
3543        let s2 = Series::new("bar".into(), 0..1);
3544        s.append(&s2)?;
3545
3546        // Append series to frame
3547        let out = base.with_column(s)?;
3548
3549        // Now we should rechunk
3550        assert!(out.should_rechunk());
3551        Ok(())
3552    }
3553
3554    #[test]
3555    fn test_duplicate_column() {
3556        let mut df = df! {
3557            "foo" => [1, 2, 3]
3558        }
3559        .unwrap();
3560        // check if column is replaced
3561        assert!(
3562            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3563                .is_ok()
3564        );
3565        assert!(
3566            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3567                .is_ok()
3568        );
3569        assert!(df.column("bar").is_ok())
3570    }
3571
3572    #[test]
3573    #[cfg_attr(miri, ignore)]
3574    fn distinct() {
3575        let df = df! {
3576            "flt" => [1., 1., 2., 2., 3., 3.],
3577            "int" => [1, 1, 2, 2, 3, 3, ],
3578            "str" => ["a", "a", "b", "b", "c", "c"]
3579        }
3580        .unwrap();
3581        let df = df
3582            .unique_stable(None, UniqueKeepStrategy::First, None)
3583            .unwrap()
3584            .sort(["flt"], SortMultipleOptions::default())
3585            .unwrap();
3586        let valid = df! {
3587            "flt" => [1., 2., 3.],
3588            "int" => [1, 2, 3],
3589            "str" => ["a", "b", "c"]
3590        }
3591        .unwrap();
3592        assert!(df.equals(&valid));
3593    }
3594
3595    #[test]
3596    fn test_vstack() {
3597        // check that it does not accidentally rechunks
3598        let mut df = df! {
3599            "flt" => [1., 1., 2., 2., 3., 3.],
3600            "int" => [1, 1, 2, 2, 3, 3, ],
3601            "str" => ["a", "a", "b", "b", "c", "c"]
3602        }
3603        .unwrap();
3604
3605        df.vstack_mut(&df.slice(0, 3)).unwrap();
3606        assert_eq!(df.first_col_n_chunks(), 2)
3607    }
3608
3609    #[test]
3610    fn test_vstack_on_empty_dataframe() {
3611        let mut df = DataFrame::empty();
3612
3613        let df_data = df! {
3614            "flt" => [1., 1., 2., 2., 3., 3.],
3615            "int" => [1, 1, 2, 2, 3, 3, ],
3616            "str" => ["a", "a", "b", "b", "c", "c"]
3617        }
3618        .unwrap();
3619
3620        df.vstack_mut(&df_data).unwrap();
3621        assert_eq!(df.height, 6)
3622    }
3623
3624    #[test]
3625    fn test_replace_or_add() -> PolarsResult<()> {
3626        let mut df = df!(
3627            "a" => [1, 2, 3],
3628            "b" => [1, 2, 3]
3629        )?;
3630
3631        // check that the new column is "c" and not "bar".
3632        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3633
3634        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3635        Ok(())
3636    }
3637}