polars_core/frame/
mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
/// Strategy describing which row to keep among rows that are considered duplicates.
///
/// With the `serde` feature the enum is (de)serializable; `IntoStaticStr` renders the
/// variant names in `snake_case`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    // NOTE(review): presumably this drops every row that has a duplicate -- confirm with the
    // consumers of this enum.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations and is the default strategy.
    #[default]
    Any,
}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68    F: for<'a> FnMut(&'a T) -> &'a str,
69{
70    // Always unique.
71    if items.len() <= 1 {
72        return Ok(());
73    }
74
75    if items.len() <= 4 {
76        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
77        for i in 0..items.len() - 1 {
78            let name = get_name(&items[i]);
79            for other in items.iter().skip(i + 1) {
80                if name == get_name(other) {
81                    polars_bail!(duplicate = name);
82                }
83            }
84        }
85    } else {
86        let mut names = PlHashSet::with_capacity(items.len());
87        for item in items {
88            let name = get_name(item);
89            if !names.insert(name) {
90                polars_bail!(duplicate = name);
91            }
92        }
93    }
94    Ok(())
95}
96
/// A contiguous growable collection of `Series` that have the same length.
///
/// ## Use declarations
///
/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
///
/// ```rust
/// use polars_core::prelude::*; // if the crate polars-core is used directly
/// // use polars::prelude::*;      if the crate polars is used
/// ```
///
/// # Initialization
/// ## Default
///
/// A `DataFrame` can be initialized empty:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = DataFrame::default();
/// assert!(df.is_empty());
/// ```
///
/// ## Wrapping a `Vec<Series>`
///
/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
///
/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
/// ```
///
/// ## Using a macro
///
/// The [`df!`] macro is a convenient method:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
///                                       "Color" => ["Red", "Yellow", "Green"]);
/// ```
///
/// ## Using a CSV file
///
/// See the `polars_io::csv::CsvReader`.
///
/// # Indexing
/// ## By a number
///
/// The `Index<usize>` is implemented for the `DataFrame`.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
///
/// ## By a `Series` name
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
#[derive(Clone)]
pub struct DataFrame {
    // Number of rows. Stored explicitly so that it is known even when there are no columns.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between creating the schema and reading it.
    cached_schema: OnceLock<SchemaRef>,
}
181
182impl DataFrame {
183    pub fn clear_schema(&mut self) {
184        self.cached_schema = OnceLock::new();
185    }
186
187    #[inline]
188    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189        self.columns.iter()
190    }
191
192    #[inline]
193    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194        self.columns.iter().map(Column::as_materialized_series)
195    }
196
197    #[inline]
198    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199        self.columns.par_iter().map(Column::as_materialized_series)
200    }
201
202    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203    ///
204    /// # Implementation
205    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
206    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208    ///
209    /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
210    /// However, this function will yield a smaller number. This is because this function returns
211    /// the visible size of the buffer, not its total capacity.
212    ///
213    /// FFI buffers are included in this estimation.
214    pub fn estimated_size(&self) -> usize {
215        self.columns.iter().map(Column::estimated_size).sum()
216    }
217
218    // Reduce monomorphization.
219    fn try_apply_columns(
220        &self,
221        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222    ) -> PolarsResult<Vec<Column>> {
223        self.columns.iter().map(func).collect()
224    }
225    // Reduce monomorphization.
226    pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227        self.columns.iter().map(func).collect()
228    }
229    // Reduce monomorphization.
230    fn try_apply_columns_par(
231        &self,
232        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233    ) -> PolarsResult<Vec<Column>> {
234        POOL.install(|| self.columns.par_iter().map(func).collect())
235    }
236    // Reduce monomorphization.
237    pub fn _apply_columns_par(
238        &self,
239        func: &(dyn Fn(&Column) -> Column + Send + Sync),
240    ) -> Vec<Column> {
241        POOL.install(|| self.columns.par_iter().map(func).collect())
242    }
243
244    /// Get the index of the column.
245    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246        self.get_column_index(name)
247            .ok_or_else(|| polars_err!(col_not_found = name))
248    }
249
250    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251        polars_ensure!(
252            self.columns.iter().all(|s| s.name().as_str() != name),
253            Duplicate: "column with name {:?} is already present in the DataFrame", name
254        );
255        Ok(())
256    }
257
258    /// Reserve additional slots into the chunks of the series.
259    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260        for s in &mut self.columns {
261            if let Column::Series(s) = s {
262                // SAFETY:
263                // do not modify the data, simply resize.
264                unsafe { s.chunks_mut().reserve(additional) }
265            }
266        }
267    }
268
269    /// Create a DataFrame from a Vector of Series.
270    ///
271    /// Errors if a column names are not unique, or if heights are not all equal.
272    ///
273    /// # Example
274    ///
275    /// ```
276    /// # use polars_core::prelude::*;
277    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279    ///
280    /// let df = DataFrame::new(vec![s0, s1])?;
281    /// # Ok::<(), PolarsError>(())
282    /// ```
283    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284        DataFrame::validate_columns_slice(&columns)
285            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287    }
288
289    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290        for col in &columns {
291            polars_ensure!(
292                col.len() == height,
293                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294                columns[0].name(), height, col.name(), col.len()
295            );
296        }
297
298        Ok(DataFrame {
299            height,
300            columns,
301            cached_schema: OnceLock::new(),
302        })
303    }
304
305    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306    /// columns to match the other columns.
307    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308        // The length of the longest non-unit length column determines the
309        // broadcast length. If all columns are unit-length the broadcast length
310        // is one.
311        let broadcast_len = columns
312            .iter()
313            .map(|s| s.len())
314            .filter(|l| *l != 1)
315            .max()
316            .unwrap_or(1);
317        Self::new_with_broadcast_len(columns, broadcast_len)
318    }
319
320    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321    /// columns to broadcast_len.
322    pub fn new_with_broadcast_len(
323        columns: Vec<Column>,
324        broadcast_len: usize,
325    ) -> PolarsResult<Self> {
326        ensure_names_unique(&columns, |s| s.name().as_str())?;
327        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328    }
329
    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
    /// columns to `broadcast_len`.
    ///
    /// The resulting height is `broadcast_len`, or `0` when `columns` is empty.
    ///
    /// # Errors
    /// Returns a `ShapeMismatch` error if a column has a length that is neither
    /// `broadcast_len` nor `1`.
    ///
    /// # Safety
    /// Does not check that the column names are unique (which they must be).
    pub unsafe fn new_with_broadcast_no_namecheck(
        mut columns: Vec<Column>,
        broadcast_len: usize,
    ) -> PolarsResult<Self> {
        for col in &mut columns {
            // Length not equal to the broadcast len, needs broadcast or is an error.
            let len = col.len();
            if len != broadcast_len {
                if len != 1 {
                    let name = col.name().to_owned();
                    // If some column does have the broadcast length, mention it in the error
                    // so the user can see which column the offender was compared against.
                    let extra_info =
                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
                            format!(" (matching column '{}')", c.name())
                        } else {
                            String::new()
                        };
                    polars_bail!(
                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
                    );
                }
                // Unit-length column: repeat its single value `broadcast_len` times.
                *col = col.new_from_index(0, broadcast_len);
            }
        }

        // Without any column there is nothing that carries the broadcast length.
        let length = if columns.is_empty() { 0 } else { broadcast_len };

        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
    }
363
364    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
365    ///
366    /// # Example
367    ///
368    /// ```rust
369    /// use polars_core::prelude::DataFrame;
370    /// static EMPTY: DataFrame = DataFrame::empty();
371    /// ```
372    pub const fn empty() -> Self {
373        Self::empty_with_height(0)
374    }
375
376    /// Creates an empty `DataFrame` with a specific `height`.
377    pub const fn empty_with_height(height: usize) -> Self {
378        DataFrame {
379            height,
380            columns: vec![],
381            cached_schema: OnceLock::new(),
382        }
383    }
384
385    /// Create an empty `DataFrame` with empty columns as per the `schema`.
386    pub fn empty_with_schema(schema: &Schema) -> Self {
387        let cols = schema
388            .iter()
389            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390            .collect();
391        unsafe { DataFrame::new_no_checks(0, cols) }
392    }
393
394    /// Create an empty `DataFrame` with empty columns as per the `schema`.
395    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396        let cols = schema
397            .iter_values()
398            .map(|fld| {
399                Column::from(Series::new_empty(
400                    fld.name.clone(),
401                    &(DataType::from_arrow_field(fld)),
402                ))
403            })
404            .collect();
405        unsafe { DataFrame::new_no_checks(0, cols) }
406    }
407
408    /// Create a new `DataFrame` with the given schema, only containing nulls.
409    pub fn full_null(schema: &Schema, height: usize) -> Self {
410        let columns = schema
411            .iter_fields()
412            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413            .collect();
414        unsafe { DataFrame::new_no_checks(height, columns) }
415    }
416
417    /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
418    ///
419    /// # Example
420    ///
421    /// ```rust
422    /// # use polars_core::prelude::*;
423    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
424    /// let s2 = Column::new("Area (kmĀ²)".into(), [106_460_000, 70_560_000]);
425    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
426    ///
427    /// assert_eq!(df.pop(), Some(s2));
428    /// assert_eq!(df.pop(), Some(s1));
429    /// assert_eq!(df.pop(), None);
430    /// assert!(df.is_empty());
431    /// # Ok::<(), PolarsError>(())
432    /// ```
433    pub fn pop(&mut self) -> Option<Column> {
434        self.clear_schema();
435
436        self.columns.pop()
437    }
438
439    /// Add a new column at index 0 that counts the rows.
440    ///
441    /// # Example
442    ///
443    /// ```
444    /// # use polars_core::prelude::*;
445    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446    /// assert_eq!(df1.shape(), (4, 1));
447    ///
448    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449    /// assert_eq!(df2.shape(), (4, 2));
450    /// println!("{}", df2);
451    ///
452    /// # Ok::<(), PolarsError>(())
453    /// ```
454    ///
455    /// Output:
456    ///
457    /// ```text
458    ///  shape: (4, 2)
459    ///  +-----+----------+
460    ///  | Id  | Name     |
461    ///  | --- | ---      |
462    ///  | u32 | str      |
463    ///  +=====+==========+
464    ///  | 0   | James    |
465    ///  +-----+----------+
466    ///  | 1   | Mary     |
467    ///  +-----+----------+
468    ///  | 2   | John     |
469    ///  +-----+----------+
470    ///  | 3   | Patricia |
471    ///  +-----+----------+
472    /// ```
473    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474        let mut columns = Vec::with_capacity(self.columns.len() + 1);
475        let offset = offset.unwrap_or(0);
476
477        let col = Column::new_row_index(name, offset, self.height())?;
478        columns.push(col);
479        columns.extend_from_slice(&self.columns);
480        DataFrame::new(columns)
481    }
482
483    /// Add a row index column in place.
484    ///
485    /// # Safety
486    /// The caller should ensure the DataFrame does not already contain a column with the given name.
487    ///
488    /// # Panics
489    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
490    pub unsafe fn with_row_index_mut(
491        &mut self,
492        name: PlSmallStr,
493        offset: Option<IdxSize>,
494    ) -> &mut Self {
495        // TODO: Make this function unsafe
496        debug_assert!(
497            self.columns.iter().all(|c| c.name() != &name),
498            "with_row_index_mut(): column with name {} already exists",
499            &name
500        );
501
502        let offset = offset.unwrap_or(0);
503        let col = Column::new_row_index(name, offset, self.height()).unwrap();
504
505        self.clear_schema();
506        self.columns.insert(0, col);
507        self
508    }
509
510    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
511    /// `Series`.
512    ///
513    /// Calculates the height from the first column or `0` if no columns are given.
514    ///
515    /// # Safety
516    ///
517    /// It is the callers responsibility to uphold the contract of all `Series`
518    /// having an equal length and a unique name, if not this may panic down the line.
519    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
520        let height = columns.first().map_or(0, Column::len);
521        unsafe { Self::new_no_checks(height, columns) }
522    }
523
524    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
525    /// `Series`.
526    ///
527    /// It is advised to use [DataFrame::new] in favor of this method.
528    ///
529    /// # Safety
530    ///
531    /// It is the callers responsibility to uphold the contract of all `Series`
532    /// having an equal length and a unique name, if not this may panic down the line.
533    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
534        if cfg!(debug_assertions) {
535            DataFrame::validate_columns_slice(&columns).unwrap();
536        }
537
538        unsafe { Self::_new_no_checks_impl(height, columns) }
539    }
540
541    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
542    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
543    /// constructed with this method is generally highly unsafe and should not be long-lived.
544    #[allow(clippy::missing_safety_doc)]
545    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
546        DataFrame {
547            height,
548            columns,
549            cached_schema: OnceLock::new(),
550        }
551    }
552
553    /// Shrink the capacity of this DataFrame to fit its length.
554    pub fn shrink_to_fit(&mut self) {
555        // Don't parallelize this. Memory overhead
556        for s in &mut self.columns {
557            s.shrink_to_fit();
558        }
559    }
560
561    /// Aggregate all the chunks in the DataFrame to a single chunk.
562    pub fn as_single_chunk(&mut self) -> &mut Self {
563        // Don't parallelize this. Memory overhead
564        for s in &mut self.columns {
565            if let Column::Series(s) = s {
566                *s = s.rechunk().into();
567            }
568        }
569        self
570    }
571
572    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
573    /// This may lead to more peak memory consumption.
574    pub fn as_single_chunk_par(&mut self) -> &mut Self {
575        if self.columns.iter().any(|c| c.n_chunks() > 1) {
576            self.columns = self._apply_columns_par(&|s| s.rechunk());
577        }
578        self
579    }
580
581    /// Rechunks all columns to only have a single chunk.
582    pub fn rechunk_mut(&mut self) {
583        // SAFETY: We never adjust the length or names of the columns.
584        let columns = unsafe { self.get_columns_mut() };
585
586        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
587            *col = col.rechunk();
588        }
589    }
590
591    pub fn _deshare_views_mut(&mut self) {
592        // SAFETY: We never adjust the length or names of the columns.
593        unsafe {
594            let columns = self.get_columns_mut();
595            for col in columns {
596                let Column::Series(s) = col else { continue };
597
598                if let Ok(ca) = s.binary() {
599                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
600                    *col = Column::from(gc_ca.into_series());
601                } else if let Ok(ca) = s.str() {
602                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
603                    *col = Column::from(gc_ca.into_series());
604                }
605            }
606        }
607    }
608
609    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
610    pub fn rechunk_to_record_batch(
611        self,
612        compat_level: CompatLevel,
613    ) -> RecordBatchT<Box<dyn Array>> {
614        let height = self.height();
615
616        let (schema, arrays) = self
617            .columns
618            .into_iter()
619            .map(|col| {
620                let mut series = col.take_materialized_series();
621                // Rechunk to one chunk if necessary
622                if series.n_chunks() > 1 {
623                    series = series.rechunk();
624                }
625                (
626                    series.field().to_arrow(compat_level),
627                    series.to_arrow(0, compat_level),
628                )
629            })
630            .collect();
631
632        RecordBatchT::new(height, Arc::new(schema), arrays)
633    }
634
    /// Returns true if the chunks of the columns do not align and re-chunking should be done.
    ///
    /// `true` is returned when
    /// - the `Series`-backed columns do not all report the same number of chunks, or
    /// - the first column reports more chunks than the frame has rows, or
    /// - a chunk length of some column differs from the chunk length at the same position in
    ///   the first column.
    ///
    /// A frame without columns never needs rechunking.
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns at all.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                // NOTE(review): `size_hint().0` is used as the chunk count here; this assumes
                // the lower bound reported by `chunk_lengths()` is exact -- confirm.
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare every remaining column's chunk lengths, position by position,
                // against the lengths of the first column.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
675
676    /// Ensure all the chunks in the [`DataFrame`] are aligned.
677    pub fn align_chunks_par(&mut self) -> &mut Self {
678        if self.should_rechunk() {
679            self.as_single_chunk_par()
680        } else {
681            self
682        }
683    }
684
685    pub fn align_chunks(&mut self) -> &mut Self {
686        if self.should_rechunk() {
687            self.as_single_chunk()
688        } else {
689            self
690        }
691    }
692
693    /// Get the [`DataFrame`] schema.
694    ///
695    /// # Example
696    ///
697    /// ```rust
698    /// # use polars_core::prelude::*;
699    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
700    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
701    ///
702    /// let f1: Field = Field::new("Thing".into(), DataType::String);
703    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
704    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
705    ///
706    /// assert_eq!(&**df.schema(), &sc);
707    /// # Ok::<(), PolarsError>(())
708    /// ```
709    pub fn schema(&self) -> &SchemaRef {
710        let out = self.cached_schema.get_or_init(|| {
711            Arc::new(
712                self.columns
713                    .iter()
714                    .map(|x| (x.name().clone(), x.dtype().clone()))
715                    .collect(),
716            )
717        });
718
719        debug_assert_eq!(out.len(), self.width());
720
721        out
722    }
723
724    /// Get a reference to the [`DataFrame`] columns.
725    ///
726    /// # Example
727    ///
728    /// ```rust
729    /// # use polars_core::prelude::*;
730    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
731    ///                         "Symbol" => ["A", "C", "G", "T"])?;
732    /// let columns: &[Column] = df.get_columns();
733    ///
734    /// assert_eq!(columns[0].name(), "Name");
735    /// assert_eq!(columns[1].name(), "Symbol");
736    /// # Ok::<(), PolarsError>(())
737    /// ```
738    #[inline]
739    pub fn get_columns(&self) -> &[Column] {
740        &self.columns
741    }
742
743    #[inline]
744    /// Get mutable access to the underlying columns.
745    ///
746    /// # Safety
747    ///
748    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
749    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
750    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
751    /// calling [`DataFrame::clear_schema`].
752    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
753        &mut self.columns
754    }
755
756    #[inline]
757    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
758    pub fn clear_columns(&mut self) {
759        unsafe { self.get_columns_mut() }.clear();
760        self.clear_schema();
761    }
762
763    #[inline]
764    /// Extend the columns without checking for name collisions or height.
765    ///
766    /// # Safety
767    ///
768    /// The caller needs to ensure that:
769    /// - Column names are unique within the resulting [`DataFrame`].
770    /// - The length of each appended column matches the height of the [`DataFrame`]. For
771    ///   `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
772    ///   with [`DataFrame::set_height`].
773    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
774        unsafe { self.get_columns_mut() }.extend(iter);
775        self.clear_schema();
776    }
777
778    /// Take ownership of the underlying columns vec.
779    pub fn take_columns(self) -> Vec<Column> {
780        self.columns
781    }
782
783    /// Iterator over the columns as [`Series`].
784    ///
785    /// # Example
786    ///
787    /// ```rust
788    /// # use polars_core::prelude::*;
789    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
790    /// let s2 = Column::new("Formula".into(), ["aĀ²+bĀ²=cĀ²", "H=-Ī£[P(x)log|P(x)|]"]);
791    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
792    ///
793    /// let mut iterator = df.iter();
794    ///
795    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
796    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
797    /// assert_eq!(iterator.next(), None);
798    /// # Ok::<(), PolarsError>(())
799    /// ```
800    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
801        self.materialized_column_iter()
802    }
803
804    /// # Example
805    ///
806    /// ```rust
807    /// # use polars_core::prelude::*;
808    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
809    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
810    ///
811    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
812    /// # Ok::<(), PolarsError>(())
813    /// ```
814    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
815        self.columns.iter().map(|s| s.name()).collect()
816    }
817
818    /// Get the [`Vec<PlSmallStr>`] representing the column names.
819    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
820        self.columns.iter().map(|s| s.name().clone()).collect()
821    }
822
823    pub fn get_column_names_str(&self) -> Vec<&str> {
824        self.columns.iter().map(|s| s.name().as_str()).collect()
825    }
826
827    /// Set the column names.
828    /// # Example
829    ///
830    /// ```rust
831    /// # use polars_core::prelude::*;
832    /// let mut df: DataFrame = df!("Mathematical set" => ["ā„•", "ā„¤", "š”»", "ā„š", "ā„", "ā„‚"])?;
833    /// df.set_column_names(["Set"])?;
834    ///
835    /// assert_eq!(df.get_column_names(), &["Set"]);
836    /// # Ok::<(), PolarsError>(())
837    /// ```
838    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
839    where
840        I: IntoIterator<Item = S>,
841        S: Into<PlSmallStr>,
842    {
843        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
844        self._set_column_names_impl(names.as_slice())
845    }
846
847    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
848        polars_ensure!(
849            names.len() == self.width(),
850            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
851            names.len(), self.width()
852        );
853        ensure_names_unique(names, |s| s.as_str())?;
854
855        let columns = mem::take(&mut self.columns);
856        self.columns = columns
857            .into_iter()
858            .zip(names)
859            .map(|(s, name)| {
860                let mut s = s;
861                s.rename(name.clone());
862                s
863            })
864            .collect();
865        self.clear_schema();
866        Ok(())
867    }
868
869    /// Get the data types of the columns in the [`DataFrame`].
870    ///
871    /// # Example
872    ///
873    /// ```rust
874    /// # use polars_core::prelude::*;
875    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
876    ///                                "Fraction" => [0.965, 0.035])?;
877    ///
878    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
879    /// # Ok::<(), PolarsError>(())
880    /// ```
881    pub fn dtypes(&self) -> Vec<DataType> {
882        self.columns.iter().map(|s| s.dtype().clone()).collect()
883    }
884
885    pub(crate) fn first_series_column(&self) -> Option<&Series> {
886        self.columns.iter().find_map(|col| col.as_series())
887    }
888
889    /// The number of chunks for the first column.
890    pub fn first_col_n_chunks(&self) -> usize {
891        match self.first_series_column() {
892            None if self.columns.is_empty() => 0,
893            None => 1,
894            Some(s) => s.n_chunks(),
895        }
896    }
897
898    /// The highest number of chunks for any column.
899    pub fn max_n_chunks(&self) -> usize {
900        self.columns
901            .iter()
902            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
903            .max()
904            .unwrap_or(0)
905    }
906
907    /// Get a reference to the schema fields of the [`DataFrame`].
908    ///
909    /// # Example
910    ///
911    /// ```rust
912    /// # use polars_core::prelude::*;
913    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
914    ///                            "Fraction" => [0.708, 0.292])?;
915    ///
916    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
917    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
918    ///
919    /// assert_eq!(earth.fields(), &[f1, f2]);
920    /// # Ok::<(), PolarsError>(())
921    /// ```
922    pub fn fields(&self) -> Vec<Field> {
923        self.columns
924            .iter()
925            .map(|s| s.field().into_owned())
926            .collect()
927    }
928
929    /// Get (height, width) of the [`DataFrame`].
930    ///
931    /// # Example
932    ///
933    /// ```rust
934    /// # use polars_core::prelude::*;
935    /// let df0: DataFrame = DataFrame::default();
936    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
937    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
938    ///                          "2" => [1, 2, 3, 4, 5])?;
939    ///
940    /// assert_eq!(df0.shape(), (0 ,0));
941    /// assert_eq!(df1.shape(), (5, 1));
942    /// assert_eq!(df2.shape(), (5, 2));
943    /// # Ok::<(), PolarsError>(())
944    /// ```
945    pub fn shape(&self) -> (usize, usize) {
946        (self.height, self.columns.len())
947    }
948
949    /// Get the width of the [`DataFrame`] which is the number of columns.
950    ///
951    /// # Example
952    ///
953    /// ```rust
954    /// # use polars_core::prelude::*;
955    /// let df0: DataFrame = DataFrame::default();
956    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
957    /// let df2: DataFrame = df!("Series 1" => [0; 0],
958    ///                          "Series 2" => [0; 0])?;
959    ///
960    /// assert_eq!(df0.width(), 0);
961    /// assert_eq!(df1.width(), 1);
962    /// assert_eq!(df2.width(), 2);
963    /// # Ok::<(), PolarsError>(())
964    /// ```
965    pub fn width(&self) -> usize {
966        self.columns.len()
967    }
968
969    /// Get the height of the [`DataFrame`] which is the number of rows.
970    ///
971    /// # Example
972    ///
973    /// ```rust
974    /// # use polars_core::prelude::*;
975    /// let df0: DataFrame = DataFrame::default();
    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
978    ///
979    /// assert_eq!(df0.height(), 0);
980    /// assert_eq!(df1.height(), 2);
981    /// assert_eq!(df2.height(), 5);
982    /// # Ok::<(), PolarsError>(())
983    /// ```
984    pub fn height(&self) -> usize {
985        self.height
986    }
987
988    /// Returns the size as number of rows * number of columns
989    pub fn size(&self) -> usize {
990        let s = self.shape();
991        s.0 * s.1
992    }
993
994    /// Returns `true` if the [`DataFrame`] contains no rows.
995    ///
996    /// # Example
997    ///
998    /// ```rust
999    /// # use polars_core::prelude::*;
1000    /// let df1: DataFrame = DataFrame::default();
1001    /// assert!(df1.is_empty());
1002    ///
1003    /// let df2: DataFrame = df!("First name" => ["Forever"],
1004    ///                          "Last name" => ["Alone"])?;
1005    /// assert!(!df2.is_empty());
1006    /// # Ok::<(), PolarsError>(())
1007    /// ```
1008    pub fn is_empty(&self) -> bool {
1009        matches!(self.shape(), (0, _) | (_, 0))
1010    }
1011
1012    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1013    ///
1014    /// # Safety
1015    ///
1016    /// This needs to be equal to the length of all the columns.
    pub unsafe fn set_height(&mut self, height: usize) {
        // Only the stored row count is overwritten; the columns are not
        // touched or checked here, so the caller must uphold the invariant.
        self.height = height;
    }
1020
1021    /// Add multiple [`Series`] to a [`DataFrame`].
1022    /// The added `Series` are required to have the same length.
1023    ///
1024    /// # Example
1025    ///
1026    /// ```rust
1027    /// # use polars_core::prelude::*;
1028    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1029    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1030    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1031    ///
1032    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1033    /// assert_eq!(df2.shape(), (3, 3));
1034    /// println!("{}", df2);
1035    /// # Ok::<(), PolarsError>(())
1036    /// ```
1037    ///
1038    /// Output:
1039    ///
1040    /// ```text
1041    /// shape: (3, 3)
1042    /// +---------+--------+----------+
1043    /// | Element | Proton | Electron |
1044    /// | ---     | ---    | ---      |
1045    /// | str     | i32    | i32      |
1046    /// +=========+========+==========+
1047    /// | Copper  | 29     | 29       |
1048    /// +---------+--------+----------+
1049    /// | Silver  | 47     | 47       |
1050    /// +---------+--------+----------+
1051    /// | Gold    | 79     | 79       |
1052    /// +---------+--------+----------+
1053    /// ```
1054    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1055        let mut new_cols = self.columns.clone();
1056        new_cols.extend_from_slice(columns);
1057        DataFrame::new(new_cols)
1058    }
1059
1060    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1061    ///
1062    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1063    ///
1064    /// # Example
1065    ///
1066    /// ```rust
1067    /// # use polars_core::prelude::*;
1068    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1069    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1070    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1071    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1072    ///
1073    /// let df3: DataFrame = df1.vstack(&df2)?;
1074    ///
1075    /// assert_eq!(df3.shape(), (5, 2));
1076    /// println!("{}", df3);
1077    /// # Ok::<(), PolarsError>(())
1078    /// ```
1079    ///
1080    /// Output:
1081    ///
1082    /// ```text
1083    /// shape: (5, 2)
1084    /// +-----------+-------------------+
1085    /// | Element   | Melting Point (K) |
1086    /// | ---       | ---               |
1087    /// | str       | f64               |
1088    /// +===========+===================+
1089    /// | Copper    | 1357.77           |
1090    /// +-----------+-------------------+
1091    /// | Silver    | 1234.93           |
1092    /// +-----------+-------------------+
1093    /// | Gold      | 1337.33           |
1094    /// +-----------+-------------------+
1095    /// | Platinum  | 2041.4            |
1096    /// +-----------+-------------------+
1097    /// | Palladium | 1828.05           |
1098    /// +-----------+-------------------+
1099    /// ```
1100    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1101        let mut df = self.clone();
1102        df.vstack_mut(other)?;
1103        Ok(df)
1104    }
1105
1106    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1107    ///
1108    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1109    ///
1110    /// # Example
1111    ///
1112    /// ```rust
1113    /// # use polars_core::prelude::*;
1114    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1115    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1116    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1117    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1118    ///
1119    /// df1.vstack_mut(&df2)?;
1120    ///
1121    /// assert_eq!(df1.shape(), (5, 2));
1122    /// println!("{}", df1);
1123    /// # Ok::<(), PolarsError>(())
1124    /// ```
1125    ///
1126    /// Output:
1127    ///
1128    /// ```text
1129    /// shape: (5, 2)
1130    /// +-----------+-------------------+
1131    /// | Element   | Melting Point (K) |
1132    /// | ---       | ---               |
1133    /// | str       | f64               |
1134    /// +===========+===================+
1135    /// | Copper    | 1357.77           |
1136    /// +-----------+-------------------+
1137    /// | Silver    | 1234.93           |
1138    /// +-----------+-------------------+
1139    /// | Gold      | 1337.33           |
1140    /// +-----------+-------------------+
1141    /// | Platinum  | 2041.4            |
1142    /// +-----------+-------------------+
1143    /// | Palladium | 1828.05           |
1144    /// +-----------+-------------------+
1145    /// ```
1146    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1147        if self.width() != other.width() {
1148            polars_ensure!(
1149                self.width() == 0,
1150                ShapeMismatch:
1151                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1152                self.width(), other.width(),
1153            );
1154            self.columns.clone_from(&other.columns);
1155            self.height = other.height;
1156            return Ok(self);
1157        }
1158
1159        self.columns
1160            .iter_mut()
1161            .zip(other.columns.iter())
1162            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1163                ensure_can_extend(&*left, right)?;
1164                left.append(right).map_err(|e| {
1165                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1166                })?;
1167                Ok(())
1168            })?;
1169        self.height += other.height;
1170        Ok(self)
1171    }
1172
1173    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1174    ///
1175    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1176    ///
1177    /// # Panics
    /// Panics if the schemas don't match.
1179    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1180        self.columns
1181            .iter_mut()
1182            .zip(other.columns.iter())
1183            .for_each(|(left, right)| {
1184                left.append(right)
1185                    .map_err(|e| {
1186                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1187                    })
1188                    .expect("should not fail");
1189            });
1190        self.height += other.height;
1191    }
1192
1193    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1194    ///
1195    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1196    ///
1197    /// # Panics
    /// Panics if the schemas don't match.
1199    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1200        self.columns
1201            .iter_mut()
1202            .zip(other.columns)
1203            .for_each(|(left, right)| {
1204                left.append_owned(right).expect("should not fail");
1205            });
1206        self.height += other.height;
1207    }
1208
1209    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1210    ///
1211    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1212    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1213    ///
1214    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1215    /// and thus will yield faster queries.
1216    ///
1217    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1218    /// online operations where you add `n` rows and rerun a query.
1219    ///
1220    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1222    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1223    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1224        polars_ensure!(
1225            self.width() == other.width(),
1226            ShapeMismatch:
1227            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1228            self.width(), other.width(),
1229        );
1230
1231        self.columns
1232            .iter_mut()
1233            .zip(other.columns.iter())
1234            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1235                ensure_can_extend(&*left, right)?;
1236                left.extend(right).map_err(|e| {
1237                    e.context(format!("failed to extend column '{}'", right.name()).into())
1238                })?;
1239                Ok(())
1240            })?;
1241        self.height += other.height;
1242        self.clear_schema();
1243        Ok(())
1244    }
1245
1246    /// Remove a column by name and return the column removed.
1247    ///
1248    /// # Example
1249    ///
1250    /// ```rust
1251    /// # use polars_core::prelude::*;
1252    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1253    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1254    ///
1255    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1256    /// assert!(s1.is_err());
1257    ///
1258    /// let s2: Column = df.drop_in_place("Animal")?;
1259    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1260    /// # Ok::<(), PolarsError>(())
1261    /// ```
1262    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1263        let idx = self.check_name_to_idx(name)?;
1264        self.clear_schema();
1265        Ok(self.columns.remove(idx))
1266    }
1267
1268    /// Return a new [`DataFrame`] where all null values are dropped.
1269    ///
1270    /// # Example
1271    ///
1272    /// ```no_run
1273    /// # use polars_core::prelude::*;
1274    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1275    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1276    /// assert_eq!(df1.shape(), (3, 2));
1277    ///
1278    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1279    /// assert_eq!(df2.shape(), (1, 2));
1280    /// println!("{}", df2);
1281    /// # Ok::<(), PolarsError>(())
1282    /// ```
1283    ///
1284    /// Output:
1285    ///
1286    /// ```text
1287    /// shape: (1, 2)
1288    /// +---------+---------------------+
1289    /// | Country | Tax revenue (% GDP) |
1290    /// | ---     | ---                 |
1291    /// | str     | f64                 |
1292    /// +=========+=====================+
1293    /// | Malta   | 32.7                |
1294    /// +---------+---------------------+
1295    /// ```
1296    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1297    where
1298        for<'a> &'a S: Into<PlSmallStr>,
1299    {
1300        if let Some(v) = subset {
1301            let v = self.select_columns(v)?;
1302            self._drop_nulls_impl(v.as_slice())
1303        } else {
1304            self._drop_nulls_impl(self.columns.as_slice())
1305        }
1306    }
1307
1308    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1309        // fast path for no nulls in df
1310        if subset.iter().all(|s| !s.has_nulls()) {
1311            return Ok(self.clone());
1312        }
1313
1314        let mut iter = subset.iter();
1315
1316        let mask = iter
1317            .next()
1318            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1319        let mut mask = mask.is_not_null();
1320
1321        for c in iter {
1322            mask = mask & c.is_not_null();
1323        }
1324        self.filter(&mask)
1325    }
1326
1327    /// Drop a column by name.
1328    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1329    /// the current one in place.
1330    ///
1331    /// # Example
1332    ///
1333    /// ```rust
1334    /// # use polars_core::prelude::*;
    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1336    /// let df2: DataFrame = df1.drop("Ray type")?;
1337    ///
1338    /// assert!(df2.is_empty());
1339    /// # Ok::<(), PolarsError>(())
1340    /// ```
1341    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1342        let idx = self.check_name_to_idx(name)?;
1343        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1344
1345        self.columns.iter().enumerate().for_each(|(i, s)| {
1346            if i != idx {
1347                new_cols.push(s.clone())
1348            }
1349        });
1350
1351        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1352    }
1353
1354    /// Drop columns that are in `names`.
1355    pub fn drop_many<I, S>(&self, names: I) -> Self
1356    where
1357        I: IntoIterator<Item = S>,
1358        S: Into<PlSmallStr>,
1359    {
1360        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1361        self.drop_many_amortized(&names)
1362    }
1363
1364    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1365    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1366        if names.is_empty() {
1367            return self.clone();
1368        }
1369        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1370        self.columns.iter().for_each(|s| {
1371            if !names.contains(s.name()) {
1372                new_cols.push(s.clone())
1373            }
1374        });
1375
1376        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1377    }
1378
1379    /// Insert a new column at a given index without checking for duplicates.
    /// This can leave the [`DataFrame`] in an invalid state
1381    fn insert_column_no_name_check(
1382        &mut self,
1383        index: usize,
1384        column: Column,
1385    ) -> PolarsResult<&mut Self> {
1386        polars_ensure!(
1387            self.width() == 0 || column.len() == self.height(),
1388            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1389            column.len(), self.height(),
1390        );
1391
1392        if self.width() == 0 {
1393            self.height = column.len();
1394        }
1395
1396        self.columns.insert(index, column);
1397        self.clear_schema();
1398        Ok(self)
1399    }
1400
1401    /// Insert a new column at a given index.
1402    pub fn insert_column<S: IntoColumn>(
1403        &mut self,
1404        index: usize,
1405        column: S,
1406    ) -> PolarsResult<&mut Self> {
1407        let column = column.into_column();
1408        self.check_already_present(column.name().as_str())?;
1409        self.insert_column_no_name_check(index, column)
1410    }
1411
1412    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1413        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1414            self.replace_column(idx, column)?;
1415        } else {
1416            if self.width() == 0 {
1417                self.height = column.len();
1418            }
1419
1420            self.columns.push(column);
1421            self.clear_schema();
1422        }
1423        Ok(())
1424    }
1425
1426    /// Add a new column to this [`DataFrame`] or replace an existing one.
1427    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1428        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1429            let height = df.height();
1430            if column.len() == 1 && height > 1 {
1431                column = column.new_from_index(0, height);
1432            }
1433
1434            if column.len() == height || df.get_columns().is_empty() {
1435                df.add_column_by_search(column)?;
1436                Ok(df)
1437            }
1438            // special case for literals
1439            else if height == 0 && column.len() == 1 {
1440                let s = column.clear();
1441                df.add_column_by_search(s)?;
1442                Ok(df)
1443            } else {
1444                polars_bail!(
1445                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1446                    column.len(), height,
1447                );
1448            }
1449        }
1450        let column = column.into_column();
1451        inner(self, column)
1452    }
1453
1454    /// Adds a column to the [`DataFrame`] without doing any checks
1455    /// on length or duplicates.
1456    ///
1457    /// # Safety
1458    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1459    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1460        debug_assert!(self.width() == 0 || self.height() == column.len());
1461        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1462
1463        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1464        // properly for `width` == 0.
1465        if self.width() == 0 {
1466            unsafe { self.set_height(column.len()) };
1467        }
1468        unsafe { self.get_columns_mut() }.push(column);
1469        self.clear_schema();
1470
1471        self
1472    }
1473
1474    // Note: Schema can be both input or output_schema
1475    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1476        let name = c.name();
1477        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1478            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1479                // Given schema is output_schema and we can push.
1480                if idx == self.columns.len() {
1481                    if self.width() == 0 {
1482                        self.height = c.len();
1483                    }
1484
1485                    self.columns.push(c);
1486                    self.clear_schema();
1487                }
1488                // Schema is incorrect fallback to search
1489                else {
1490                    debug_assert!(false);
1491                    self.add_column_by_search(c)?;
1492                }
1493            } else {
1494                self.replace_column(idx, c)?;
1495            }
1496        } else {
1497            if self.width() == 0 {
1498                self.height = c.len();
1499            }
1500
1501            self.columns.push(c);
1502            self.clear_schema();
1503        }
1504
1505        Ok(())
1506    }
1507
1508    // Note: Schema can be both input or output_schema
1509    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1510        for (i, s) in series.into_iter().enumerate() {
1511            // we need to branch here
1512            // because users can add multiple columns with the same name
1513            if i == 0 || schema.get(s.name().as_str()).is_some() {
1514                self.with_column_and_schema(s.into_column(), schema)?;
1515            } else {
1516                self.with_column(s.clone().into_column())?;
1517            }
1518        }
1519        Ok(())
1520    }
1521
1522    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1523        for (i, s) in columns.into_iter().enumerate() {
1524            // we need to branch here
1525            // because users can add multiple columns with the same name
1526            if i == 0 || schema.get(s.name().as_str()).is_some() {
1527                self.with_column_and_schema(s, schema)?;
1528            } else {
1529                self.with_column(s.clone())?;
1530            }
1531        }
1532
1533        Ok(())
1534    }
1535
1536    /// Add a new column to this [`DataFrame`] or replace an existing one.
1537    /// Uses an existing schema to amortize lookups.
1538    /// If the schema is incorrect, we will fallback to linear search.
1539    ///
1540    /// Note: Schema can be both input or output_schema
1541    pub fn with_column_and_schema<C: IntoColumn>(
1542        &mut self,
1543        column: C,
1544        schema: &Schema,
1545    ) -> PolarsResult<&mut Self> {
1546        let mut column = column.into_column();
1547
1548        let height = self.height();
1549        if column.len() == 1 && height > 1 {
1550            column = column.new_from_index(0, height);
1551        }
1552
1553        if column.len() == height || self.columns.is_empty() {
1554            self.add_column_by_schema(column, schema)?;
1555            Ok(self)
1556        }
1557        // special case for literals
1558        else if height == 0 && column.len() == 1 {
1559            let s = column.clear();
1560            self.add_column_by_schema(s, schema)?;
1561            Ok(self)
1562        } else {
1563            polars_bail!(
1564                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1565                column.len(), height,
1566            );
1567        }
1568    }
1569
1570    /// Get a row in the [`DataFrame`]. Beware this is slow.
1571    ///
1572    /// # Example
1573    ///
1574    /// ```
1575    /// # use polars_core::prelude::*;
1576    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1577    ///     df.get(idx)
1578    /// }
1579    /// ```
1580    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1581        match self.columns.first() {
1582            Some(s) => {
1583                if s.len() <= idx {
1584                    return None;
1585                }
1586            },
1587            None => return None,
1588        }
1589        // SAFETY: we just checked bounds
1590        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1591    }
1592
1593    /// Select a [`Series`] by index.
1594    ///
1595    /// # Example
1596    ///
1597    /// ```rust
1598    /// # use polars_core::prelude::*;
1599    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1600    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1601    ///
1602    /// let s1: Option<&Column> = df.select_at_idx(0);
1603    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1604    ///
1605    /// assert_eq!(s1, Some(&s2));
1606    /// # Ok::<(), PolarsError>(())
1607    /// ```
1608    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1609        self.columns.get(idx)
1610    }
1611
1612    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1613    ///
1614    /// # Examples
1615    ///
1616    /// ```rust
1617    /// # use polars_core::prelude::*;
1618    /// let df = df! {
1619    ///     "0" => [0, 0, 0],
1620    ///     "1" => [1, 1, 1],
1621    ///     "2" => [2, 2, 2]
1622    /// }?;
1623    ///
1624    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1625    /// assert!(df.equals(&df.select_by_range(..)?));
1626    /// # Ok::<(), PolarsError>(())
1627    /// ```
1628    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1629    where
1630        R: ops::RangeBounds<usize>,
1631    {
1632        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1633        // because it is the nightly feature. We should change here if this function were stable.
1634        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1635        where
1636            R: ops::RangeBounds<usize>,
1637        {
1638            let len = bounds.end;
1639
1640            let start: ops::Bound<&usize> = range.start_bound();
1641            let start = match start {
1642                ops::Bound::Included(&start) => start,
1643                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1644                    panic!("attempted to index slice from after maximum usize");
1645                }),
1646                ops::Bound::Unbounded => 0,
1647            };
1648
1649            let end: ops::Bound<&usize> = range.end_bound();
1650            let end = match end {
1651                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1652                    panic!("attempted to index slice up to maximum usize");
1653                }),
1654                ops::Bound::Excluded(&end) => end,
1655                ops::Bound::Unbounded => len,
1656            };
1657
1658            if start > end {
1659                panic!("slice index starts at {start} but ends at {end}");
1660            }
1661            if end > len {
1662                panic!("range end index {end} out of range for slice of length {len}",);
1663            }
1664
1665            ops::Range { start, end }
1666        }
1667
1668        let colnames = self.get_column_names_owned();
1669        let range = get_range(range, ..colnames.len());
1670
1671        self._select_impl(&colnames[range])
1672    }
1673
1674    /// Get column index of a [`Series`] by name.
1675    /// # Example
1676    ///
1677    /// ```rust
1678    /// # use polars_core::prelude::*;
1679    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1680    ///                         "Health" => [100, 200, 500],
1681    ///                         "Mana" => [250, 100, 0],
1682    ///                         "Strength" => [30, 150, 300])?;
1683    ///
1684    /// assert_eq!(df.get_column_index("Name"), Some(0));
1685    /// assert_eq!(df.get_column_index("Health"), Some(1));
1686    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1687    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1688    /// assert_eq!(df.get_column_index("Haste"), None);
1689    /// # Ok::<(), PolarsError>(())
1690    /// ```
1691    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1692        let schema = self.schema();
1693        if let Some(idx) = schema.index_of(name) {
1694            if self
1695                .get_columns()
1696                .get(idx)
1697                .is_some_and(|c| c.name() == name)
1698            {
1699                return Some(idx);
1700            }
1701        }
1702
1703        self.columns.iter().position(|s| s.name().as_str() == name)
1704    }
1705
1706    /// Get column index of a [`Series`] by name.
1707    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1708        self.get_column_index(name)
1709            .ok_or_else(|| polars_err!(col_not_found = name))
1710    }
1711
1712    /// Select a single column by name.
1713    ///
1714    /// # Example
1715    ///
1716    /// ```rust
1717    /// # use polars_core::prelude::*;
1718    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1719    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1720    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1721    ///
1722    /// assert_eq!(df.column("Password")?, &s1);
1723    /// # Ok::<(), PolarsError>(())
1724    /// ```
1725    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1726        let idx = self.try_get_column_index(name)?;
1727        Ok(self.select_at_idx(idx).unwrap())
1728    }
1729
1730    /// Selected multiple columns by name.
1731    ///
1732    /// # Example
1733    ///
1734    /// ```rust
1735    /// # use polars_core::prelude::*;
1736    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1737    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1738    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1739    ///
1740    /// assert_eq!(&df[0], sv[0]);
1741    /// assert_eq!(&df[1], sv[1]);
1742    /// # Ok::<(), PolarsError>(())
1743    /// ```
1744    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1745    where
1746        I: IntoIterator<Item = S>,
1747        S: AsRef<str>,
1748    {
1749        names
1750            .into_iter()
1751            .map(|name| self.column(name.as_ref()))
1752            .collect()
1753    }
1754
1755    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1756    ///
1757    /// # Examples
1758    ///
1759    /// ```
1760    /// # use polars_core::prelude::*;
1761    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1762    ///     df.select(["foo", "bar"])
1763    /// }
1764    /// ```
1765    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1766    where
1767        I: IntoIterator<Item = S>,
1768        S: Into<PlSmallStr>,
1769    {
1770        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1771        self._select_impl(cols.as_slice())
1772    }
1773
1774    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1775        ensure_names_unique(cols, |s| s.as_str())?;
1776        self._select_impl_unchecked(cols)
1777    }
1778
1779    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1780        let selected = self.select_columns_impl(cols)?;
1781        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1782    }
1783
1784    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1785    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1786    where
1787        I: IntoIterator<Item = S>,
1788        S: Into<PlSmallStr>,
1789    {
1790        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1791        self._select_with_schema_impl(&cols, schema, true)
1792    }
1793
1794    /// Select with a known schema without checking for duplicates in `selection`.
1795    /// The schema names must match the column names of this DataFrame.
1796    pub fn select_with_schema_unchecked<I, S>(
1797        &self,
1798        selection: I,
1799        schema: &Schema,
1800    ) -> PolarsResult<Self>
1801    where
1802        I: IntoIterator<Item = S>,
1803        S: Into<PlSmallStr>,
1804    {
1805        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1806        self._select_with_schema_impl(&cols, schema, false)
1807    }
1808
1809    /// * The schema names must match the column names of this DataFrame.
1810    pub fn _select_with_schema_impl(
1811        &self,
1812        cols: &[PlSmallStr],
1813        schema: &Schema,
1814        check_duplicates: bool,
1815    ) -> PolarsResult<Self> {
1816        if check_duplicates {
1817            ensure_names_unique(cols, |s| s.as_str())?;
1818        }
1819
1820        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1821        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1822    }
1823
1824    /// A non generic implementation to reduce compiler bloat.
1825    fn select_columns_impl_with_schema(
1826        &self,
1827        cols: &[PlSmallStr],
1828        schema: &Schema,
1829    ) -> PolarsResult<Vec<Column>> {
1830        if cfg!(debug_assertions) {
1831            ensure_matching_schema_names(schema, self.schema())?;
1832        }
1833
1834        cols.iter()
1835            .map(|name| {
1836                let index = schema.try_get_full(name.as_str())?.0;
1837                Ok(self.columns[index].clone())
1838            })
1839            .collect()
1840    }
1841
1842    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1843    where
1844        I: IntoIterator<Item = S>,
1845        S: Into<PlSmallStr>,
1846    {
1847        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1848        self.select_physical_impl(&cols)
1849    }
1850
1851    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1852        ensure_names_unique(cols, |s| s.as_str())?;
1853        let selected = self.select_columns_physical_impl(cols)?;
1854        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1855    }
1856
1857    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1858    ///
1859    /// # Example
1860    ///
1861    /// ```rust
1862    /// # use polars_core::prelude::*;
1863    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1864    ///                         "Carbon" => [1, 2, 3],
1865    ///                         "Hydrogen" => [4, 6, 8])?;
1866    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1867    ///
1868    /// assert_eq!(df["Carbon"], sv[0]);
1869    /// assert_eq!(df["Hydrogen"], sv[1]);
1870    /// # Ok::<(), PolarsError>(())
1871    /// ```
1872    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1873        let cols = selection.into_vec();
1874        self.select_columns_impl(&cols)
1875    }
1876
1877    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1878        self.columns
1879            .iter()
1880            .enumerate()
1881            .map(|(i, s)| (s.name().as_str(), i))
1882            .collect()
1883    }
1884
1885    /// A non generic implementation to reduce compiler bloat.
1886    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1887        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1888            let name_to_idx = self._names_to_idx_map();
1889            cols.iter()
1890                .map(|name| {
1891                    let idx = *name_to_idx
1892                        .get(name.as_str())
1893                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1894                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1895                })
1896                .collect::<PolarsResult<Vec<_>>>()?
1897        } else {
1898            cols.iter()
1899                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1900                .collect::<PolarsResult<Vec<_>>>()?
1901        };
1902
1903        Ok(selected)
1904    }
1905
1906    /// A non generic implementation to reduce compiler bloat.
1907    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1908        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1909            // we hash, because there are user that having millions of columns.
1910            // # https://github.com/pola-rs/polars/issues/1023
1911            let name_to_idx = self._names_to_idx_map();
1912
1913            cols.iter()
1914                .map(|name| {
1915                    let idx = *name_to_idx
1916                        .get(name.as_str())
1917                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1918                    Ok(self.select_at_idx(idx).unwrap().clone())
1919                })
1920                .collect::<PolarsResult<Vec<_>>>()?
1921        } else {
1922            cols.iter()
1923                .map(|c| self.column(c.as_str()).cloned())
1924                .collect::<PolarsResult<Vec<_>>>()?
1925        };
1926
1927        Ok(selected)
1928    }
1929
1930    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1931        // If there is a filtered column just see how many columns there are left.
1932        if let Some(fst) = filtered.first() {
1933            return fst.len();
1934        }
1935
1936        // Otherwise, count the number of values that would be filtered and return that height.
1937        let num_trues = mask.num_trues();
1938        if mask.len() == self.height() {
1939            num_trues
1940        } else {
1941            // This is for broadcasting masks
1942            debug_assert!(num_trues == 0 || num_trues == 1);
1943            self.height() * num_trues
1944        }
1945    }
1946
1947    /// Take the [`DataFrame`] rows by a boolean mask.
1948    ///
1949    /// # Example
1950    ///
1951    /// ```
1952    /// # use polars_core::prelude::*;
1953    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1954    ///     let mask = df.column("sepal_width")?.is_not_null();
1955    ///     df.filter(&mask)
1956    /// }
1957    /// ```
1958    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1959        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1960        let height = self.filter_height(&new_col, mask);
1961
1962        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1963    }
1964
1965    /// Same as `filter` but does not parallelize.
1966    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1967        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1968        let height = self.filter_height(&new_col, mask);
1969
1970        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1971    }
1972
1973    /// Take [`DataFrame`] rows by index values.
1974    ///
1975    /// # Example
1976    ///
1977    /// ```
1978    /// # use polars_core::prelude::*;
1979    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1980    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1981    ///     df.take(&idx)
1982    /// }
1983    /// ```
1984    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1985        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
1986
1987        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
1988    }
1989
1990    /// # Safety
1991    /// The indices must be in-bounds.
1992    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1993        self.take_unchecked_impl(idx, true)
1994    }
1995
1996    /// # Safety
1997    /// The indices must be in-bounds.
1998    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1999        let cols = if allow_threads {
2000            POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2001        } else {
2002            self._apply_columns(&|s| s.take_unchecked(idx))
2003        };
2004        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2005    }
2006
2007    /// # Safety
2008    /// The indices must be in-bounds.
2009    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2010        self.take_slice_unchecked_impl(idx, true)
2011    }
2012
2013    /// # Safety
2014    /// The indices must be in-bounds.
2015    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2016        let cols = if allow_threads {
2017            POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2018        } else {
2019            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2020        };
2021        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2022    }
2023
2024    /// Rename a column in the [`DataFrame`].
2025    ///
2026    /// # Example
2027    ///
2028    /// ```
2029    /// # use polars_core::prelude::*;
2030    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2031    ///     let original_name = "foo";
2032    ///     let new_name = "bar";
2033    ///     df.rename(original_name, new_name.into())
2034    /// }
2035    /// ```
2036    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2037        if column == name.as_str() {
2038            return Ok(self);
2039        }
2040        polars_ensure!(
2041            !self.schema().contains(&name),
2042            Duplicate: "column rename attempted with already existing name \"{name}\""
2043        );
2044
2045        self.get_column_index(column)
2046            .and_then(|idx| self.columns.get_mut(idx))
2047            .ok_or_else(|| polars_err!(col_not_found = column))
2048            .map(|c| c.rename(name))?;
2049        Ok(self)
2050    }
2051
2052    /// Sort [`DataFrame`] in place.
2053    ///
2054    /// See [`DataFrame::sort`] for more instruction.
2055    pub fn sort_in_place(
2056        &mut self,
2057        by: impl IntoVec<PlSmallStr>,
2058        sort_options: SortMultipleOptions,
2059    ) -> PolarsResult<&mut Self> {
2060        let by_column = self.select_columns(by)?;
2061        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2062        Ok(self)
2063    }
2064
2065    #[doc(hidden)]
2066    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2067    pub fn sort_impl(
2068        &self,
2069        by_column: Vec<Column>,
2070        mut sort_options: SortMultipleOptions,
2071        slice: Option<(i64, usize)>,
2072    ) -> PolarsResult<Self> {
2073        if by_column.is_empty() {
2074            // If no columns selected, any order (including original order) is correct.
2075            return if let Some((offset, len)) = slice {
2076                Ok(self.slice(offset, len))
2077            } else {
2078                Ok(self.clone())
2079            };
2080        }
2081
2082        // note that the by_column argument also contains evaluated expression from
2083        // polars-lazy that may not even be present in this dataframe. therefore
2084        // when we try to set the first columns as sorted, we ignore the error as
2085        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
2086        let first_descending = sort_options.descending[0];
2087        let first_by_column = by_column[0].name().to_string();
2088
2089        let set_sorted = |df: &mut DataFrame| {
2090            // Mark the first sort column as sorted; if the column does not exist it
2091            // is ok, because we sorted by an expression not present in the dataframe
2092            let _ = df.apply(&first_by_column, |s| {
2093                let mut s = s.clone();
2094                if first_descending {
2095                    s.set_sorted_flag(IsSorted::Descending)
2096                } else {
2097                    s.set_sorted_flag(IsSorted::Ascending)
2098                }
2099                s
2100            });
2101        };
2102        if self.is_empty() {
2103            let mut out = self.clone();
2104            set_sorted(&mut out);
2105            return Ok(out);
2106        }
2107
2108        if let Some((0, k)) = slice {
2109            if k < self.len() {
2110                return self.bottom_k_impl(k, by_column, sort_options);
2111            }
2112        }
2113        // Check if the required column is already sorted; if so we can exit early
2114        // We can do so when there is only one column to sort by, for multiple columns
2115        // it will be complicated to do so
2116        #[cfg(feature = "dtype-categorical")]
2117        let is_not_categorical_enum =
2118            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2119                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2120
2121        #[cfg(not(feature = "dtype-categorical"))]
2122        #[allow(non_upper_case_globals)]
2123        const is_not_categorical_enum: bool = true;
2124
2125        if by_column.len() == 1 && is_not_categorical_enum {
2126            let required_sorting = if sort_options.descending[0] {
2127                IsSorted::Descending
2128            } else {
2129                IsSorted::Ascending
2130            };
2131            // If null count is 0 then nulls_last doesnt matter
2132            // Safe to get value at last position since the dataframe is not empty (taken care above)
2133            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2134                && ((by_column[0].null_count() == 0)
2135                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2136                        == sort_options.nulls_last[0]);
2137
2138            if no_sorting_required {
2139                return if let Some((offset, len)) = slice {
2140                    Ok(self.slice(offset, len))
2141                } else {
2142                    Ok(self.clone())
2143                };
2144            }
2145        }
2146
2147        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2148
2149        // a lot of indirection in both sorting and take
2150        let mut df = self.clone();
2151        let df = df.as_single_chunk_par();
2152        let mut take = match (by_column.len(), has_nested) {
2153            (1, false) => {
2154                let s = &by_column[0];
2155                let options = SortOptions {
2156                    descending: sort_options.descending[0],
2157                    nulls_last: sort_options.nulls_last[0],
2158                    multithreaded: sort_options.multithreaded,
2159                    maintain_order: sort_options.maintain_order,
2160                    limit: sort_options.limit,
2161                };
2162                // fast path for a frame with a single series
2163                // no need to compute the sort indices and then take by these indices
2164                // simply sort and return as frame
2165                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2166                    let mut out = s.sort_with(options)?;
2167                    if let Some((offset, len)) = slice {
2168                        out = out.slice(offset, len);
2169                    }
2170                    return Ok(out.into_frame());
2171                }
2172                s.arg_sort(options)
2173            },
2174            _ => {
2175                if sort_options.nulls_last.iter().all(|&x| x)
2176                    || has_nested
2177                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2178                {
2179                    argsort_multiple_row_fmt(
2180                        &by_column,
2181                        sort_options.descending,
2182                        sort_options.nulls_last,
2183                        sort_options.multithreaded,
2184                    )?
2185                } else {
2186                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2187                    first
2188                        .as_materialized_series()
2189                        .arg_sort_multiple(&other, &sort_options)?
2190                }
2191            },
2192        };
2193
2194        if let Some((offset, len)) = slice {
2195            take = take.slice(offset, len);
2196        }
2197
2198        // SAFETY:
2199        // the created indices are in bounds
2200        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2201        set_sorted(&mut df);
2202        Ok(df)
2203    }
2204
2205    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2206    ///
2207    /// This dataframe does not necessarily have a specified schema and may be changed at any
2208    /// point. It is primarily used for debugging.
2209    pub fn _to_metadata(&self) -> DataFrame {
2210        let num_columns = self.columns.len();
2211
2212        let mut column_names =
2213            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2214        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2215        let mut sorted_asc_ca =
2216            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2217        let mut sorted_dsc_ca =
2218            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2219        let mut fast_explode_list_ca =
2220            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2221        let mut materialized_at_ca =
2222            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2223
2224        for col in &self.columns {
2225            let flags = col.get_flags();
2226
2227            let (repr, materialized_at) = match col {
2228                Column::Series(s) => ("series", s.materialized_at()),
2229                Column::Partitioned(_) => ("partitioned", None),
2230                Column::Scalar(_) => ("scalar", None),
2231            };
2232            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2233            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2234            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2235
2236            column_names.append_value(col.name().clone());
2237            repr_ca.append_value(repr);
2238            sorted_asc_ca.append_value(sorted_asc);
2239            sorted_dsc_ca.append_value(sorted_dsc);
2240            fast_explode_list_ca.append_value(fast_explode_list);
2241            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2242        }
2243
2244        unsafe {
2245            DataFrame::new_no_checks(
2246                self.width(),
2247                vec![
2248                    column_names.finish().into_column(),
2249                    repr_ca.finish().into_column(),
2250                    sorted_asc_ca.finish().into_column(),
2251                    sorted_dsc_ca.finish().into_column(),
2252                    fast_explode_list_ca.finish().into_column(),
2253                    materialized_at_ca.finish().into_column(),
2254                ],
2255            )
2256        }
2257    }
2258
2259    /// Return a sorted clone of this [`DataFrame`].
2260    ///
2261    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2262    /// # Example
2263    ///
2264    /// Sort by a single column with default options:
2265    /// ```
2266    /// # use polars_core::prelude::*;
2267    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2268    ///     df.sort(["sepal_width"], Default::default())
2269    /// }
2270    /// ```
2271    /// Sort by a single column with specific order:
2272    /// ```
2273    /// # use polars_core::prelude::*;
2274    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2275    ///     df.sort(
2276    ///         ["sepal_width"],
2277    ///         SortMultipleOptions::new()
2278    ///             .with_order_descending(descending)
2279    ///     )
2280    /// }
2281    /// ```
2282    /// Sort by multiple columns with specifying order for each column:
2283    /// ```
2284    /// # use polars_core::prelude::*;
2285    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2286    ///     df.sort(
2287    ///         ["sepal_width", "sepal_length"],
2288    ///         SortMultipleOptions::new()
2289    ///             .with_order_descending_multi([false, true])
2290    ///     )
2291    /// }
2292    /// ```
2293    /// See [`SortMultipleOptions`] for more options.
2294    ///
2295    /// Also see [`DataFrame::sort_in_place`].
2296    pub fn sort(
2297        &self,
2298        by: impl IntoVec<PlSmallStr>,
2299        sort_options: SortMultipleOptions,
2300    ) -> PolarsResult<Self> {
2301        let mut df = self.clone();
2302        df.sort_in_place(by, sort_options)?;
2303        Ok(df)
2304    }
2305
2306    /// Replace a column with a [`Series`].
2307    ///
2308    /// # Example
2309    ///
2310    /// ```rust
2311    /// # use polars_core::prelude::*;
2312    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2313    ///                         "Area (kmĀ²)" => [9_833_520, 9_596_961])?;
2314    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2315    ///
2316    /// assert!(df.replace("Nation", s.clone()).is_err());
2317    /// assert!(df.replace("Country", s).is_ok());
2318    /// # Ok::<(), PolarsError>(())
2319    /// ```
2320    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2321        self.apply(column, |_| new_col.into_series())
2322    }
2323
2324    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2325    /// is that now the value of `column: &str` determines the name of the column and not the name
2326    /// of the `Series` passed to this method.
2327    pub fn replace_or_add<S: IntoSeries>(
2328        &mut self,
2329        column: PlSmallStr,
2330        new_col: S,
2331    ) -> PolarsResult<&mut Self> {
2332        let mut new_col = new_col.into_series();
2333        new_col.rename(column);
2334        self.with_column(new_col)
2335    }
2336
2337    /// Replace column at index `idx` with a [`Series`].
2338    ///
2339    /// # Example
2340    ///
2341    /// ```ignored
2342    /// # use polars_core::prelude::*;
2343    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2344    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2345    /// let mut df = DataFrame::new(vec![s0, s1])?;
2346    ///
2347    /// // Add 32 to get lowercase ascii values
2348    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2349    /// # Ok::<(), PolarsError>(())
2350    /// ```
2351    pub fn replace_column<C: IntoColumn>(
2352        &mut self,
2353        index: usize,
2354        new_column: C,
2355    ) -> PolarsResult<&mut Self> {
2356        polars_ensure!(
2357            index < self.width(),
2358            ShapeMismatch:
2359            "unable to replace at index {}, the DataFrame has only {} columns",
2360            index, self.width(),
2361        );
2362        let mut new_column = new_column.into_column();
2363        polars_ensure!(
2364            new_column.len() == self.height(),
2365            ShapeMismatch:
2366            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2367            new_column.len(), self.height(),
2368        );
2369        let old_col = &mut self.columns[index];
2370        mem::swap(old_col, &mut new_column);
2371        self.clear_schema();
2372        Ok(self)
2373    }
2374
2375    /// Apply a closure to a column. This is the recommended way to do in place modification.
2376    ///
2377    /// # Example
2378    ///
2379    /// ```rust
2380    /// # use polars_core::prelude::*;
2381    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2382    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2383    /// let mut df = DataFrame::new(vec![s0, s1])?;
2384    ///
2385    /// fn str_to_len(str_val: &Column) -> Column {
2386    ///     str_val.str()
2387    ///         .unwrap()
2388    ///         .into_iter()
2389    ///         .map(|opt_name: Option<&str>| {
2390    ///             opt_name.map(|name: &str| name.len() as u32)
2391    ///          })
2392    ///         .collect::<UInt32Chunked>()
2393    ///         .into_column()
2394    /// }
2395    ///
2396    /// // Replace the names column by the length of the names.
2397    /// df.apply("names", str_to_len);
2398    /// # Ok::<(), PolarsError>(())
2399    /// ```
2400    /// Results in:
2401    ///
2402    /// ```text
2403    /// +--------+-------+
2404    /// | foo    |       |
2405    /// | ---    | names |
2406    /// | str    | u32   |
2407    /// +========+=======+
2408    /// | "ham"  | 4     |
2409    /// +--------+-------+
2410    /// | "spam" | 6     |
2411    /// +--------+-------+
2412    /// | "egg"  | 3     |
2413    /// +--------+-------+
2414    /// ```
2415    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2416    where
2417        F: FnOnce(&Column) -> C,
2418        C: IntoColumn,
2419    {
2420        let idx = self.check_name_to_idx(name)?;
2421        self.apply_at_idx(idx, f)
2422    }
2423
2424    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2425    /// modification.
2426    ///
2427    /// # Example
2428    ///
2429    /// ```rust
2430    /// # use polars_core::prelude::*;
2431    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2432    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2433    /// let mut df = DataFrame::new(vec![s0, s1])?;
2434    ///
2435    /// // Add 32 to get lowercase ascii values
2436    /// df.apply_at_idx(1, |s| s + 32);
2437    /// # Ok::<(), PolarsError>(())
2438    /// ```
2439    /// Results in:
2440    ///
2441    /// ```text
2442    /// +--------+-------+
2443    /// | foo    | ascii |
2444    /// | ---    | ---   |
2445    /// | str    | i32   |
2446    /// +========+=======+
2447    /// | "ham"  | 102   |
2448    /// +--------+-------+
2449    /// | "spam" | 111   |
2450    /// +--------+-------+
2451    /// | "egg"  | 111   |
2452    /// +--------+-------+
2453    /// ```
2454    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2455    where
2456        F: FnOnce(&Column) -> C,
2457        C: IntoColumn,
2458    {
2459        let df_height = self.height();
2460        let width = self.width();
2461        let col = self.columns.get_mut(idx).ok_or_else(|| {
2462            polars_err!(
2463                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2464                idx, width
2465            )
2466        })?;
2467        let name = col.name().clone();
2468        let new_col = f(col).into_column();
2469        match new_col.len() {
2470            1 => {
2471                let new_col = new_col.new_from_index(0, df_height);
2472                let _ = mem::replace(col, new_col);
2473            },
2474            len if (len == df_height) => {
2475                let _ = mem::replace(col, new_col);
2476            },
2477            len => polars_bail!(
2478                ShapeMismatch:
2479                "resulting Series has length {} while the DataFrame has height {}",
2480                len, df_height
2481            ),
2482        }
2483
2484        // make sure the name remains the same after applying the closure
2485        unsafe {
2486            let col = self.columns.get_unchecked_mut(idx);
2487            col.rename(name);
2488        }
2489        Ok(self)
2490    }
2491
2492    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2493    /// modification.
2494    ///
2495    /// # Example
2496    ///
2497    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
2498    ///
2499    /// ```rust
2500    /// # use polars_core::prelude::*;
2501    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2502    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2503    /// let mut df = DataFrame::new(vec![s0, s1])?;
2504    ///
2505    /// let idx = vec![0, 1, 4];
2506    ///
2507    /// df.try_apply("foo", |c| {
2508    ///     c.str()?
2509    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2510    /// });
2511    /// # Ok::<(), PolarsError>(())
2512    /// ```
2513    /// Results in:
2514    ///
2515    /// ```text
2516    /// +---------------------+--------+
2517    /// | foo                 | values |
2518    /// | ---                 | ---    |
2519    /// | str                 | i32    |
2520    /// +=====================+========+
2521    /// | "ham-is-modified"   | 1      |
2522    /// +---------------------+--------+
2523    /// | "spam-is-modified"  | 2      |
2524    /// +---------------------+--------+
2525    /// | "egg"               | 3      |
2526    /// +---------------------+--------+
2527    /// | "bacon"             | 4      |
2528    /// +---------------------+--------+
2529    /// | "quack-is-modified" | 5      |
2530    /// +---------------------+--------+
2531    /// ```
2532    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2533    where
2534        F: FnOnce(&Column) -> PolarsResult<C>,
2535        C: IntoColumn,
2536    {
2537        let width = self.width();
2538        let col = self.columns.get_mut(idx).ok_or_else(|| {
2539            polars_err!(
2540                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2541                idx, width
2542            )
2543        })?;
2544        let name = col.name().clone();
2545
2546        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2547
2548        // make sure the name remains the same after applying the closure
2549        unsafe {
2550            let col = self.columns.get_unchecked_mut(idx);
2551            col.rename(name);
2552        }
2553        Ok(self)
2554    }
2555
2556    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2557    /// modification.
2558    ///
2559    /// # Example
2560    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2562    ///
2563    /// ```rust
2564    /// # use polars_core::prelude::*;
2565    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2566    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2567    /// let mut df = DataFrame::new(vec![s0, s1])?;
2568    ///
2569    /// // create a mask
2570    /// let values = df.column("values")?.as_materialized_series();
2571    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2572    ///
2573    /// df.try_apply("foo", |c| {
2574    ///     c.str()?
2575    ///     .set(&mask, Some("not_within_bounds"))
2576    /// });
2577    /// # Ok::<(), PolarsError>(())
2578    /// ```
2579    /// Results in:
2580    ///
2581    /// ```text
2582    /// +---------------------+--------+
2583    /// | foo                 | values |
2584    /// | ---                 | ---    |
2585    /// | str                 | i32    |
2586    /// +=====================+========+
2587    /// | "not_within_bounds" | 1      |
2588    /// +---------------------+--------+
2589    /// | "spam"              | 2      |
2590    /// +---------------------+--------+
2591    /// | "egg"               | 3      |
2592    /// +---------------------+--------+
2593    /// | "bacon"             | 4      |
2594    /// +---------------------+--------+
2595    /// | "not_within_bounds" | 5      |
2596    /// +---------------------+--------+
2597    /// ```
2598    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2599    where
2600        F: FnOnce(&Series) -> PolarsResult<C>,
2601        C: IntoColumn,
2602    {
2603        let idx = self.try_get_column_index(column)?;
2604        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2605    }
2606
2607    /// Slice the [`DataFrame`] along the rows.
2608    ///
2609    /// # Example
2610    ///
2611    /// ```rust
2612    /// # use polars_core::prelude::*;
2613    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2614    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2615    /// let sl: DataFrame = df.slice(2, 3);
2616    ///
2617    /// assert_eq!(sl.shape(), (3, 2));
2618    /// println!("{}", sl);
2619    /// # Ok::<(), PolarsError>(())
2620    /// ```
2621    /// Output:
2622    /// ```text
2623    /// shape: (3, 2)
2624    /// +-------+-------+
2625    /// | Fruit | Color |
2626    /// | ---   | ---   |
2627    /// | str   | str   |
2628    /// +=======+=======+
2629    /// | Grape | White |
2630    /// +-------+-------+
2631    /// | Fig   | White |
2632    /// +-------+-------+
2633    /// | Fig   | Red   |
2634    /// +-------+-------+
2635    /// ```
2636    #[must_use]
2637    pub fn slice(&self, offset: i64, length: usize) -> Self {
2638        if offset == 0 && length == self.height() {
2639            return self.clone();
2640        }
2641        if length == 0 {
2642            return self.clear();
2643        }
2644        let col = self
2645            .columns
2646            .iter()
2647            .map(|s| s.slice(offset, length))
2648            .collect::<Vec<_>>();
2649
2650        let height = if let Some(fst) = col.first() {
2651            fst.len()
2652        } else {
2653            let (_, length) = slice_offsets(offset, length, self.height());
2654            length
2655        };
2656
2657        unsafe { DataFrame::new_no_checks(height, col) }
2658    }
2659
2660    /// Split [`DataFrame`] at the given `offset`.
2661    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2662        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2663
2664        let (idx, _) = slice_offsets(offset, 0, self.height());
2665
2666        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2667        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2668        (a, b)
2669    }
2670
2671    pub fn clear(&self) -> Self {
2672        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2673        unsafe { DataFrame::new_no_checks(0, col) }
2674    }
2675
2676    #[must_use]
2677    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2678        if offset == 0 && length == self.height() {
2679            return self.clone();
2680        }
2681        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2682        unsafe { DataFrame::new_no_checks(length, columns) }
2683    }
2684
2685    #[must_use]
2686    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2687        if offset == 0 && length == self.height() {
2688            return self.clone();
2689        }
2690        // @scalar-opt
2691        let columns = self._apply_columns(&|s| {
2692            let mut out = s.slice(offset, length);
2693            out.shrink_to_fit();
2694            out
2695        });
2696        unsafe { DataFrame::new_no_checks(length, columns) }
2697    }
2698
2699    /// Get the head of the [`DataFrame`].
2700    ///
2701    /// # Example
2702    ///
2703    /// ```rust
2704    /// # use polars_core::prelude::*;
2705    /// let countries: DataFrame =
2706    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2707    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2708    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2709    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2710    /// assert_eq!(countries.shape(), (5, 4));
2711    ///
2712    /// println!("{}", countries.head(Some(3)));
2713    /// # Ok::<(), PolarsError>(())
2714    /// ```
2715    ///
2716    /// Output:
2717    ///
2718    /// ```text
2719    /// shape: (3, 4)
2720    /// +--------------------+---------------+---------------+------------+
2721    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2722    /// | ---                | ---           | ---           | ---        |
2723    /// | i32                | str           | str           | str        |
2724    /// +====================+===============+===============+============+
2725    /// | 1                  | North America | United States | Washington |
2726    /// +--------------------+---------------+---------------+------------+
2727    /// | 2                  | Asia          | China         | Beijing    |
2728    /// +--------------------+---------------+---------------+------------+
2729    /// | 3                  | Asia          | Japan         | Tokyo      |
2730    /// +--------------------+---------------+---------------+------------+
2731    /// ```
2732    #[must_use]
2733    pub fn head(&self, length: Option<usize>) -> Self {
2734        let col = self
2735            .columns
2736            .iter()
2737            .map(|c| c.head(length))
2738            .collect::<Vec<_>>();
2739
2740        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2741        let height = usize::min(height, self.height());
2742        unsafe { DataFrame::new_no_checks(height, col) }
2743    }
2744
2745    /// Get the tail of the [`DataFrame`].
2746    ///
2747    /// # Example
2748    ///
2749    /// ```rust
2750    /// # use polars_core::prelude::*;
2751    /// let countries: DataFrame =
2752    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2754    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2755    /// assert_eq!(countries.shape(), (5, 3));
2756    ///
2757    /// println!("{}", countries.tail(Some(2)));
2758    /// # Ok::<(), PolarsError>(())
2759    /// ```
2760    ///
2761    /// Output:
2762    ///
2763    /// ```text
2764    /// shape: (2, 3)
2765    /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
    /// | ---         | ---                | ---     |
    /// | i32         | f64                | str     |
    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
2773    /// +-------------+--------------------+---------+
2774    /// ```
2775    #[must_use]
2776    pub fn tail(&self, length: Option<usize>) -> Self {
2777        let col = self
2778            .columns
2779            .iter()
2780            .map(|c| c.tail(length))
2781            .collect::<Vec<_>>();
2782
2783        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2784        let height = usize::min(height, self.height());
2785        unsafe { DataFrame::new_no_checks(height, col) }
2786    }
2787
2788    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2789    ///
2790    /// # Panics
2791    ///
2792    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2793    ///
2794    /// This responsibility is left to the caller as we don't want to take mutable references here,
2795    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2796    /// as well.
2797    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2798        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2799        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2800        // as we must allocate arrow strings/binaries.
2801        let must_convert = compat_level.0 == 0;
2802        let parallel = parallel
2803            && must_convert
2804            && self.columns.len() > 1
2805            && self
2806                .columns
2807                .iter()
2808                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2809
2810        RecordBatchIter {
2811            columns: &self.columns,
2812            schema: Arc::new(
2813                self.columns
2814                    .iter()
2815                    .map(|c| c.field().to_arrow(compat_level))
2816                    .collect(),
2817            ),
2818            idx: 0,
2819            n_chunks: self.first_col_n_chunks(),
2820            compat_level,
2821            parallel,
2822        }
2823    }
2824
2825    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2826    ///
2827    /// # Panics
2828    ///
2829    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2830    ///
2831    /// This responsibility is left to the caller as we don't want to take mutable references here,
2832    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2833    /// as well.
2834    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2835        PhysRecordBatchIter {
2836            schema: Arc::new(
2837                self.get_columns()
2838                    .iter()
2839                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2840                    .collect(),
2841            ),
2842            arr_iters: self
2843                .materialized_column_iter()
2844                .map(|s| s.chunks().iter())
2845                .collect(),
2846        }
2847    }
2848
2849    /// Get a [`DataFrame`] with all the columns in reversed order.
2850    #[must_use]
2851    pub fn reverse(&self) -> Self {
2852        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2853        unsafe { DataFrame::new_no_checks(self.height(), col) }
2854    }
2855
2856    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2857    /// with `Nones`.
2858    ///
2859    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2860    #[must_use]
2861    pub fn shift(&self, periods: i64) -> Self {
2862        let col = self._apply_columns_par(&|s| s.shift(periods));
2863        unsafe { DataFrame::new_no_checks(self.height(), col) }
2864    }
2865
2866    /// Replace None values with one of the following strategies:
2867    /// * Forward fill (replace None with the previous value)
2868    /// * Backward fill (replace None with the next value)
2869    /// * Mean fill (replace None with the mean of the whole array)
2870    /// * Min fill (replace None with the minimum of the whole array)
2871    /// * Max fill (replace None with the maximum of the whole array)
2872    ///
2873    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2874    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2875        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2876
2877        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2878    }
2879
2880    /// Pipe different functions/ closure operations that work on a DataFrame together.
2881    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2882    where
2883        F: Fn(DataFrame) -> PolarsResult<B>,
2884    {
2885        f(self)
2886    }
2887
2888    /// Pipe different functions/ closure operations that work on a DataFrame together.
2889    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2890    where
2891        F: Fn(&mut DataFrame) -> PolarsResult<B>,
2892    {
2893        f(self)
2894    }
2895
2896    /// Pipe different functions/ closure operations that work on a DataFrame together.
2897    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2898    where
2899        F: Fn(DataFrame, Args) -> PolarsResult<B>,
2900    {
2901        f(self, args)
2902    }
2903
2904    /// Drop duplicate rows from a [`DataFrame`].
2905    /// *This fails when there is a column of type List in DataFrame*
2906    ///
2907    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2908    ///
2909    /// # Example
2910    ///
2911    /// ```no_run
2912    /// # use polars_core::prelude::*;
2913    /// let df = df! {
2914    ///               "flt" => [1., 1., 2., 2., 3., 3.],
2915    ///               "int" => [1, 1, 2, 2, 3, 3, ],
2916    ///               "str" => ["a", "a", "b", "b", "c", "c"]
2917    ///           }?;
2918    ///
2919    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2920    /// # Ok::<(), PolarsError>(())
2921    /// ```
2922    /// Returns
2923    ///
2924    /// ```text
2925    /// +-----+-----+-----+
2926    /// | flt | int | str |
2927    /// | --- | --- | --- |
2928    /// | f64 | i32 | str |
2929    /// +=====+=====+=====+
2930    /// | 1   | 1   | "a" |
2931    /// +-----+-----+-----+
2932    /// | 2   | 2   | "b" |
2933    /// +-----+-----+-----+
2934    /// | 3   | 3   | "c" |
2935    /// +-----+-----+-----+
2936    /// ```
2937    #[cfg(feature = "algorithm_group_by")]
2938    pub fn unique_stable(
2939        &self,
2940        subset: Option<&[String]>,
2941        keep: UniqueKeepStrategy,
2942        slice: Option<(i64, usize)>,
2943    ) -> PolarsResult<DataFrame> {
2944        self.unique_impl(
2945            true,
2946            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2947            keep,
2948            slice,
2949        )
2950    }
2951
2952    /// Unstable distinct. See [`DataFrame::unique_stable`].
2953    #[cfg(feature = "algorithm_group_by")]
2954    pub fn unique<I, S>(
2955        &self,
2956        subset: Option<&[String]>,
2957        keep: UniqueKeepStrategy,
2958        slice: Option<(i64, usize)>,
2959    ) -> PolarsResult<DataFrame> {
2960        self.unique_impl(
2961            false,
2962            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2963            keep,
2964            slice,
2965        )
2966    }
2967
2968    #[cfg(feature = "algorithm_group_by")]
2969    pub fn unique_impl(
2970        &self,
2971        maintain_order: bool,
2972        subset: Option<Vec<PlSmallStr>>,
2973        keep: UniqueKeepStrategy,
2974        slice: Option<(i64, usize)>,
2975    ) -> PolarsResult<Self> {
2976        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
2977        let mut df = self.clone();
2978        // take on multiple chunks is terrible
2979        df.as_single_chunk_par();
2980
2981        let columns = match (keep, maintain_order) {
2982            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
2983                let gb = df.group_by_stable(names)?;
2984                let groups = gb.get_groups();
2985                let (offset, len) = slice.unwrap_or((0, groups.len()));
2986                let groups = groups.slice(offset, len);
2987                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
2988            },
2989            (UniqueKeepStrategy::Last, true) => {
2990                // maintain order by last values, so the sorted groups are not correct as they
2991                // are sorted by the first value
2992                let gb = df.group_by(names)?;
2993                let groups = gb.get_groups();
2994
2995                let func = |g: GroupsIndicator| match g {
2996                    GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
2997                    GroupsIndicator::Slice([first, len]) => first + len - 1,
2998                };
2999
3000                let last_idx: NoNull<IdxCa> = match slice {
3001                    None => groups.iter().map(func).collect(),
3002                    Some((offset, len)) => {
3003                        let (offset, len) = slice_offsets(offset, len, groups.len());
3004                        groups.iter().skip(offset).take(len).map(func).collect()
3005                    },
3006                };
3007
3008                let last_idx = last_idx.sort(false);
3009                return Ok(unsafe { df.take_unchecked(&last_idx) });
3010            },
3011            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3012                let gb = df.group_by(names)?;
3013                let groups = gb.get_groups();
3014                let (offset, len) = slice.unwrap_or((0, groups.len()));
3015                let groups = groups.slice(offset, len);
3016                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3017            },
3018            (UniqueKeepStrategy::Last, false) => {
3019                let gb = df.group_by(names)?;
3020                let groups = gb.get_groups();
3021                let (offset, len) = slice.unwrap_or((0, groups.len()));
3022                let groups = groups.slice(offset, len);
3023                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3024            },
3025            (UniqueKeepStrategy::None, _) => {
3026                let df_part = df.select(names)?;
3027                let mask = df_part.is_unique()?;
3028                let mask = match slice {
3029                    None => mask,
3030                    Some((offset, len)) => mask.slice(offset, len),
3031                };
3032                return df.filter(&mask);
3033            },
3034        };
3035
3036        let height = Self::infer_height(&columns);
3037        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3038    }
3039
3040    /// Get a mask of all the unique rows in the [`DataFrame`].
3041    ///
3042    /// # Example
3043    ///
3044    /// ```no_run
3045    /// # use polars_core::prelude::*;
3046    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3047    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3048    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3049    ///
3050    /// assert!(ca.all());
3051    /// # Ok::<(), PolarsError>(())
3052    /// ```
3053    #[cfg(feature = "algorithm_group_by")]
3054    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3055        let gb = self.group_by(self.get_column_names_owned())?;
3056        let groups = gb.get_groups();
3057        Ok(is_unique_helper(
3058            groups,
3059            self.height() as IdxSize,
3060            true,
3061            false,
3062        ))
3063    }
3064
3065    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3066    ///
3067    /// # Example
3068    ///
3069    /// ```no_run
3070    /// # use polars_core::prelude::*;
3071    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3072    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3073    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3074    ///
3075    /// assert!(!ca.all());
3076    /// # Ok::<(), PolarsError>(())
3077    /// ```
3078    #[cfg(feature = "algorithm_group_by")]
3079    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3080        let gb = self.group_by(self.get_column_names_owned())?;
3081        let groups = gb.get_groups();
3082        Ok(is_unique_helper(
3083            groups,
3084            self.height() as IdxSize,
3085            false,
3086            true,
3087        ))
3088    }
3089
3090    /// Create a new [`DataFrame`] that shows the null counts per column.
3091    #[must_use]
3092    pub fn null_count(&self) -> Self {
3093        let cols = self
3094            .columns
3095            .iter()
3096            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3097            .collect();
3098        unsafe { Self::new_no_checks(1, cols) }
3099    }
3100
3101    /// Hash and combine the row values
3102    #[cfg(feature = "row_hash")]
3103    pub fn hash_rows(
3104        &mut self,
3105        hasher_builder: Option<PlSeedableRandomStateQuality>,
3106    ) -> PolarsResult<UInt64Chunked> {
3107        let dfs = split_df(self, POOL.current_num_threads(), false);
3108        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3109
3110        let mut iter = cas.into_iter();
3111        let mut acc_ca = iter.next().unwrap();
3112        for ca in iter {
3113            acc_ca.append(&ca)?;
3114        }
3115        Ok(acc_ca.rechunk().into_owned())
3116    }
3117
3118    /// Get the supertype of the columns in this DataFrame
3119    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3120        self.columns
3121            .iter()
3122            .map(|s| Ok(s.dtype().clone()))
3123            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3124    }
3125
3126    /// Take by index values given by the slice `idx`.
3127    /// # Warning
3128    /// Be careful with allowing threads when calling this in a large hot loop
3129    /// every thread split may be on rayon stack and lead to SO
3130    #[doc(hidden)]
3131    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3132        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3133    }
3134
3135    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3136    /// if the index value in `idx` are sorted. This will maintain sorted flags.
3137    ///
3138    /// # Warning
3139    /// Be careful with allowing threads when calling this in a large hot loop
3140    /// every thread split may be on rayon stack and lead to SO
3141    #[doc(hidden)]
3142    pub unsafe fn _take_unchecked_slice_sorted(
3143        &self,
3144        idx: &[IdxSize],
3145        allow_threads: bool,
3146        sorted: IsSorted,
3147    ) -> Self {
3148        #[cfg(debug_assertions)]
3149        {
3150            if idx.len() > 2 {
3151                match sorted {
3152                    IsSorted::Ascending => {
3153                        assert!(idx[0] <= idx[idx.len() - 1]);
3154                    },
3155                    IsSorted::Descending => {
3156                        assert!(idx[0] >= idx[idx.len() - 1]);
3157                    },
3158                    _ => {},
3159                }
3160            }
3161        }
3162        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3163        ca.set_sorted_flag(sorted);
3164        self.take_unchecked_impl(&ca, allow_threads)
3165    }
3166
3167    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3168    #[doc(hidden)]
3169    pub fn _partition_by_impl(
3170        &self,
3171        cols: &[PlSmallStr],
3172        stable: bool,
3173        include_key: bool,
3174        parallel: bool,
3175    ) -> PolarsResult<Vec<DataFrame>> {
3176        let selected_keys = self.select_columns(cols.iter().cloned())?;
3177        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3178        let groups = groups.take_groups();
3179
3180        // drop key columns prior to calculation if requested
3181        let df = if include_key {
3182            self.clone()
3183        } else {
3184            self.drop_many(cols.iter().cloned())
3185        };
3186
3187        if parallel {
3188            // don't parallelize this
3189            // there is a lot of parallelization in take and this may easily SO
3190            POOL.install(|| {
3191                match groups.as_ref() {
3192                    GroupsType::Idx(idx) => {
3193                        // Rechunk as the gather may rechunk for every group #17562.
3194                        let mut df = df.clone();
3195                        df.as_single_chunk_par();
3196                        Ok(idx
3197                            .into_par_iter()
3198                            .map(|(_, group)| {
3199                                // groups are in bounds
3200                                unsafe {
3201                                    df._take_unchecked_slice_sorted(
3202                                        group,
3203                                        false,
3204                                        IsSorted::Ascending,
3205                                    )
3206                                }
3207                            })
3208                            .collect())
3209                    },
3210                    GroupsType::Slice { groups, .. } => Ok(groups
3211                        .into_par_iter()
3212                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
3213                        .collect()),
3214                }
3215            })
3216        } else {
3217            match groups.as_ref() {
3218                GroupsType::Idx(idx) => {
3219                    // Rechunk as the gather may rechunk for every group #17562.
3220                    let mut df = df.clone();
3221                    df.as_single_chunk();
3222                    Ok(idx
3223                        .into_iter()
3224                        .map(|(_, group)| {
3225                            // groups are in bounds
3226                            unsafe {
3227                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3228                            }
3229                        })
3230                        .collect())
3231                },
3232                GroupsType::Slice { groups, .. } => Ok(groups
3233                    .iter()
3234                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3235                    .collect()),
3236            }
3237        }
3238    }
3239
3240    /// Split into multiple DataFrames partitioned by groups
3241    #[cfg(feature = "partition_by")]
3242    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3243    where
3244        I: IntoIterator<Item = S>,
3245        S: Into<PlSmallStr>,
3246    {
3247        let cols = cols
3248            .into_iter()
3249            .map(Into::into)
3250            .collect::<Vec<PlSmallStr>>();
3251        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3252    }
3253
3254    /// Split into multiple DataFrames partitioned by groups
3255    /// Order of the groups are maintained.
3256    #[cfg(feature = "partition_by")]
3257    pub fn partition_by_stable<I, S>(
3258        &self,
3259        cols: I,
3260        include_key: bool,
3261    ) -> PolarsResult<Vec<DataFrame>>
3262    where
3263        I: IntoIterator<Item = S>,
3264        S: Into<PlSmallStr>,
3265    {
3266        let cols = cols
3267            .into_iter()
3268            .map(Into::into)
3269            .collect::<Vec<PlSmallStr>>();
3270        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3271    }
3272
3273    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3274    /// inserted as columns.
3275    #[cfg(feature = "dtype-struct")]
3276    pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3277        let cols = cols.into_vec();
3278        self.unnest_impl(cols.into_iter().collect())
3279    }
3280
/// Shared implementation behind [`DataFrame::unnest`].
///
/// Walks the columns in order: a column whose name is contained in `cols` is
/// replaced by the fields of its struct, every other column is cloned as is.
///
/// # Errors
///
/// Propagates the error of `struct_()` for a selected column, reports a
/// column-not-found error for a name in `cols` that is absent from the schema,
/// and forwards any error raised by `DataFrame::new`.
#[cfg(feature = "dtype-struct")]
fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
    let width = self.width();
    // Capacity heuristic: room for twice the width, capped at 128 extra slots.
    let mut out = Vec::with_capacity((width * 2).min(width + 128));
    let mut n_unnested = 0;

    for column in &self.columns {
        if !cols.contains(column.name()) {
            out.push(column.clone());
            continue;
        }
        let struct_ca = column.struct_()?.clone();
        out.extend(struct_ca.fields_as_series().into_iter().map(Column::from));
        n_unnested += 1;
    }

    if n_unnested != cols.len() {
        // At least one requested name matched no column; look the names up in
        // the schema so the error carries the missing name.
        let schema = self.schema();
        for name in cols {
            if schema.get(name.as_str()).is_none() {
                return Err(polars_err!(col_not_found = name));
            }
        }
    }

    DataFrame::new(out)
}
3306
3307    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3308        cols.first().map_or(0, Column::len)
3309    }
3310
3311    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3312        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3313        // append_chunk or something like this. It is just quite difficult to make that safe.
3314        let df = DataFrame::from(rb);
3315        polars_ensure!(
3316            self.schema() == df.schema(),
3317            SchemaMismatch: "cannot append record batch with different schema",
3318        );
3319        self.vstack_mut_owned_unchecked(df);
3320        Ok(())
3321    }
3322}
3323
/// Iterator that yields one [`RecordBatch`] per chunk index of the borrowed columns.
pub struct RecordBatchIter<'a> {
    /// Columns whose chunks are exported to arrow.
    columns: &'a Vec<Column>,
    /// Arrow schema attached to every produced batch.
    schema: ArrowSchemaRef,
    /// Index of the next chunk to export.
    idx: usize,
    /// Number of chunks to yield; iteration stops once `idx` reaches it.
    n_chunks: usize,
    /// Compatibility level passed to `to_arrow` for every column.
    compat_level: CompatLevel,
    /// When `true`, the columns of a batch are converted in parallel on `POOL`.
    parallel: bool,
}
3332
3333impl Iterator for RecordBatchIter<'_> {
3334    type Item = RecordBatch;
3335
3336    fn next(&mut self) -> Option<Self::Item> {
3337        if self.idx >= self.n_chunks {
3338            return None;
3339        }
3340
3341        // Create a batch of the columns with the same chunk no.
3342        let batch_cols: Vec<ArrayRef> = if self.parallel {
3343            let iter = self
3344                .columns
3345                .par_iter()
3346                .map(Column::as_materialized_series)
3347                .map(|s| s.to_arrow(self.idx, self.compat_level));
3348            POOL.install(|| iter.collect())
3349        } else {
3350            self.columns
3351                .iter()
3352                .map(Column::as_materialized_series)
3353                .map(|s| s.to_arrow(self.idx, self.compat_level))
3354                .collect()
3355        };
3356        self.idx += 1;
3357
3358        let length = batch_cols.first().map_or(0, |arr| arr.len());
3359        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3360    }
3361
3362    fn size_hint(&self) -> (usize, Option<usize>) {
3363        let n = self.n_chunks - self.idx;
3364        (n, Some(n))
3365    }
3366}
3367
/// Iterator that yields [`RecordBatch`]es by advancing one chunk iterator per column in lockstep.
pub struct PhysRecordBatchIter<'a> {
    /// Arrow schema attached to every produced batch.
    schema: ArrowSchemaRef,
    /// One iterator over the chunks of each column.
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}
3372
3373impl Iterator for PhysRecordBatchIter<'_> {
3374    type Item = RecordBatch;
3375
3376    fn next(&mut self) -> Option<Self::Item> {
3377        let arrs = self
3378            .arr_iters
3379            .iter_mut()
3380            .map(|phys_iter| phys_iter.next().cloned())
3381            .collect::<Option<Vec<_>>>()?;
3382
3383        let length = arrs.first().map_or(0, |arr| arr.len());
3384        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3385    }
3386
3387    fn size_hint(&self) -> (usize, Option<usize>) {
3388        if let Some(iter) = self.arr_iters.first() {
3389            iter.size_hint()
3390        } else {
3391            (0, None)
3392        }
3393    }
3394}
3395
3396impl Default for DataFrame {
3397    fn default() -> Self {
3398        DataFrame::empty()
3399    }
3400}
3401
3402impl From<DataFrame> for Vec<Column> {
3403    fn from(df: DataFrame) -> Self {
3404        df.columns
3405    }
3406}
3407
3408// utility to test if we can vstack/extend the columns
3409fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3410    polars_ensure!(
3411        left.name() == right.name(),
3412        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3413        left.name(), right.name(),
3414    );
3415    Ok(())
3416}
3417
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a small frame with an integer column `days` and a float column `temp`,
    /// three rows each.
    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![s0, s1]).unwrap()
    }

    /// A single-chunk frame yields exactly one record batch holding all rows.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    /// Selecting a column and comparing it against a scalar matches exactly one row.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    /// Filtering a string column with a mask that selects nothing keeps a single chunk.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    /// Filtering a list column with a single `false` mask value yields one empty chunk.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    /// Slicing the first two rows keeps both columns.
    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    /// A freshly built frame does not need rechunking.
    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    /// Adding a multi-chunk column to a single-chunk frame requires rechunking.
    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s)?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    /// `with_column` replaces a column with an existing name and appends a new one otherwise.
    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // A column with an existing name must be replaced, not duplicated:
        // the call succeeds and the width stays the same.
        assert!(
            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert_eq!(df.width(), 1);
        // A column with a new name must be added to the frame.
        assert!(
            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert_eq!(df.width(), 2);
        assert!(df.column("bar").is_ok());
    }

    /// Stable unique with `First` keeps one row per distinct value.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    /// Stacking a slice onto a frame adds a second chunk.
    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunk
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    /// Stacking onto an empty frame takes over the height of the stacked frame.
    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

    /// `replace_or_add` names the added column after the given name, not the series name.
    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }
}