polars_core/frame/
mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
/// Selects which row is retained among rows that are not unique.
///
/// The default variant is [`UniqueKeepStrategy::Any`]. Through `IntoStaticStr` combined with
/// `serialize_all = "snake_case"`, each variant converts into its snake_case name.
///
/// NOTE(review): the code that interprets these variants is not part of this section; the
/// per-variant descriptions below follow the variant names — confirm against the consumers.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations.
    #[default]
    Any,
}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68    F: for<'a> FnMut(&'a T) -> &'a str,
69{
70    // Always unique.
71    if items.len() <= 1 {
72        return Ok(());
73    }
74
75    if items.len() <= 4 {
76        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
77        for i in 0..items.len() - 1 {
78            let name = get_name(&items[i]);
79            for other in items.iter().skip(i + 1) {
80                if name == get_name(other) {
81                    polars_bail!(duplicate = name);
82                }
83            }
84        }
85    } else {
86        let mut names = PlHashSet::with_capacity(items.len());
87        for item in items {
88            let name = get_name(item);
89            if !names.insert(name) {
90                polars_bail!(duplicate = name);
91            }
92        }
93    }
94    Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*;      if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
119/// ## Wrapping a `Vec<Series>`
120///
121/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138///                                       "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148/// The `Index<usize>` is implemented for the `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153///              "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165///              "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
#[derive(Clone)]
pub struct DataFrame {
    // Number of rows. Stored separately from `columns`, so a frame without any columns can
    // still carry a height.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between schema and reading.
    cached_schema: OnceLock<SchemaRef>,
}
181
182impl DataFrame {
183    pub fn clear_schema(&mut self) {
184        self.cached_schema = OnceLock::new();
185    }
186
187    #[inline]
188    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189        self.columns.iter()
190    }
191
192    #[inline]
193    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194        self.columns.iter().map(Column::as_materialized_series)
195    }
196
197    #[inline]
198    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199        self.columns.par_iter().map(Column::as_materialized_series)
200    }
201
202    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203    ///
204    /// # Implementation
205    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
206    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208    ///
209    /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
210    /// However, this function will yield a smaller number. This is because this function returns
211    /// the visible size of the buffer, not its total capacity.
212    ///
213    /// FFI buffers are included in this estimation.
214    pub fn estimated_size(&self) -> usize {
215        self.columns.iter().map(Column::estimated_size).sum()
216    }
217
218    // Reduce monomorphization.
219    fn try_apply_columns(
220        &self,
221        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222    ) -> PolarsResult<Vec<Column>> {
223        self.columns.iter().map(func).collect()
224    }
225    // Reduce monomorphization.
226    pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227        self.columns.iter().map(func).collect()
228    }
229    // Reduce monomorphization.
230    fn try_apply_columns_par(
231        &self,
232        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233    ) -> PolarsResult<Vec<Column>> {
234        POOL.install(|| self.columns.par_iter().map(func).collect())
235    }
236    // Reduce monomorphization.
237    pub fn _apply_columns_par(
238        &self,
239        func: &(dyn Fn(&Column) -> Column + Send + Sync),
240    ) -> Vec<Column> {
241        POOL.install(|| self.columns.par_iter().map(func).collect())
242    }
243
244    /// Get the index of the column.
245    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246        self.get_column_index(name)
247            .ok_or_else(|| polars_err!(col_not_found = name))
248    }
249
250    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251        polars_ensure!(
252            self.columns.iter().all(|s| s.name().as_str() != name),
253            Duplicate: "column with name {:?} is already present in the DataFrame", name
254        );
255        Ok(())
256    }
257
258    /// Reserve additional slots into the chunks of the series.
259    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260        for s in &mut self.columns {
261            if let Column::Series(s) = s {
262                // SAFETY:
263                // do not modify the data, simply resize.
264                unsafe { s.chunks_mut().reserve(additional) }
265            }
266        }
267    }
268
269    /// Create a DataFrame from a Vector of Series.
270    ///
271    /// Errors if a column names are not unique, or if heights are not all equal.
272    ///
273    /// # Example
274    ///
275    /// ```
276    /// # use polars_core::prelude::*;
277    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279    ///
280    /// let df = DataFrame::new(vec![s0, s1])?;
281    /// # Ok::<(), PolarsError>(())
282    /// ```
283    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284        DataFrame::validate_columns_slice(&columns)
285            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287    }
288
289    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290        for col in &columns {
291            polars_ensure!(
292                col.len() == height,
293                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294                columns[0].name(), height, col.name(), col.len()
295            );
296        }
297
298        Ok(DataFrame {
299            height,
300            columns,
301            cached_schema: OnceLock::new(),
302        })
303    }
304
305    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306    /// columns to match the other columns.
307    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308        // The length of the longest non-unit length column determines the
309        // broadcast length. If all columns are unit-length the broadcast length
310        // is one.
311        let broadcast_len = columns
312            .iter()
313            .map(|s| s.len())
314            .filter(|l| *l != 1)
315            .max()
316            .unwrap_or(1);
317        Self::new_with_broadcast_len(columns, broadcast_len)
318    }
319
320    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321    /// columns to broadcast_len.
322    pub fn new_with_broadcast_len(
323        columns: Vec<Column>,
324        broadcast_len: usize,
325    ) -> PolarsResult<Self> {
326        ensure_names_unique(&columns, |s| s.name().as_str())?;
327        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328    }
329
330    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331    /// columns to match the other columns.
332    ///  
333    /// # Safety
334    /// Does not check that the column names are unique (which they must be).
335    pub unsafe fn new_with_broadcast_no_namecheck(
336        mut columns: Vec<Column>,
337        broadcast_len: usize,
338    ) -> PolarsResult<Self> {
339        for col in &mut columns {
340            // Length not equal to the broadcast len, needs broadcast or is an error.
341            let len = col.len();
342            if len != broadcast_len {
343                if len != 1 {
344                    let name = col.name().to_owned();
345                    let extra_info =
346                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347                            format!(" (matching column '{}')", c.name())
348                        } else {
349                            String::new()
350                        };
351                    polars_bail!(
352                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353                    );
354                }
355                *col = col.new_from_index(0, broadcast_len);
356            }
357        }
358
359        let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362    }
363
364    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
365    ///
366    /// # Example
367    ///
368    /// ```rust
369    /// use polars_core::prelude::DataFrame;
370    /// static EMPTY: DataFrame = DataFrame::empty();
371    /// ```
372    pub const fn empty() -> Self {
373        Self::empty_with_height(0)
374    }
375
376    /// Creates an empty `DataFrame` with a specific `height`.
377    pub const fn empty_with_height(height: usize) -> Self {
378        DataFrame {
379            height,
380            columns: vec![],
381            cached_schema: OnceLock::new(),
382        }
383    }
384
385    /// Create an empty `DataFrame` with empty columns as per the `schema`.
386    pub fn empty_with_schema(schema: &Schema) -> Self {
387        let cols = schema
388            .iter()
389            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390            .collect();
391        unsafe { DataFrame::new_no_checks(0, cols) }
392    }
393
394    /// Create an empty `DataFrame` with empty columns as per the `schema`.
395    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396        let cols = schema
397            .iter_values()
398            .map(|fld| {
399                Column::from(Series::new_empty(
400                    fld.name.clone(),
401                    &(DataType::from_arrow_field(fld)),
402                ))
403            })
404            .collect();
405        unsafe { DataFrame::new_no_checks(0, cols) }
406    }
407
408    /// Create a new `DataFrame` with the given schema, only containing nulls.
409    pub fn full_null(schema: &Schema, height: usize) -> Self {
410        let columns = schema
411            .iter_fields()
412            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413            .collect();
414        unsafe { DataFrame::new_no_checks(height, columns) }
415    }
416
417    /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
418    ///
419    /// # Example
420    ///
421    /// ```rust
422    /// # use polars_core::prelude::*;
423    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
424    /// let s2 = Column::new("Area (kmĀ²)".into(), [106_460_000, 70_560_000]);
425    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
426    ///
427    /// assert_eq!(df.pop(), Some(s2));
428    /// assert_eq!(df.pop(), Some(s1));
429    /// assert_eq!(df.pop(), None);
430    /// assert!(df.is_empty());
431    /// # Ok::<(), PolarsError>(())
432    /// ```
433    pub fn pop(&mut self) -> Option<Column> {
434        self.clear_schema();
435
436        self.columns.pop()
437    }
438
439    /// Add a new column at index 0 that counts the rows.
440    ///
441    /// # Example
442    ///
443    /// ```
444    /// # use polars_core::prelude::*;
445    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446    /// assert_eq!(df1.shape(), (4, 1));
447    ///
448    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449    /// assert_eq!(df2.shape(), (4, 2));
450    /// println!("{}", df2);
451    ///
452    /// # Ok::<(), PolarsError>(())
453    /// ```
454    ///
455    /// Output:
456    ///
457    /// ```text
458    ///  shape: (4, 2)
459    ///  +-----+----------+
460    ///  | Id  | Name     |
461    ///  | --- | ---      |
462    ///  | u32 | str      |
463    ///  +=====+==========+
464    ///  | 0   | James    |
465    ///  +-----+----------+
466    ///  | 1   | Mary     |
467    ///  +-----+----------+
468    ///  | 2   | John     |
469    ///  +-----+----------+
470    ///  | 3   | Patricia |
471    ///  +-----+----------+
472    /// ```
473    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474        let mut columns = Vec::with_capacity(self.columns.len() + 1);
475        let offset = offset.unwrap_or(0);
476
477        let mut ca = IdxCa::from_vec(
478            name,
479            (offset..(self.height() as IdxSize) + offset).collect(),
480        );
481        ca.set_sorted_flag(IsSorted::Ascending);
482        columns.push(ca.into_series().into());
483
484        columns.extend_from_slice(&self.columns);
485        DataFrame::new(columns)
486    }
487
488    /// Add a row index column in place.
489    pub fn with_row_index_mut(&mut self, name: PlSmallStr, offset: Option<IdxSize>) -> &mut Self {
490        let offset = offset.unwrap_or(0);
491        let mut ca = IdxCa::from_vec(
492            name,
493            (offset..(self.height() as IdxSize) + offset).collect(),
494        );
495        ca.set_sorted_flag(IsSorted::Ascending);
496
497        self.clear_schema();
498        self.columns.insert(0, ca.into_series().into());
499        self
500    }
501
502    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
503    /// `Series`.
504    ///
505    /// Calculates the height from the first column or `0` if no columns are given.
506    ///
507    /// # Safety
508    ///
509    /// It is the callers responsibility to uphold the contract of all `Series`
510    /// having an equal length and a unique name, if not this may panic down the line.
511    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
512        let height = columns.first().map_or(0, Column::len);
513        unsafe { Self::new_no_checks(height, columns) }
514    }
515
516    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
517    /// `Series`.
518    ///
519    /// It is advised to use [DataFrame::new] in favor of this method.
520    ///
521    /// # Safety
522    ///
523    /// It is the callers responsibility to uphold the contract of all `Series`
524    /// having an equal length and a unique name, if not this may panic down the line.
525    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
526        if cfg!(debug_assertions) {
527            DataFrame::validate_columns_slice(&columns).unwrap();
528        }
529
530        unsafe { Self::_new_no_checks_impl(height, columns) }
531    }
532
533    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
534    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
535    /// constructed with this method is generally highly unsafe and should not be long-lived.
536    #[allow(clippy::missing_safety_doc)]
537    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
538        DataFrame {
539            height,
540            columns,
541            cached_schema: OnceLock::new(),
542        }
543    }
544
545    /// Shrink the capacity of this DataFrame to fit its length.
546    pub fn shrink_to_fit(&mut self) {
547        // Don't parallelize this. Memory overhead
548        for s in &mut self.columns {
549            s.shrink_to_fit();
550        }
551    }
552
553    /// Aggregate all the chunks in the DataFrame to a single chunk.
554    pub fn as_single_chunk(&mut self) -> &mut Self {
555        // Don't parallelize this. Memory overhead
556        for s in &mut self.columns {
557            if let Column::Series(s) = s {
558                *s = s.rechunk().into();
559            }
560        }
561        self
562    }
563
564    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
565    /// This may lead to more peak memory consumption.
566    pub fn as_single_chunk_par(&mut self) -> &mut Self {
567        if self.columns.iter().any(|c| c.n_chunks() > 1) {
568            self.columns = self._apply_columns_par(&|s| s.rechunk());
569        }
570        self
571    }
572
573    /// Rechunks all columns to only have a single chunk.
574    pub fn rechunk_mut(&mut self) {
575        // SAFETY: We never adjust the length or names of the columns.
576        let columns = unsafe { self.get_columns_mut() };
577
578        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
579            *col = col.rechunk();
580        }
581    }
582
    /// Replaces every binary or string `Column::Series` column with the result of applying
    /// `deshare` to each of its arrays via `apply_kernel`.
    ///
    /// Columns of other variants or other dtypes are left untouched.
    ///
    /// NOTE(review): presumably `deshare` makes the view arrays stop sharing buffers — the
    /// kernel itself is defined outside this file; confirm there.
    pub fn _deshare_views_mut(&mut self) {
        // SAFETY: We never adjust the length or names of the columns.
        unsafe {
            let columns = self.get_columns_mut();
            for col in columns {
                // Only `Series`-backed columns are processed.
                let Column::Series(s) = col else { continue };

                if let Ok(ca) = s.binary() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                } else if let Ok(ca) = s.str() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                }
            }
        }
    }
600
601    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
602    pub fn rechunk_to_record_batch(
603        self,
604        compat_level: CompatLevel,
605    ) -> RecordBatchT<Box<dyn Array>> {
606        let height = self.height();
607
608        let (schema, arrays) = self
609            .columns
610            .into_iter()
611            .map(|col| {
612                let mut series = col.take_materialized_series();
613                // Rechunk to one chunk if necessary
614                if series.n_chunks() > 1 {
615                    series = series.rechunk();
616                }
617                (
618                    series.field().to_arrow(compat_level),
619                    series.to_arrow(0, compat_level),
620                )
621            })
622            .collect();
623
624        RecordBatchT::new(height, Arc::new(schema), arrays)
625    }
626
    /// Returns true if the chunks of the columns do not align and re-chunking should be done.
    ///
    /// The check proceeds in stages:
    /// 1. if the `Series`-backed columns do not all have the same number of chunks, rechunk;
    /// 2. if the first column has a single chunk, rechunk only if some other column does not;
    /// 3. if the first column has more chunks than the frame has rows, rechunk;
    /// 4. otherwise compare the chunk lengths of every column with those of the first column.
    ///
    /// A frame without columns never needs rechunking.
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns at all.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                // (the lower bound of the size hint is used as the number of chunks).
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare every remaining column's chunk lengths position by position against
                // the first column's; any difference means the chunks are misaligned.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
667
668    /// Ensure all the chunks in the [`DataFrame`] are aligned.
669    pub fn align_chunks_par(&mut self) -> &mut Self {
670        if self.should_rechunk() {
671            self.as_single_chunk_par()
672        } else {
673            self
674        }
675    }
676
677    pub fn align_chunks(&mut self) -> &mut Self {
678        if self.should_rechunk() {
679            self.as_single_chunk()
680        } else {
681            self
682        }
683    }
684
685    /// Get the [`DataFrame`] schema.
686    ///
687    /// # Example
688    ///
689    /// ```rust
690    /// # use polars_core::prelude::*;
691    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
692    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
693    ///
694    /// let f1: Field = Field::new("Thing".into(), DataType::String);
695    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
696    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
697    ///
698    /// assert_eq!(&**df.schema(), &sc);
699    /// # Ok::<(), PolarsError>(())
700    /// ```
701    pub fn schema(&self) -> &SchemaRef {
702        let out = self.cached_schema.get_or_init(|| {
703            Arc::new(
704                self.columns
705                    .iter()
706                    .map(|x| (x.name().clone(), x.dtype().clone()))
707                    .collect(),
708            )
709        });
710
711        debug_assert_eq!(out.len(), self.width());
712
713        out
714    }
715
716    /// Get a reference to the [`DataFrame`] columns.
717    ///
718    /// # Example
719    ///
720    /// ```rust
721    /// # use polars_core::prelude::*;
722    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
723    ///                         "Symbol" => ["A", "C", "G", "T"])?;
724    /// let columns: &[Column] = df.get_columns();
725    ///
726    /// assert_eq!(columns[0].name(), "Name");
727    /// assert_eq!(columns[1].name(), "Symbol");
728    /// # Ok::<(), PolarsError>(())
729    /// ```
730    #[inline]
731    pub fn get_columns(&self) -> &[Column] {
732        &self.columns
733    }
734
735    #[inline]
736    /// Get mutable access to the underlying columns.
737    ///
738    /// # Safety
739    ///
740    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
741    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
742    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
743    /// calling [`DataFrame::clear_schema`].
744    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
745        &mut self.columns
746    }
747
748    #[inline]
749    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
750    pub fn clear_columns(&mut self) {
751        unsafe { self.get_columns_mut() }.clear();
752        self.clear_schema();
753    }
754
755    #[inline]
756    /// Extend the columns without checking for name collisions or height.
757    ///
758    /// # Safety
759    ///
760    /// The caller needs to ensure that:
761    /// - Column names are unique within the resulting [`DataFrame`].
762    /// - The length of each appended column matches the height of the [`DataFrame`]. For
763    ///   `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
764    ///   with [`DataFrame::set_height`].
765    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
766        unsafe { self.get_columns_mut() }.extend(iter);
767        self.clear_schema();
768    }
769
770    /// Take ownership of the underlying columns vec.
771    pub fn take_columns(self) -> Vec<Column> {
772        self.columns
773    }
774
775    /// Iterator over the columns as [`Series`].
776    ///
777    /// # Example
778    ///
779    /// ```rust
780    /// # use polars_core::prelude::*;
781    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
782    /// let s2 = Column::new("Formula".into(), ["aĀ²+bĀ²=cĀ²", "H=-Ī£[P(x)log|P(x)|]"]);
783    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
784    ///
785    /// let mut iterator = df.iter();
786    ///
787    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
788    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
789    /// assert_eq!(iterator.next(), None);
790    /// # Ok::<(), PolarsError>(())
791    /// ```
792    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
793        self.materialized_column_iter()
794    }
795
796    /// # Example
797    ///
798    /// ```rust
799    /// # use polars_core::prelude::*;
800    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
801    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
802    ///
803    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
804    /// # Ok::<(), PolarsError>(())
805    /// ```
806    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
807        self.columns.iter().map(|s| s.name()).collect()
808    }
809
810    /// Get the [`Vec<PlSmallStr>`] representing the column names.
811    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
812        self.columns.iter().map(|s| s.name().clone()).collect()
813    }
814
815    pub fn get_column_names_str(&self) -> Vec<&str> {
816        self.columns.iter().map(|s| s.name().as_str()).collect()
817    }
818
819    /// Set the column names.
820    /// # Example
821    ///
822    /// ```rust
823    /// # use polars_core::prelude::*;
824    /// let mut df: DataFrame = df!("Mathematical set" => ["ā„•", "ā„¤", "š”»", "ā„š", "ā„", "ā„‚"])?;
825    /// df.set_column_names(["Set"])?;
826    ///
827    /// assert_eq!(df.get_column_names(), &["Set"]);
828    /// # Ok::<(), PolarsError>(())
829    /// ```
830    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
831    where
832        I: IntoIterator<Item = S>,
833        S: Into<PlSmallStr>,
834    {
835        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
836        self._set_column_names_impl(names.as_slice())
837    }
838
839    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
840        polars_ensure!(
841            names.len() == self.width(),
842            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
843            names.len(), self.width()
844        );
845        ensure_names_unique(names, |s| s.as_str())?;
846
847        let columns = mem::take(&mut self.columns);
848        self.columns = columns
849            .into_iter()
850            .zip(names)
851            .map(|(s, name)| {
852                let mut s = s;
853                s.rename(name.clone());
854                s
855            })
856            .collect();
857        self.clear_schema();
858        Ok(())
859    }
860
861    /// Get the data types of the columns in the [`DataFrame`].
862    ///
863    /// # Example
864    ///
865    /// ```rust
866    /// # use polars_core::prelude::*;
867    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
868    ///                                "Fraction" => [0.965, 0.035])?;
869    ///
870    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
871    /// # Ok::<(), PolarsError>(())
872    /// ```
873    pub fn dtypes(&self) -> Vec<DataType> {
874        self.columns.iter().map(|s| s.dtype().clone()).collect()
875    }
876
877    pub(crate) fn first_series_column(&self) -> Option<&Series> {
878        self.columns.iter().find_map(|col| col.as_series())
879    }
880
881    /// The number of chunks for the first column.
882    pub fn first_col_n_chunks(&self) -> usize {
883        match self.first_series_column() {
884            None if self.columns.is_empty() => 0,
885            None => 1,
886            Some(s) => s.n_chunks(),
887        }
888    }
889
890    /// The highest number of chunks for any column.
891    pub fn max_n_chunks(&self) -> usize {
892        self.columns
893            .iter()
894            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
895            .max()
896            .unwrap_or(0)
897    }
898
899    /// Get a reference to the schema fields of the [`DataFrame`].
900    ///
901    /// # Example
902    ///
903    /// ```rust
904    /// # use polars_core::prelude::*;
905    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
906    ///                            "Fraction" => [0.708, 0.292])?;
907    ///
908    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
909    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
910    ///
911    /// assert_eq!(earth.fields(), &[f1, f2]);
912    /// # Ok::<(), PolarsError>(())
913    /// ```
914    pub fn fields(&self) -> Vec<Field> {
915        self.columns
916            .iter()
917            .map(|s| s.field().into_owned())
918            .collect()
919    }
920
921    /// Get (height, width) of the [`DataFrame`].
922    ///
923    /// # Example
924    ///
925    /// ```rust
926    /// # use polars_core::prelude::*;
927    /// let df0: DataFrame = DataFrame::default();
928    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
929    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
930    ///                          "2" => [1, 2, 3, 4, 5])?;
931    ///
932    /// assert_eq!(df0.shape(), (0 ,0));
933    /// assert_eq!(df1.shape(), (5, 1));
934    /// assert_eq!(df2.shape(), (5, 2));
935    /// # Ok::<(), PolarsError>(())
936    /// ```
937    pub fn shape(&self) -> (usize, usize) {
938        (self.height, self.columns.len())
939    }
940
941    /// Get the width of the [`DataFrame`] which is the number of columns.
942    ///
943    /// # Example
944    ///
945    /// ```rust
946    /// # use polars_core::prelude::*;
947    /// let df0: DataFrame = DataFrame::default();
948    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
949    /// let df2: DataFrame = df!("Series 1" => [0; 0],
950    ///                          "Series 2" => [0; 0])?;
951    ///
952    /// assert_eq!(df0.width(), 0);
953    /// assert_eq!(df1.width(), 1);
954    /// assert_eq!(df2.width(), 2);
955    /// # Ok::<(), PolarsError>(())
956    /// ```
957    pub fn width(&self) -> usize {
958        self.columns.len()
959    }
960
961    /// Get the height of the [`DataFrame`] which is the number of rows.
962    ///
963    /// # Example
964    ///
965    /// ```rust
966    /// # use polars_core::prelude::*;
967    /// let df0: DataFrame = DataFrame::default();
    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
970    ///
971    /// assert_eq!(df0.height(), 0);
972    /// assert_eq!(df1.height(), 2);
973    /// assert_eq!(df2.height(), 5);
974    /// # Ok::<(), PolarsError>(())
975    /// ```
976    pub fn height(&self) -> usize {
977        self.height
978    }
979
980    /// Returns the size as number of rows * number of columns
981    pub fn size(&self) -> usize {
982        let s = self.shape();
983        s.0 * s.1
984    }
985
986    /// Returns `true` if the [`DataFrame`] contains no rows.
987    ///
988    /// # Example
989    ///
990    /// ```rust
991    /// # use polars_core::prelude::*;
992    /// let df1: DataFrame = DataFrame::default();
993    /// assert!(df1.is_empty());
994    ///
995    /// let df2: DataFrame = df!("First name" => ["Forever"],
996    ///                          "Last name" => ["Alone"])?;
997    /// assert!(!df2.is_empty());
998    /// # Ok::<(), PolarsError>(())
999    /// ```
1000    pub fn is_empty(&self) -> bool {
1001        matches!(self.shape(), (0, _) | (_, 0))
1002    }
1003
1004    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1005    ///
1006    /// # Safety
1007    ///
1008    /// This needs to be equal to the length of all the columns.
1009    pub unsafe fn set_height(&mut self, height: usize) {
1010        self.height = height;
1011    }
1012
1013    /// Add multiple [`Series`] to a [`DataFrame`].
1014    /// The added `Series` are required to have the same length.
1015    ///
1016    /// # Example
1017    ///
1018    /// ```rust
1019    /// # use polars_core::prelude::*;
1020    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1021    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1022    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1023    ///
1024    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1025    /// assert_eq!(df2.shape(), (3, 3));
1026    /// println!("{}", df2);
1027    /// # Ok::<(), PolarsError>(())
1028    /// ```
1029    ///
1030    /// Output:
1031    ///
1032    /// ```text
1033    /// shape: (3, 3)
1034    /// +---------+--------+----------+
1035    /// | Element | Proton | Electron |
1036    /// | ---     | ---    | ---      |
1037    /// | str     | i32    | i32      |
1038    /// +=========+========+==========+
1039    /// | Copper  | 29     | 29       |
1040    /// +---------+--------+----------+
1041    /// | Silver  | 47     | 47       |
1042    /// +---------+--------+----------+
1043    /// | Gold    | 79     | 79       |
1044    /// +---------+--------+----------+
1045    /// ```
1046    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1047        let mut new_cols = self.columns.clone();
1048        new_cols.extend_from_slice(columns);
1049        DataFrame::new(new_cols)
1050    }
1051
1052    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1053    ///
1054    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1055    ///
1056    /// # Example
1057    ///
1058    /// ```rust
1059    /// # use polars_core::prelude::*;
1060    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1061    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1062    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1063    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1064    ///
1065    /// let df3: DataFrame = df1.vstack(&df2)?;
1066    ///
1067    /// assert_eq!(df3.shape(), (5, 2));
1068    /// println!("{}", df3);
1069    /// # Ok::<(), PolarsError>(())
1070    /// ```
1071    ///
1072    /// Output:
1073    ///
1074    /// ```text
1075    /// shape: (5, 2)
1076    /// +-----------+-------------------+
1077    /// | Element   | Melting Point (K) |
1078    /// | ---       | ---               |
1079    /// | str       | f64               |
1080    /// +===========+===================+
1081    /// | Copper    | 1357.77           |
1082    /// +-----------+-------------------+
1083    /// | Silver    | 1234.93           |
1084    /// +-----------+-------------------+
1085    /// | Gold      | 1337.33           |
1086    /// +-----------+-------------------+
1087    /// | Platinum  | 2041.4            |
1088    /// +-----------+-------------------+
1089    /// | Palladium | 1828.05           |
1090    /// +-----------+-------------------+
1091    /// ```
1092    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1093        let mut df = self.clone();
1094        df.vstack_mut(other)?;
1095        Ok(df)
1096    }
1097
1098    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1099    ///
1100    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1101    ///
1102    /// # Example
1103    ///
1104    /// ```rust
1105    /// # use polars_core::prelude::*;
1106    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1107    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1108    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1109    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1110    ///
1111    /// df1.vstack_mut(&df2)?;
1112    ///
1113    /// assert_eq!(df1.shape(), (5, 2));
1114    /// println!("{}", df1);
1115    /// # Ok::<(), PolarsError>(())
1116    /// ```
1117    ///
1118    /// Output:
1119    ///
1120    /// ```text
1121    /// shape: (5, 2)
1122    /// +-----------+-------------------+
1123    /// | Element   | Melting Point (K) |
1124    /// | ---       | ---               |
1125    /// | str       | f64               |
1126    /// +===========+===================+
1127    /// | Copper    | 1357.77           |
1128    /// +-----------+-------------------+
1129    /// | Silver    | 1234.93           |
1130    /// +-----------+-------------------+
1131    /// | Gold      | 1337.33           |
1132    /// +-----------+-------------------+
1133    /// | Platinum  | 2041.4            |
1134    /// +-----------+-------------------+
1135    /// | Palladium | 1828.05           |
1136    /// +-----------+-------------------+
1137    /// ```
1138    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1139        if self.width() != other.width() {
1140            polars_ensure!(
1141                self.width() == 0,
1142                ShapeMismatch:
1143                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1144                self.width(), other.width(),
1145            );
1146            self.columns.clone_from(&other.columns);
1147            self.height = other.height;
1148            return Ok(self);
1149        }
1150
1151        self.columns
1152            .iter_mut()
1153            .zip(other.columns.iter())
1154            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1155                ensure_can_extend(&*left, right)?;
1156                left.append(right).map_err(|e| {
1157                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1158                })?;
1159                Ok(())
1160            })?;
1161        self.height += other.height;
1162        Ok(self)
1163    }
1164
1165    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1166    ///
1167    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1168    ///
1169    /// # Panics
1170    /// Panics if the schema's don't match.
1171    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1172        self.columns
1173            .iter_mut()
1174            .zip(other.columns.iter())
1175            .for_each(|(left, right)| {
1176                left.append(right)
1177                    .map_err(|e| {
1178                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1179                    })
1180                    .expect("should not fail");
1181            });
1182        self.height += other.height;
1183    }
1184
1185    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1186    ///
1187    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1188    ///
1189    /// # Panics
1190    /// Panics if the schema's don't match.
1191    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1192        self.columns
1193            .iter_mut()
1194            .zip(other.columns)
1195            .for_each(|(left, right)| {
1196                left.append_owned(right).expect("should not fail");
1197            });
1198        self.height += other.height;
1199    }
1200
1201    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1202    ///
1203    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1204    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1205    ///
1206    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1207    /// and thus will yield faster queries.
1208    ///
1209    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1210    /// online operations where you add `n` rows and rerun a query.
1211    ///
1212    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1214    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1215    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1216        polars_ensure!(
1217            self.width() == other.width(),
1218            ShapeMismatch:
1219            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1220            self.width(), other.width(),
1221        );
1222
1223        self.columns
1224            .iter_mut()
1225            .zip(other.columns.iter())
1226            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1227                ensure_can_extend(&*left, right)?;
1228                left.extend(right).map_err(|e| {
1229                    e.context(format!("failed to extend column '{}'", right.name()).into())
1230                })?;
1231                Ok(())
1232            })?;
1233        self.height += other.height;
1234        self.clear_schema();
1235        Ok(())
1236    }
1237
1238    /// Remove a column by name and return the column removed.
1239    ///
1240    /// # Example
1241    ///
1242    /// ```rust
1243    /// # use polars_core::prelude::*;
1244    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1245    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1246    ///
1247    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1248    /// assert!(s1.is_err());
1249    ///
1250    /// let s2: Column = df.drop_in_place("Animal")?;
1251    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1252    /// # Ok::<(), PolarsError>(())
1253    /// ```
1254    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1255        let idx = self.check_name_to_idx(name)?;
1256        self.clear_schema();
1257        Ok(self.columns.remove(idx))
1258    }
1259
1260    /// Return a new [`DataFrame`] where all null values are dropped.
1261    ///
1262    /// # Example
1263    ///
1264    /// ```no_run
1265    /// # use polars_core::prelude::*;
1266    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1267    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1268    /// assert_eq!(df1.shape(), (3, 2));
1269    ///
1270    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1271    /// assert_eq!(df2.shape(), (1, 2));
1272    /// println!("{}", df2);
1273    /// # Ok::<(), PolarsError>(())
1274    /// ```
1275    ///
1276    /// Output:
1277    ///
1278    /// ```text
1279    /// shape: (1, 2)
1280    /// +---------+---------------------+
1281    /// | Country | Tax revenue (% GDP) |
1282    /// | ---     | ---                 |
1283    /// | str     | f64                 |
1284    /// +=========+=====================+
1285    /// | Malta   | 32.7                |
1286    /// +---------+---------------------+
1287    /// ```
1288    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1289    where
1290        for<'a> &'a S: Into<PlSmallStr>,
1291    {
1292        if let Some(v) = subset {
1293            let v = self.select_columns(v)?;
1294            self._drop_nulls_impl(v.as_slice())
1295        } else {
1296            self._drop_nulls_impl(self.columns.as_slice())
1297        }
1298    }
1299
1300    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1301        // fast path for no nulls in df
1302        if subset.iter().all(|s| !s.has_nulls()) {
1303            return Ok(self.clone());
1304        }
1305
1306        let mut iter = subset.iter();
1307
1308        let mask = iter
1309            .next()
1310            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1311        let mut mask = mask.is_not_null();
1312
1313        for c in iter {
1314            mask = mask & c.is_not_null();
1315        }
1316        self.filter(&mask)
1317    }
1318
1319    /// Drop a column by name.
1320    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1321    /// the current one in place.
1322    ///
1323    /// # Example
1324    ///
1325    /// ```rust
1326    /// # use polars_core::prelude::*;
    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1328    /// let df2: DataFrame = df1.drop("Ray type")?;
1329    ///
1330    /// assert!(df2.is_empty());
1331    /// # Ok::<(), PolarsError>(())
1332    /// ```
1333    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1334        let idx = self.check_name_to_idx(name)?;
1335        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1336
1337        self.columns.iter().enumerate().for_each(|(i, s)| {
1338            if i != idx {
1339                new_cols.push(s.clone())
1340            }
1341        });
1342
1343        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1344    }
1345
1346    /// Drop columns that are in `names`.
1347    pub fn drop_many<I, S>(&self, names: I) -> Self
1348    where
1349        I: IntoIterator<Item = S>,
1350        S: Into<PlSmallStr>,
1351    {
1352        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1353        self.drop_many_amortized(&names)
1354    }
1355
1356    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1357    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1358        if names.is_empty() {
1359            return self.clone();
1360        }
1361        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1362        self.columns.iter().for_each(|s| {
1363            if !names.contains(s.name()) {
1364                new_cols.push(s.clone())
1365            }
1366        });
1367
1368        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1369    }
1370
1371    /// Insert a new column at a given index without checking for duplicates.
1372    /// This can leave the [`DataFrame`] at an invalid state
1373    fn insert_column_no_name_check(
1374        &mut self,
1375        index: usize,
1376        column: Column,
1377    ) -> PolarsResult<&mut Self> {
1378        polars_ensure!(
1379            self.width() == 0 || column.len() == self.height(),
1380            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1381            column.len(), self.height(),
1382        );
1383
1384        if self.width() == 0 {
1385            self.height = column.len();
1386        }
1387
1388        self.columns.insert(index, column);
1389        self.clear_schema();
1390        Ok(self)
1391    }
1392
1393    /// Insert a new column at a given index.
1394    pub fn insert_column<S: IntoColumn>(
1395        &mut self,
1396        index: usize,
1397        column: S,
1398    ) -> PolarsResult<&mut Self> {
1399        let column = column.into_column();
1400        self.check_already_present(column.name().as_str())?;
1401        self.insert_column_no_name_check(index, column)
1402    }
1403
1404    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1405        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1406            self.replace_column(idx, column)?;
1407        } else {
1408            if self.width() == 0 {
1409                self.height = column.len();
1410            }
1411
1412            self.columns.push(column);
1413            self.clear_schema();
1414        }
1415        Ok(())
1416    }
1417
1418    /// Add a new column to this [`DataFrame`] or replace an existing one.
1419    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1420        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1421            let height = df.height();
1422            if column.len() == 1 && height > 1 {
1423                column = column.new_from_index(0, height);
1424            }
1425
1426            if column.len() == height || df.get_columns().is_empty() {
1427                df.add_column_by_search(column)?;
1428                Ok(df)
1429            }
1430            // special case for literals
1431            else if height == 0 && column.len() == 1 {
1432                let s = column.clear();
1433                df.add_column_by_search(s)?;
1434                Ok(df)
1435            } else {
1436                polars_bail!(
1437                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1438                    column.len(), height,
1439                );
1440            }
1441        }
1442        let column = column.into_column();
1443        inner(self, column)
1444    }
1445
1446    /// Adds a column to the [`DataFrame`] without doing any checks
1447    /// on length or duplicates.
1448    ///
1449    /// # Safety
1450    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1451    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1452        debug_assert!(self.width() == 0 || self.height() == column.len());
1453        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1454
1455        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1456        // properly for `width` == 0.
1457        if self.width() == 0 {
1458            unsafe { self.set_height(column.len()) };
1459        }
1460        unsafe { self.get_columns_mut() }.push(column);
1461        self.clear_schema();
1462
1463        self
1464    }
1465
1466    // Note: Schema can be both input or output_schema
1467    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1468        let name = c.name();
1469        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1470            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1471                // Given schema is output_schema and we can push.
1472                if idx == self.columns.len() {
1473                    if self.width() == 0 {
1474                        self.height = c.len();
1475                    }
1476
1477                    self.columns.push(c);
1478                    self.clear_schema();
1479                }
1480                // Schema is incorrect fallback to search
1481                else {
1482                    debug_assert!(false);
1483                    self.add_column_by_search(c)?;
1484                }
1485            } else {
1486                self.replace_column(idx, c)?;
1487            }
1488        } else {
1489            if self.width() == 0 {
1490                self.height = c.len();
1491            }
1492
1493            self.columns.push(c);
1494            self.clear_schema();
1495        }
1496
1497        Ok(())
1498    }
1499
1500    // Note: Schema can be both input or output_schema
1501    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1502        for (i, s) in series.into_iter().enumerate() {
1503            // we need to branch here
1504            // because users can add multiple columns with the same name
1505            if i == 0 || schema.get(s.name().as_str()).is_some() {
1506                self.with_column_and_schema(s.into_column(), schema)?;
1507            } else {
1508                self.with_column(s.clone().into_column())?;
1509            }
1510        }
1511        Ok(())
1512    }
1513
1514    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1515        for (i, s) in columns.into_iter().enumerate() {
1516            // we need to branch here
1517            // because users can add multiple columns with the same name
1518            if i == 0 || schema.get(s.name().as_str()).is_some() {
1519                self.with_column_and_schema(s, schema)?;
1520            } else {
1521                self.with_column(s.clone())?;
1522            }
1523        }
1524
1525        Ok(())
1526    }
1527
1528    /// Add a new column to this [`DataFrame`] or replace an existing one.
1529    /// Uses an existing schema to amortize lookups.
1530    /// If the schema is incorrect, we will fallback to linear search.
1531    ///
1532    /// Note: Schema can be both input or output_schema
1533    pub fn with_column_and_schema<C: IntoColumn>(
1534        &mut self,
1535        column: C,
1536        schema: &Schema,
1537    ) -> PolarsResult<&mut Self> {
1538        let mut column = column.into_column();
1539
1540        let height = self.height();
1541        if column.len() == 1 && height > 1 {
1542            column = column.new_from_index(0, height);
1543        }
1544
1545        if column.len() == height || self.columns.is_empty() {
1546            self.add_column_by_schema(column, schema)?;
1547            Ok(self)
1548        }
1549        // special case for literals
1550        else if height == 0 && column.len() == 1 {
1551            let s = column.clear();
1552            self.add_column_by_schema(s, schema)?;
1553            Ok(self)
1554        } else {
1555            polars_bail!(
1556                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1557                column.len(), height,
1558            );
1559        }
1560    }
1561
1562    /// Get a row in the [`DataFrame`]. Beware this is slow.
1563    ///
1564    /// # Example
1565    ///
1566    /// ```
1567    /// # use polars_core::prelude::*;
1568    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1569    ///     df.get(idx)
1570    /// }
1571    /// ```
1572    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1573        match self.columns.first() {
1574            Some(s) => {
1575                if s.len() <= idx {
1576                    return None;
1577                }
1578            },
1579            None => return None,
1580        }
1581        // SAFETY: we just checked bounds
1582        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1583    }
1584
    /// Select a [`Column`] by index.
1586    ///
1587    /// # Example
1588    ///
1589    /// ```rust
1590    /// # use polars_core::prelude::*;
1591    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1592    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1593    ///
1594    /// let s1: Option<&Column> = df.select_at_idx(0);
1595    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1596    ///
1597    /// assert_eq!(s1, Some(&s2));
1598    /// # Ok::<(), PolarsError>(())
1599    /// ```
1600    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1601        self.columns.get(idx)
1602    }
1603
1604    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1605    ///
1606    /// # Examples
1607    ///
1608    /// ```rust
1609    /// # use polars_core::prelude::*;
1610    /// let df = df! {
1611    ///     "0" => [0, 0, 0],
1612    ///     "1" => [1, 1, 1],
1613    ///     "2" => [2, 2, 2]
1614    /// }?;
1615    ///
1616    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1617    /// assert!(df.equals(&df.select_by_range(..)?));
1618    /// # Ok::<(), PolarsError>(())
1619    /// ```
1620    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1621    where
1622        R: ops::RangeBounds<usize>,
1623    {
1624        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1625        // because it is the nightly feature. We should change here if this function were stable.
1626        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1627        where
1628            R: ops::RangeBounds<usize>,
1629        {
1630            let len = bounds.end;
1631
1632            let start: ops::Bound<&usize> = range.start_bound();
1633            let start = match start {
1634                ops::Bound::Included(&start) => start,
1635                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1636                    panic!("attempted to index slice from after maximum usize");
1637                }),
1638                ops::Bound::Unbounded => 0,
1639            };
1640
1641            let end: ops::Bound<&usize> = range.end_bound();
1642            let end = match end {
1643                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1644                    panic!("attempted to index slice up to maximum usize");
1645                }),
1646                ops::Bound::Excluded(&end) => end,
1647                ops::Bound::Unbounded => len,
1648            };
1649
1650            if start > end {
1651                panic!("slice index starts at {start} but ends at {end}");
1652            }
1653            if end > len {
1654                panic!("range end index {end} out of range for slice of length {len}",);
1655            }
1656
1657            ops::Range { start, end }
1658        }
1659
1660        let colnames = self.get_column_names_owned();
1661        let range = get_range(range, ..colnames.len());
1662
1663        self._select_impl(&colnames[range])
1664    }
1665
    /// Get the index of a [`Column`] by name.
1667    /// # Example
1668    ///
1669    /// ```rust
1670    /// # use polars_core::prelude::*;
1671    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1672    ///                         "Health" => [100, 200, 500],
1673    ///                         "Mana" => [250, 100, 0],
1674    ///                         "Strength" => [30, 150, 300])?;
1675    ///
1676    /// assert_eq!(df.get_column_index("Name"), Some(0));
1677    /// assert_eq!(df.get_column_index("Health"), Some(1));
1678    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1679    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1680    /// assert_eq!(df.get_column_index("Haste"), None);
1681    /// # Ok::<(), PolarsError>(())
1682    /// ```
1683    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1684        let schema = self.schema();
1685        if let Some(idx) = schema.index_of(name) {
1686            if self
1687                .get_columns()
1688                .get(idx)
1689                .is_some_and(|c| c.name() == name)
1690            {
1691                return Some(idx);
1692            }
1693        }
1694
1695        self.columns.iter().position(|s| s.name().as_str() == name)
1696    }
1697
1698    /// Get column index of a [`Series`] by name.
1699    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1700        self.get_column_index(name)
1701            .ok_or_else(|| polars_err!(col_not_found = name))
1702    }
1703
1704    /// Select a single column by name.
1705    ///
1706    /// # Example
1707    ///
1708    /// ```rust
1709    /// # use polars_core::prelude::*;
1710    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1711    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1712    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1713    ///
1714    /// assert_eq!(df.column("Password")?, &s1);
1715    /// # Ok::<(), PolarsError>(())
1716    /// ```
1717    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1718        let idx = self.try_get_column_index(name)?;
1719        Ok(self.select_at_idx(idx).unwrap())
1720    }
1721
1722    /// Selected multiple columns by name.
1723    ///
1724    /// # Example
1725    ///
1726    /// ```rust
1727    /// # use polars_core::prelude::*;
1728    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1729    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1730    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1731    ///
1732    /// assert_eq!(&df[0], sv[0]);
1733    /// assert_eq!(&df[1], sv[1]);
1734    /// # Ok::<(), PolarsError>(())
1735    /// ```
1736    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1737    where
1738        I: IntoIterator<Item = S>,
1739        S: AsRef<str>,
1740    {
1741        names
1742            .into_iter()
1743            .map(|name| self.column(name.as_ref()))
1744            .collect()
1745    }
1746
1747    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1748    ///
1749    /// # Examples
1750    ///
1751    /// ```
1752    /// # use polars_core::prelude::*;
1753    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1754    ///     df.select(["foo", "bar"])
1755    /// }
1756    /// ```
1757    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1758    where
1759        I: IntoIterator<Item = S>,
1760        S: Into<PlSmallStr>,
1761    {
1762        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1763        self._select_impl(cols.as_slice())
1764    }
1765
1766    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1767        ensure_names_unique(cols, |s| s.as_str())?;
1768        self._select_impl_unchecked(cols)
1769    }
1770
1771    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1772        let selected = self.select_columns_impl(cols)?;
1773        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1774    }
1775
1776    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1777    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1778    where
1779        I: IntoIterator<Item = S>,
1780        S: Into<PlSmallStr>,
1781    {
1782        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1783        self._select_with_schema_impl(&cols, schema, true)
1784    }
1785
1786    /// Select with a known schema without checking for duplicates in `selection`.
1787    /// The schema names must match the column names of this DataFrame.
1788    pub fn select_with_schema_unchecked<I, S>(
1789        &self,
1790        selection: I,
1791        schema: &Schema,
1792    ) -> PolarsResult<Self>
1793    where
1794        I: IntoIterator<Item = S>,
1795        S: Into<PlSmallStr>,
1796    {
1797        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1798        self._select_with_schema_impl(&cols, schema, false)
1799    }
1800
1801    /// * The schema names must match the column names of this DataFrame.
1802    pub fn _select_with_schema_impl(
1803        &self,
1804        cols: &[PlSmallStr],
1805        schema: &Schema,
1806        check_duplicates: bool,
1807    ) -> PolarsResult<Self> {
1808        if check_duplicates {
1809            ensure_names_unique(cols, |s| s.as_str())?;
1810        }
1811
1812        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1813        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814    }
1815
1816    /// A non generic implementation to reduce compiler bloat.
1817    fn select_columns_impl_with_schema(
1818        &self,
1819        cols: &[PlSmallStr],
1820        schema: &Schema,
1821    ) -> PolarsResult<Vec<Column>> {
1822        if cfg!(debug_assertions) {
1823            ensure_matching_schema_names(schema, self.schema())?;
1824        }
1825
1826        cols.iter()
1827            .map(|name| {
1828                let index = schema.try_get_full(name.as_str())?.0;
1829                Ok(self.columns[index].clone())
1830            })
1831            .collect()
1832    }
1833
1834    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1835    where
1836        I: IntoIterator<Item = S>,
1837        S: Into<PlSmallStr>,
1838    {
1839        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1840        self.select_physical_impl(&cols)
1841    }
1842
1843    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1844        ensure_names_unique(cols, |s| s.as_str())?;
1845        let selected = self.select_columns_physical_impl(cols)?;
1846        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1847    }
1848
1849    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1850    ///
1851    /// # Example
1852    ///
1853    /// ```rust
1854    /// # use polars_core::prelude::*;
1855    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1856    ///                         "Carbon" => [1, 2, 3],
1857    ///                         "Hydrogen" => [4, 6, 8])?;
1858    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1859    ///
1860    /// assert_eq!(df["Carbon"], sv[0]);
1861    /// assert_eq!(df["Hydrogen"], sv[1]);
1862    /// # Ok::<(), PolarsError>(())
1863    /// ```
1864    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1865        let cols = selection.into_vec();
1866        self.select_columns_impl(&cols)
1867    }
1868
1869    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1870        self.columns
1871            .iter()
1872            .enumerate()
1873            .map(|(i, s)| (s.name().as_str(), i))
1874            .collect()
1875    }
1876
1877    /// A non generic implementation to reduce compiler bloat.
1878    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1879        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1880            let name_to_idx = self._names_to_idx_map();
1881            cols.iter()
1882                .map(|name| {
1883                    let idx = *name_to_idx
1884                        .get(name.as_str())
1885                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1886                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1887                })
1888                .collect::<PolarsResult<Vec<_>>>()?
1889        } else {
1890            cols.iter()
1891                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1892                .collect::<PolarsResult<Vec<_>>>()?
1893        };
1894
1895        Ok(selected)
1896    }
1897
1898    /// A non generic implementation to reduce compiler bloat.
1899    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1900        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1901            // we hash, because there are user that having millions of columns.
1902            // # https://github.com/pola-rs/polars/issues/1023
1903            let name_to_idx = self._names_to_idx_map();
1904
1905            cols.iter()
1906                .map(|name| {
1907                    let idx = *name_to_idx
1908                        .get(name.as_str())
1909                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1910                    Ok(self.select_at_idx(idx).unwrap().clone())
1911                })
1912                .collect::<PolarsResult<Vec<_>>>()?
1913        } else {
1914            cols.iter()
1915                .map(|c| self.column(c.as_str()).cloned())
1916                .collect::<PolarsResult<Vec<_>>>()?
1917        };
1918
1919        Ok(selected)
1920    }
1921
1922    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1923        // If there is a filtered column just see how many columns there are left.
1924        if let Some(fst) = filtered.first() {
1925            return fst.len();
1926        }
1927
1928        // Otherwise, count the number of values that would be filtered and return that height.
1929        let num_trues = mask.num_trues();
1930        if mask.len() == self.height() {
1931            num_trues
1932        } else {
1933            // This is for broadcasting masks
1934            debug_assert!(num_trues == 0 || num_trues == 1);
1935            self.height() * num_trues
1936        }
1937    }
1938
1939    /// Take the [`DataFrame`] rows by a boolean mask.
1940    ///
1941    /// # Example
1942    ///
1943    /// ```
1944    /// # use polars_core::prelude::*;
1945    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1946    ///     let mask = df.column("sepal_width")?.is_not_null();
1947    ///     df.filter(&mask)
1948    /// }
1949    /// ```
1950    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1951        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1952        let height = self.filter_height(&new_col, mask);
1953
1954        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1955    }
1956
1957    /// Same as `filter` but does not parallelize.
1958    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1959        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1960        let height = self.filter_height(&new_col, mask);
1961
1962        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1963    }
1964
1965    /// Take [`DataFrame`] rows by index values.
1966    ///
1967    /// # Example
1968    ///
1969    /// ```
1970    /// # use polars_core::prelude::*;
1971    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1972    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1973    ///     df.take(&idx)
1974    /// }
1975    /// ```
1976    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1977        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
1978
1979        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
1980    }
1981
1982    /// # Safety
1983    /// The indices must be in-bounds.
1984    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1985        self.take_unchecked_impl(idx, true)
1986    }
1987
1988    /// # Safety
1989    /// The indices must be in-bounds.
1990    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1991        let cols = if allow_threads {
1992            POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
1993        } else {
1994            self._apply_columns(&|s| s.take_unchecked(idx))
1995        };
1996        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
1997    }
1998
1999    /// # Safety
2000    /// The indices must be in-bounds.
2001    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2002        self.take_slice_unchecked_impl(idx, true)
2003    }
2004
2005    /// # Safety
2006    /// The indices must be in-bounds.
2007    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2008        let cols = if allow_threads {
2009            POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2010        } else {
2011            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2012        };
2013        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2014    }
2015
2016    /// Rename a column in the [`DataFrame`].
2017    ///
2018    /// # Example
2019    ///
2020    /// ```
2021    /// # use polars_core::prelude::*;
2022    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2023    ///     let original_name = "foo";
2024    ///     let new_name = "bar";
2025    ///     df.rename(original_name, new_name.into())
2026    /// }
2027    /// ```
2028    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2029        if column == name.as_str() {
2030            return Ok(self);
2031        }
2032        polars_ensure!(
2033            !self.schema().contains(&name),
2034            Duplicate: "column rename attempted with already existing name \"{name}\""
2035        );
2036
2037        self.get_column_index(column)
2038            .and_then(|idx| self.columns.get_mut(idx))
2039            .ok_or_else(|| polars_err!(col_not_found = column))
2040            .map(|c| c.rename(name))?;
2041        Ok(self)
2042    }
2043
2044    /// Sort [`DataFrame`] in place.
2045    ///
2046    /// See [`DataFrame::sort`] for more instruction.
2047    pub fn sort_in_place(
2048        &mut self,
2049        by: impl IntoVec<PlSmallStr>,
2050        sort_options: SortMultipleOptions,
2051    ) -> PolarsResult<&mut Self> {
2052        let by_column = self.select_columns(by)?;
2053        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2054        Ok(self)
2055    }
2056
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// * `by_column` - the already evaluated columns to sort by; they do not have to be
    ///   columns of this frame.
    /// * `sort_options` - per-column `descending` / `nulls_last` flags plus global options.
    ///   The first entry of `descending` and `nulls_last` is indexed directly whenever
    ///   `by_column` is not empty.
    /// * `slice` - optional `(offset, len)` applied to the sorted result.
    ///
    /// Several shortcuts are tried in order before a full sort: no sort columns, an empty
    /// frame, a `(0, k)` slice smaller than the frame (dispatched to `bottom_k_impl`), and a
    /// single sort column whose sorted flag already matches the requested order.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        mut sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // Note that the by_column argument also contains evaluated expressions from
        // polars-lazy that may not even be present in this dataframe. Therefore,
        // when we try to set the first column as sorted, we ignore the error as
        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };
        // An empty frame needs no reordering; only the sorted flag is set.
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A slice that starts at offset 0 and is shorter than the frame is handed to
        // `bottom_k_impl` instead of sorting everything.
        if let Some((0, k)) = slice {
            if k < self.len() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early
        // We can do so when there is only one column to sort by, for multiple columns
        // it will be complicated to do so
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        // Without the feature the check is a compile-time constant of the same name.
        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If null count is 0 then nulls_last doesn't matter
            // Safe to get value at last position since the dataframe is not empty (taken care above)
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        #[cfg(feature = "dtype-struct")]
        let has_struct = by_column
            .iter()
            .any(|s| matches!(s.dtype(), DataType::Struct(_)));

        // Without the feature the check is a compile-time constant of the same name.
        #[cfg(not(feature = "dtype-struct"))]
        #[allow(non_upper_case_globals)]
        const has_struct: bool = false;

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.as_single_chunk_par();
        // `take` holds the row indices that put the frame in sorted order.
        let mut take = match (by_column.len(), has_struct) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // fast path for a frame with a single series
                // no need to compute the sort indices and then take by these indices
                // simply sort and return as frame
                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            _ => {
                // The row-format arg-sort is used when every column puts nulls last,
                // when a struct column takes part, or when the `POLARS_ROW_FMT_SORT`
                // environment variable is set; otherwise `arg_sort_multiple` is used.
                if sort_options.nulls_last.iter().all(|&x| x)
                    || has_struct
                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
                {
                    argsort_multiple_row_fmt(
                        &by_column,
                        sort_options.descending,
                        sort_options.nulls_last,
                        sort_options.multithreaded,
                    )?
                } else {
                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
                    first
                        .as_materialized_series()
                        .arg_sort_multiple(&other, &sort_options)?
                }
            },
        };

        // Slice the indices rather than the gathered frame, so only the requested
        // rows are gathered.
        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
        set_sorted(&mut df);
        Ok(df)
    }
2203
2204    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2205    ///
2206    /// This dataframe does not necessarily have a specified schema and may be changed at any
2207    /// point. It is primarily used for debugging.
2208    pub fn _to_metadata(&self) -> DataFrame {
2209        let num_columns = self.columns.len();
2210
2211        let mut column_names =
2212            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2213        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2214        let mut sorted_asc_ca =
2215            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2216        let mut sorted_dsc_ca =
2217            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2218        let mut fast_explode_list_ca =
2219            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2220        let mut materialized_at_ca =
2221            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2222
2223        for col in &self.columns {
2224            let flags = col.get_flags();
2225
2226            let (repr, materialized_at) = match col {
2227                Column::Series(s) => ("series", s.materialized_at()),
2228                Column::Partitioned(_) => ("partitioned", None),
2229                Column::Scalar(_) => ("scalar", None),
2230            };
2231            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2232            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2233            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2234
2235            column_names.append_value(col.name().clone());
2236            repr_ca.append_value(repr);
2237            sorted_asc_ca.append_value(sorted_asc);
2238            sorted_dsc_ca.append_value(sorted_dsc);
2239            fast_explode_list_ca.append_value(fast_explode_list);
2240            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2241        }
2242
2243        unsafe {
2244            DataFrame::new_no_checks(
2245                self.width(),
2246                vec![
2247                    column_names.finish().into_column(),
2248                    repr_ca.finish().into_column(),
2249                    sorted_asc_ca.finish().into_column(),
2250                    sorted_dsc_ca.finish().into_column(),
2251                    fast_explode_list_ca.finish().into_column(),
2252                    materialized_at_ca.finish().into_column(),
2253                ],
2254            )
2255        }
2256    }
2257
2258    /// Return a sorted clone of this [`DataFrame`].
2259    ///
2260    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2261    /// # Example
2262    ///
2263    /// Sort by a single column with default options:
2264    /// ```
2265    /// # use polars_core::prelude::*;
2266    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2267    ///     df.sort(["sepal_width"], Default::default())
2268    /// }
2269    /// ```
2270    /// Sort by a single column with specific order:
2271    /// ```
2272    /// # use polars_core::prelude::*;
2273    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2274    ///     df.sort(
2275    ///         ["sepal_width"],
2276    ///         SortMultipleOptions::new()
2277    ///             .with_order_descending(descending)
2278    ///     )
2279    /// }
2280    /// ```
2281    /// Sort by multiple columns with specifying order for each column:
2282    /// ```
2283    /// # use polars_core::prelude::*;
2284    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2285    ///     df.sort(
2286    ///         ["sepal_width", "sepal_length"],
2287    ///         SortMultipleOptions::new()
2288    ///             .with_order_descending_multi([false, true])
2289    ///     )
2290    /// }
2291    /// ```
2292    /// See [`SortMultipleOptions`] for more options.
2293    ///
2294    /// Also see [`DataFrame::sort_in_place`].
2295    pub fn sort(
2296        &self,
2297        by: impl IntoVec<PlSmallStr>,
2298        sort_options: SortMultipleOptions,
2299    ) -> PolarsResult<Self> {
2300        let mut df = self.clone();
2301        df.sort_in_place(by, sort_options)?;
2302        Ok(df)
2303    }
2304
2305    /// Replace a column with a [`Series`].
2306    ///
2307    /// # Example
2308    ///
2309    /// ```rust
2310    /// # use polars_core::prelude::*;
2311    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2312    ///                         "Area (kmĀ²)" => [9_833_520, 9_596_961])?;
2313    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2314    ///
2315    /// assert!(df.replace("Nation", s.clone()).is_err());
2316    /// assert!(df.replace("Country", s).is_ok());
2317    /// # Ok::<(), PolarsError>(())
2318    /// ```
2319    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2320        self.apply(column, |_| new_col.into_series())
2321    }
2322
2323    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2324    /// is that now the value of `column: &str` determines the name of the column and not the name
2325    /// of the `Series` passed to this method.
2326    pub fn replace_or_add<S: IntoSeries>(
2327        &mut self,
2328        column: PlSmallStr,
2329        new_col: S,
2330    ) -> PolarsResult<&mut Self> {
2331        let mut new_col = new_col.into_series();
2332        new_col.rename(column);
2333        self.with_column(new_col)
2334    }
2335
2336    /// Replace column at index `idx` with a [`Series`].
2337    ///
2338    /// # Example
2339    ///
2340    /// ```ignored
2341    /// # use polars_core::prelude::*;
2342    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2343    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2344    /// let mut df = DataFrame::new(vec![s0, s1])?;
2345    ///
2346    /// // Add 32 to get lowercase ascii values
2347    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2348    /// # Ok::<(), PolarsError>(())
2349    /// ```
2350    pub fn replace_column<C: IntoColumn>(
2351        &mut self,
2352        index: usize,
2353        new_column: C,
2354    ) -> PolarsResult<&mut Self> {
2355        polars_ensure!(
2356            index < self.width(),
2357            ShapeMismatch:
2358            "unable to replace at index {}, the DataFrame has only {} columns",
2359            index, self.width(),
2360        );
2361        let mut new_column = new_column.into_column();
2362        polars_ensure!(
2363            new_column.len() == self.height(),
2364            ShapeMismatch:
2365            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2366            new_column.len(), self.height(),
2367        );
2368        let old_col = &mut self.columns[index];
2369        mem::swap(old_col, &mut new_column);
2370        self.clear_schema();
2371        Ok(self)
2372    }
2373
2374    /// Apply a closure to a column. This is the recommended way to do in place modification.
2375    ///
2376    /// # Example
2377    ///
2378    /// ```rust
2379    /// # use polars_core::prelude::*;
2380    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2381    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2382    /// let mut df = DataFrame::new(vec![s0, s1])?;
2383    ///
2384    /// fn str_to_len(str_val: &Column) -> Column {
2385    ///     str_val.str()
2386    ///         .unwrap()
2387    ///         .into_iter()
2388    ///         .map(|opt_name: Option<&str>| {
2389    ///             opt_name.map(|name: &str| name.len() as u32)
2390    ///          })
2391    ///         .collect::<UInt32Chunked>()
2392    ///         .into_column()
2393    /// }
2394    ///
2395    /// // Replace the names column by the length of the names.
2396    /// df.apply("names", str_to_len);
2397    /// # Ok::<(), PolarsError>(())
2398    /// ```
2399    /// Results in:
2400    ///
2401    /// ```text
2402    /// +--------+-------+
2403    /// | foo    |       |
2404    /// | ---    | names |
2405    /// | str    | u32   |
2406    /// +========+=======+
2407    /// | "ham"  | 4     |
2408    /// +--------+-------+
2409    /// | "spam" | 6     |
2410    /// +--------+-------+
2411    /// | "egg"  | 3     |
2412    /// +--------+-------+
2413    /// ```
2414    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2415    where
2416        F: FnOnce(&Column) -> C,
2417        C: IntoColumn,
2418    {
2419        let idx = self.check_name_to_idx(name)?;
2420        self.apply_at_idx(idx, f)
2421    }
2422
2423    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2424    /// modification.
2425    ///
2426    /// # Example
2427    ///
2428    /// ```rust
2429    /// # use polars_core::prelude::*;
2430    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2431    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2432    /// let mut df = DataFrame::new(vec![s0, s1])?;
2433    ///
2434    /// // Add 32 to get lowercase ascii values
2435    /// df.apply_at_idx(1, |s| s + 32);
2436    /// # Ok::<(), PolarsError>(())
2437    /// ```
2438    /// Results in:
2439    ///
2440    /// ```text
2441    /// +--------+-------+
2442    /// | foo    | ascii |
2443    /// | ---    | ---   |
2444    /// | str    | i32   |
2445    /// +========+=======+
2446    /// | "ham"  | 102   |
2447    /// +--------+-------+
2448    /// | "spam" | 111   |
2449    /// +--------+-------+
2450    /// | "egg"  | 111   |
2451    /// +--------+-------+
2452    /// ```
2453    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2454    where
2455        F: FnOnce(&Column) -> C,
2456        C: IntoColumn,
2457    {
2458        let df_height = self.height();
2459        let width = self.width();
2460        let col = self.columns.get_mut(idx).ok_or_else(|| {
2461            polars_err!(
2462                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2463                idx, width
2464            )
2465        })?;
2466        let name = col.name().clone();
2467        let new_col = f(col).into_column();
2468        match new_col.len() {
2469            1 => {
2470                let new_col = new_col.new_from_index(0, df_height);
2471                let _ = mem::replace(col, new_col);
2472            },
2473            len if (len == df_height) => {
2474                let _ = mem::replace(col, new_col);
2475            },
2476            len => polars_bail!(
2477                ShapeMismatch:
2478                "resulting Series has length {} while the DataFrame has height {}",
2479                len, df_height
2480            ),
2481        }
2482
2483        // make sure the name remains the same after applying the closure
2484        unsafe {
2485            let col = self.columns.get_unchecked_mut(idx);
2486            col.rename(name);
2487        }
2488        Ok(self)
2489    }
2490
2491    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2492    /// modification.
2493    ///
2494    /// # Example
2495    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2497    ///
2498    /// ```rust
2499    /// # use polars_core::prelude::*;
2500    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2501    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2502    /// let mut df = DataFrame::new(vec![s0, s1])?;
2503    ///
2504    /// let idx = vec![0, 1, 4];
2505    ///
2506    /// df.try_apply("foo", |c| {
2507    ///     c.str()?
2508    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2509    /// });
2510    /// # Ok::<(), PolarsError>(())
2511    /// ```
2512    /// Results in:
2513    ///
2514    /// ```text
2515    /// +---------------------+--------+
2516    /// | foo                 | values |
2517    /// | ---                 | ---    |
2518    /// | str                 | i32    |
2519    /// +=====================+========+
2520    /// | "ham-is-modified"   | 1      |
2521    /// +---------------------+--------+
2522    /// | "spam-is-modified"  | 2      |
2523    /// +---------------------+--------+
2524    /// | "egg"               | 3      |
2525    /// +---------------------+--------+
2526    /// | "bacon"             | 4      |
2527    /// +---------------------+--------+
2528    /// | "quack-is-modified" | 5      |
2529    /// +---------------------+--------+
2530    /// ```
2531    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2532    where
2533        F: FnOnce(&Column) -> PolarsResult<C>,
2534        C: IntoColumn,
2535    {
2536        let width = self.width();
2537        let col = self.columns.get_mut(idx).ok_or_else(|| {
2538            polars_err!(
2539                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2540                idx, width
2541            )
2542        })?;
2543        let name = col.name().clone();
2544
2545        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2546
2547        // make sure the name remains the same after applying the closure
2548        unsafe {
2549            let col = self.columns.get_unchecked_mut(idx);
2550            col.rename(name);
2551        }
2552        Ok(self)
2553    }
2554
2555    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2556    /// modification.
2557    ///
2558    /// # Example
2559    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2561    ///
2562    /// ```rust
2563    /// # use polars_core::prelude::*;
2564    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2565    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2566    /// let mut df = DataFrame::new(vec![s0, s1])?;
2567    ///
2568    /// // create a mask
2569    /// let values = df.column("values")?.as_materialized_series();
2570    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2571    ///
2572    /// df.try_apply("foo", |c| {
2573    ///     c.str()?
2574    ///     .set(&mask, Some("not_within_bounds"))
2575    /// });
2576    /// # Ok::<(), PolarsError>(())
2577    /// ```
2578    /// Results in:
2579    ///
2580    /// ```text
2581    /// +---------------------+--------+
2582    /// | foo                 | values |
2583    /// | ---                 | ---    |
2584    /// | str                 | i32    |
2585    /// +=====================+========+
2586    /// | "not_within_bounds" | 1      |
2587    /// +---------------------+--------+
2588    /// | "spam"              | 2      |
2589    /// +---------------------+--------+
2590    /// | "egg"               | 3      |
2591    /// +---------------------+--------+
2592    /// | "bacon"             | 4      |
2593    /// +---------------------+--------+
2594    /// | "not_within_bounds" | 5      |
2595    /// +---------------------+--------+
2596    /// ```
2597    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2598    where
2599        F: FnOnce(&Series) -> PolarsResult<C>,
2600        C: IntoColumn,
2601    {
2602        let idx = self.try_get_column_index(column)?;
2603        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2604    }
2605
2606    /// Slice the [`DataFrame`] along the rows.
2607    ///
2608    /// # Example
2609    ///
2610    /// ```rust
2611    /// # use polars_core::prelude::*;
2612    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2613    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2614    /// let sl: DataFrame = df.slice(2, 3);
2615    ///
2616    /// assert_eq!(sl.shape(), (3, 2));
2617    /// println!("{}", sl);
2618    /// # Ok::<(), PolarsError>(())
2619    /// ```
2620    /// Output:
2621    /// ```text
2622    /// shape: (3, 2)
2623    /// +-------+-------+
2624    /// | Fruit | Color |
2625    /// | ---   | ---   |
2626    /// | str   | str   |
2627    /// +=======+=======+
2628    /// | Grape | White |
2629    /// +-------+-------+
2630    /// | Fig   | White |
2631    /// +-------+-------+
2632    /// | Fig   | Red   |
2633    /// +-------+-------+
2634    /// ```
2635    #[must_use]
2636    pub fn slice(&self, offset: i64, length: usize) -> Self {
2637        if offset == 0 && length == self.height() {
2638            return self.clone();
2639        }
2640        if length == 0 {
2641            return self.clear();
2642        }
2643        let col = self
2644            .columns
2645            .iter()
2646            .map(|s| s.slice(offset, length))
2647            .collect::<Vec<_>>();
2648
2649        let height = if let Some(fst) = col.first() {
2650            fst.len()
2651        } else {
2652            let (_, length) = slice_offsets(offset, length, self.height());
2653            length
2654        };
2655
2656        unsafe { DataFrame::new_no_checks(height, col) }
2657    }
2658
2659    /// Split [`DataFrame`] at the given `offset`.
2660    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2661        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2662
2663        let (idx, _) = slice_offsets(offset, 0, self.height());
2664
2665        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2666        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2667        (a, b)
2668    }
2669
2670    pub fn clear(&self) -> Self {
2671        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2672        unsafe { DataFrame::new_no_checks(0, col) }
2673    }
2674
2675    #[must_use]
2676    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2677        if offset == 0 && length == self.height() {
2678            return self.clone();
2679        }
2680        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2681        unsafe { DataFrame::new_no_checks(length, columns) }
2682    }
2683
2684    #[must_use]
2685    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2686        if offset == 0 && length == self.height() {
2687            return self.clone();
2688        }
2689        // @scalar-opt
2690        let columns = self._apply_columns(&|s| {
2691            let mut out = s.slice(offset, length);
2692            out.shrink_to_fit();
2693            out
2694        });
2695        unsafe { DataFrame::new_no_checks(length, columns) }
2696    }
2697
2698    /// Get the head of the [`DataFrame`].
2699    ///
2700    /// # Example
2701    ///
2702    /// ```rust
2703    /// # use polars_core::prelude::*;
2704    /// let countries: DataFrame =
2705    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2706    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2707    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2708    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2709    /// assert_eq!(countries.shape(), (5, 4));
2710    ///
2711    /// println!("{}", countries.head(Some(3)));
2712    /// # Ok::<(), PolarsError>(())
2713    /// ```
2714    ///
2715    /// Output:
2716    ///
2717    /// ```text
2718    /// shape: (3, 4)
2719    /// +--------------------+---------------+---------------+------------+
2720    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2721    /// | ---                | ---           | ---           | ---        |
2722    /// | i32                | str           | str           | str        |
2723    /// +====================+===============+===============+============+
2724    /// | 1                  | North America | United States | Washington |
2725    /// +--------------------+---------------+---------------+------------+
2726    /// | 2                  | Asia          | China         | Beijing    |
2727    /// +--------------------+---------------+---------------+------------+
2728    /// | 3                  | Asia          | Japan         | Tokyo      |
2729    /// +--------------------+---------------+---------------+------------+
2730    /// ```
2731    #[must_use]
2732    pub fn head(&self, length: Option<usize>) -> Self {
2733        let col = self
2734            .columns
2735            .iter()
2736            .map(|c| c.head(length))
2737            .collect::<Vec<_>>();
2738
2739        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2740        let height = usize::min(height, self.height());
2741        unsafe { DataFrame::new_no_checks(height, col) }
2742    }
2743
2744    /// Get the tail of the [`DataFrame`].
2745    ///
2746    /// # Example
2747    ///
2748    /// ```rust
2749    /// # use polars_core::prelude::*;
2750    /// let countries: DataFrame =
2751    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2753    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2754    /// assert_eq!(countries.shape(), (5, 3));
2755    ///
2756    /// println!("{}", countries.tail(Some(2)));
2757    /// # Ok::<(), PolarsError>(())
2758    /// ```
2759    ///
2760    /// Output:
2761    ///
2762    /// ```text
2763    /// shape: (2, 3)
2764    /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
    /// | ---         | ---                | ---     |
    /// | i32         | f64                | str     |
    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
2772    /// +-------------+--------------------+---------+
2773    /// ```
2774    #[must_use]
2775    pub fn tail(&self, length: Option<usize>) -> Self {
2776        let col = self
2777            .columns
2778            .iter()
2779            .map(|c| c.tail(length))
2780            .collect::<Vec<_>>();
2781
2782        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2783        let height = usize::min(height, self.height());
2784        unsafe { DataFrame::new_no_checks(height, col) }
2785    }
2786
2787    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2788    ///
2789    /// # Panics
2790    ///
    /// Panics in debug builds if the [`DataFrame`] that is passed is not rechunked.
2792    ///
2793    /// This responsibility is left to the caller as we don't want to take mutable references here,
2794    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2795    /// as well.
2796    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2797        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2798        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2799        // as we must allocate arrow strings/binaries.
2800        let must_convert = compat_level.0 == 0;
2801        let parallel = parallel
2802            && must_convert
2803            && self.columns.len() > 1
2804            && self
2805                .columns
2806                .iter()
2807                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2808
2809        RecordBatchIter {
2810            columns: &self.columns,
2811            schema: Arc::new(
2812                self.columns
2813                    .iter()
2814                    .map(|c| c.field().to_arrow(compat_level))
2815                    .collect(),
2816            ),
2817            idx: 0,
2818            n_chunks: self.first_col_n_chunks(),
2819            compat_level,
2820            parallel,
2821        }
2822    }
2823
2824    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2825    ///
2826    /// # Panics
2827    ///
2828    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2829    ///
2830    /// This responsibility is left to the caller as we don't want to take mutable references here,
2831    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2832    /// as well.
2833    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2834        PhysRecordBatchIter {
2835            schema: Arc::new(
2836                self.get_columns()
2837                    .iter()
2838                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2839                    .collect(),
2840            ),
2841            arr_iters: self
2842                .materialized_column_iter()
2843                .map(|s| s.chunks().iter())
2844                .collect(),
2845        }
2846    }
2847
    /// Get a [`DataFrame`] with the rows of every column in reversed order.
2849    #[must_use]
2850    pub fn reverse(&self) -> Self {
2851        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2852        unsafe { DataFrame::new_no_checks(self.height(), col) }
2853    }
2854
2855    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2856    /// with `Nones`.
2857    ///
2858    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2859    #[must_use]
2860    pub fn shift(&self, periods: i64) -> Self {
2861        let col = self._apply_columns_par(&|s| s.shift(periods));
2862        unsafe { DataFrame::new_no_checks(self.height(), col) }
2863    }
2864
2865    /// Replace None values with one of the following strategies:
2866    /// * Forward fill (replace None with the previous value)
2867    /// * Backward fill (replace None with the next value)
2868    /// * Mean fill (replace None with the mean of the whole array)
2869    /// * Min fill (replace None with the minimum of the whole array)
2870    /// * Max fill (replace None with the maximum of the whole array)
2871    ///
2872    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2873    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2874        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2875
2876        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2877    }
2878
2879    /// Pipe different functions/ closure operations that work on a DataFrame together.
2880    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2881    where
2882        F: Fn(DataFrame) -> PolarsResult<B>,
2883    {
2884        f(self)
2885    }
2886
2887    /// Pipe different functions/ closure operations that work on a DataFrame together.
2888    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2889    where
2890        F: Fn(&mut DataFrame) -> PolarsResult<B>,
2891    {
2892        f(self)
2893    }
2894
2895    /// Pipe different functions/ closure operations that work on a DataFrame together.
2896    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2897    where
2898        F: Fn(DataFrame, Args) -> PolarsResult<B>,
2899    {
2900        f(self, args)
2901    }
2902
2903    /// Drop duplicate rows from a [`DataFrame`].
2904    /// *This fails when there is a column of type List in DataFrame*
2905    ///
2906    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2907    ///
2908    /// # Example
2909    ///
2910    /// ```no_run
2911    /// # use polars_core::prelude::*;
2912    /// let df = df! {
2913    ///               "flt" => [1., 1., 2., 2., 3., 3.],
2914    ///               "int" => [1, 1, 2, 2, 3, 3, ],
2915    ///               "str" => ["a", "a", "b", "b", "c", "c"]
2916    ///           }?;
2917    ///
2918    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2919    /// # Ok::<(), PolarsError>(())
2920    /// ```
2921    /// Returns
2922    ///
2923    /// ```text
2924    /// +-----+-----+-----+
2925    /// | flt | int | str |
2926    /// | --- | --- | --- |
2927    /// | f64 | i32 | str |
2928    /// +=====+=====+=====+
2929    /// | 1   | 1   | "a" |
2930    /// +-----+-----+-----+
2931    /// | 2   | 2   | "b" |
2932    /// +-----+-----+-----+
2933    /// | 3   | 3   | "c" |
2934    /// +-----+-----+-----+
2935    /// ```
2936    #[cfg(feature = "algorithm_group_by")]
2937    pub fn unique_stable(
2938        &self,
2939        subset: Option<&[String]>,
2940        keep: UniqueKeepStrategy,
2941        slice: Option<(i64, usize)>,
2942    ) -> PolarsResult<DataFrame> {
2943        self.unique_impl(
2944            true,
2945            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2946            keep,
2947            slice,
2948        )
2949    }
2950
2951    /// Unstable distinct. See [`DataFrame::unique_stable`].
2952    #[cfg(feature = "algorithm_group_by")]
2953    pub fn unique<I, S>(
2954        &self,
2955        subset: Option<&[String]>,
2956        keep: UniqueKeepStrategy,
2957        slice: Option<(i64, usize)>,
2958    ) -> PolarsResult<DataFrame> {
2959        self.unique_impl(
2960            false,
2961            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2962            keep,
2963            slice,
2964        )
2965    }
2966
2967    #[cfg(feature = "algorithm_group_by")]
2968    pub fn unique_impl(
2969        &self,
2970        maintain_order: bool,
2971        subset: Option<Vec<PlSmallStr>>,
2972        keep: UniqueKeepStrategy,
2973        slice: Option<(i64, usize)>,
2974    ) -> PolarsResult<Self> {
2975        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
2976        let mut df = self.clone();
2977        // take on multiple chunks is terrible
2978        df.as_single_chunk_par();
2979
2980        let columns = match (keep, maintain_order) {
2981            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
2982                let gb = df.group_by_stable(names)?;
2983                let groups = gb.get_groups();
2984                let (offset, len) = slice.unwrap_or((0, groups.len()));
2985                let groups = groups.slice(offset, len);
2986                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
2987            },
2988            (UniqueKeepStrategy::Last, true) => {
2989                // maintain order by last values, so the sorted groups are not correct as they
2990                // are sorted by the first value
2991                let gb = df.group_by(names)?;
2992                let groups = gb.get_groups();
2993
2994                let func = |g: GroupsIndicator| match g {
2995                    GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
2996                    GroupsIndicator::Slice([first, len]) => first + len - 1,
2997                };
2998
2999                let last_idx: NoNull<IdxCa> = match slice {
3000                    None => groups.iter().map(func).collect(),
3001                    Some((offset, len)) => {
3002                        let (offset, len) = slice_offsets(offset, len, groups.len());
3003                        groups.iter().skip(offset).take(len).map(func).collect()
3004                    },
3005                };
3006
3007                let last_idx = last_idx.sort(false);
3008                return Ok(unsafe { df.take_unchecked(&last_idx) });
3009            },
3010            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3011                let gb = df.group_by(names)?;
3012                let groups = gb.get_groups();
3013                let (offset, len) = slice.unwrap_or((0, groups.len()));
3014                let groups = groups.slice(offset, len);
3015                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3016            },
3017            (UniqueKeepStrategy::Last, false) => {
3018                let gb = df.group_by(names)?;
3019                let groups = gb.get_groups();
3020                let (offset, len) = slice.unwrap_or((0, groups.len()));
3021                let groups = groups.slice(offset, len);
3022                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3023            },
3024            (UniqueKeepStrategy::None, _) => {
3025                let df_part = df.select(names)?;
3026                let mask = df_part.is_unique()?;
3027                let mask = match slice {
3028                    None => mask,
3029                    Some((offset, len)) => mask.slice(offset, len),
3030                };
3031                return df.filter(&mask);
3032            },
3033        };
3034
3035        let height = Self::infer_height(&columns);
3036        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3037    }
3038
3039    /// Get a mask of all the unique rows in the [`DataFrame`].
3040    ///
3041    /// # Example
3042    ///
3043    /// ```no_run
3044    /// # use polars_core::prelude::*;
3045    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3046    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3047    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3048    ///
3049    /// assert!(ca.all());
3050    /// # Ok::<(), PolarsError>(())
3051    /// ```
3052    #[cfg(feature = "algorithm_group_by")]
3053    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3054        let gb = self.group_by(self.get_column_names_owned())?;
3055        let groups = gb.get_groups();
3056        Ok(is_unique_helper(
3057            groups,
3058            self.height() as IdxSize,
3059            true,
3060            false,
3061        ))
3062    }
3063
3064    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3065    ///
3066    /// # Example
3067    ///
3068    /// ```no_run
3069    /// # use polars_core::prelude::*;
3070    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3071    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3072    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3073    ///
3074    /// assert!(!ca.all());
3075    /// # Ok::<(), PolarsError>(())
3076    /// ```
3077    #[cfg(feature = "algorithm_group_by")]
3078    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3079        let gb = self.group_by(self.get_column_names_owned())?;
3080        let groups = gb.get_groups();
3081        Ok(is_unique_helper(
3082            groups,
3083            self.height() as IdxSize,
3084            false,
3085            true,
3086        ))
3087    }
3088
3089    /// Create a new [`DataFrame`] that shows the null counts per column.
3090    #[must_use]
3091    pub fn null_count(&self) -> Self {
3092        let cols = self
3093            .columns
3094            .iter()
3095            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3096            .collect();
3097        unsafe { Self::new_no_checks(1, cols) }
3098    }
3099
3100    /// Hash and combine the row values
3101    #[cfg(feature = "row_hash")]
3102    pub fn hash_rows(
3103        &mut self,
3104        hasher_builder: Option<PlRandomState>,
3105    ) -> PolarsResult<UInt64Chunked> {
3106        let dfs = split_df(self, POOL.current_num_threads(), false);
3107        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3108
3109        let mut iter = cas.into_iter();
3110        let mut acc_ca = iter.next().unwrap();
3111        for ca in iter {
3112            acc_ca.append(&ca)?;
3113        }
3114        Ok(acc_ca.rechunk().into_owned())
3115    }
3116
3117    /// Get the supertype of the columns in this DataFrame
3118    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3119        self.columns
3120            .iter()
3121            .map(|s| Ok(s.dtype().clone()))
3122            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3123    }
3124
3125    /// Take by index values given by the slice `idx`.
3126    /// # Warning
3127    /// Be careful with allowing threads when calling this in a large hot loop
3128    /// every thread split may be on rayon stack and lead to SO
3129    #[doc(hidden)]
3130    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3131        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3132    }
3133
3134    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3135    /// if the index value in `idx` are sorted. This will maintain sorted flags.
3136    ///
3137    /// # Warning
3138    /// Be careful with allowing threads when calling this in a large hot loop
3139    /// every thread split may be on rayon stack and lead to SO
3140    #[doc(hidden)]
3141    pub unsafe fn _take_unchecked_slice_sorted(
3142        &self,
3143        idx: &[IdxSize],
3144        allow_threads: bool,
3145        sorted: IsSorted,
3146    ) -> Self {
3147        #[cfg(debug_assertions)]
3148        {
3149            if idx.len() > 2 {
3150                match sorted {
3151                    IsSorted::Ascending => {
3152                        assert!(idx[0] <= idx[idx.len() - 1]);
3153                    },
3154                    IsSorted::Descending => {
3155                        assert!(idx[0] >= idx[idx.len() - 1]);
3156                    },
3157                    _ => {},
3158                }
3159            }
3160        }
3161        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3162        ca.set_sorted_flag(sorted);
3163        self.take_unchecked_impl(&ca, allow_threads)
3164    }
3165
3166    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3167    #[doc(hidden)]
3168    pub fn _partition_by_impl(
3169        &self,
3170        cols: &[PlSmallStr],
3171        stable: bool,
3172        include_key: bool,
3173    ) -> PolarsResult<Vec<DataFrame>> {
3174        let groups = if stable {
3175            self.group_by_stable(cols.iter().cloned())?.take_groups()
3176        } else {
3177            self.group_by(cols.iter().cloned())?.take_groups()
3178        };
3179
3180        // drop key columns prior to calculation if requested
3181        let df = if include_key {
3182            self.clone()
3183        } else {
3184            self.drop_many(cols.iter().cloned())
3185        };
3186
3187        // don't parallelize this
3188        // there is a lot of parallelization in take and this may easily SO
3189        POOL.install(|| {
3190            match groups.as_ref() {
3191                GroupsType::Idx(idx) => {
3192                    // Rechunk as the gather may rechunk for every group #17562.
3193                    let mut df = df.clone();
3194                    df.as_single_chunk_par();
3195                    Ok(idx
3196                        .into_par_iter()
3197                        .map(|(_, group)| {
3198                            // groups are in bounds
3199                            unsafe {
3200                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3201                            }
3202                        })
3203                        .collect())
3204                },
3205                GroupsType::Slice { groups, .. } => Ok(groups
3206                    .into_par_iter()
3207                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3208                    .collect()),
3209            }
3210        })
3211    }
3212
3213    /// Split into multiple DataFrames partitioned by groups
3214    #[cfg(feature = "partition_by")]
3215    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3216    where
3217        I: IntoIterator<Item = S>,
3218        S: Into<PlSmallStr>,
3219    {
3220        let cols = cols
3221            .into_iter()
3222            .map(Into::into)
3223            .collect::<Vec<PlSmallStr>>();
3224        self._partition_by_impl(cols.as_slice(), false, include_key)
3225    }
3226
3227    /// Split into multiple DataFrames partitioned by groups
3228    /// Order of the groups are maintained.
3229    #[cfg(feature = "partition_by")]
3230    pub fn partition_by_stable<I, S>(
3231        &self,
3232        cols: I,
3233        include_key: bool,
3234    ) -> PolarsResult<Vec<DataFrame>>
3235    where
3236        I: IntoIterator<Item = S>,
3237        S: Into<PlSmallStr>,
3238    {
3239        let cols = cols
3240            .into_iter()
3241            .map(Into::into)
3242            .collect::<Vec<PlSmallStr>>();
3243        self._partition_by_impl(cols.as_slice(), true, include_key)
3244    }
3245
3246    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3247    /// inserted as columns.
3248    #[cfg(feature = "dtype-struct")]
3249    pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3250        let cols = cols.into_vec();
3251        self.unnest_impl(cols.into_iter().collect())
3252    }
3253
3254    #[cfg(feature = "dtype-struct")]
3255    fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3256        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3257        let mut count = 0;
3258        for s in &self.columns {
3259            if cols.contains(s.name()) {
3260                let ca = s.struct_()?.clone();
3261                new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3262                count += 1;
3263            } else {
3264                new_cols.push(s.clone())
3265            }
3266        }
3267        if count != cols.len() {
3268            // one or more columns not found
3269            // the code below will return an error with the missing name
3270            let schema = self.schema();
3271            for col in cols {
3272                let _ = schema
3273                    .get(col.as_str())
3274                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3275            }
3276        }
3277        DataFrame::new(new_cols)
3278    }
3279
3280    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3281        cols.first().map_or(0, Column::len)
3282    }
3283
3284    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3285        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3286        // append_chunk or something like this. It is just quite difficult to make that safe.
3287        let df = DataFrame::from(rb);
3288        polars_ensure!(
3289            self.schema() == df.schema(),
3290            SchemaMismatch: "cannot append record batch with different schema",
3291        );
3292        self.vstack_mut_owned_unchecked(df);
3293        Ok(())
3294    }
3295}
3296
/// Iterator that yields one [`RecordBatch`] per chunk index, converting the chunk at that index
/// of every column to an arrow array.
pub struct RecordBatchIter<'a> {
    /// Columns whose chunks are converted; each is materialized to a series before conversion.
    columns: &'a Vec<Column>,
    /// Schema attached (cloned) to every produced batch.
    schema: ArrowSchemaRef,
    /// Index of the chunk that the next call to `next` will convert.
    idx: usize,
    /// Total number of chunks; iteration stops once `idx` reaches this value.
    n_chunks: usize,
    /// Compatibility level passed to `to_arrow` for every column.
    compat_level: CompatLevel,
    /// When `true`, the columns of a batch are converted in parallel on `POOL`.
    parallel: bool,
}
3305
3306impl Iterator for RecordBatchIter<'_> {
3307    type Item = RecordBatch;
3308
3309    fn next(&mut self) -> Option<Self::Item> {
3310        if self.idx >= self.n_chunks {
3311            return None;
3312        }
3313
3314        // Create a batch of the columns with the same chunk no.
3315        let batch_cols: Vec<ArrayRef> = if self.parallel {
3316            let iter = self
3317                .columns
3318                .par_iter()
3319                .map(Column::as_materialized_series)
3320                .map(|s| s.to_arrow(self.idx, self.compat_level));
3321            POOL.install(|| iter.collect())
3322        } else {
3323            self.columns
3324                .iter()
3325                .map(Column::as_materialized_series)
3326                .map(|s| s.to_arrow(self.idx, self.compat_level))
3327                .collect()
3328        };
3329        self.idx += 1;
3330
3331        let length = batch_cols.first().map_or(0, |arr| arr.len());
3332        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3333    }
3334
3335    fn size_hint(&self) -> (usize, Option<usize>) {
3336        let n = self.n_chunks - self.idx;
3337        (n, Some(n))
3338    }
3339}
3340
/// Iterator that yields [`RecordBatch`]es by taking the next array from each of a set of
/// array iterators and cloning those arrays into the batch.
pub struct PhysRecordBatchIter<'a> {
    /// Schema attached (cloned) to every produced batch.
    schema: ArrowSchemaRef,
    /// One iterator over borrowed arrays per batch column
    /// (NOTE(review): presumably the chunks of each frame column — confirm at the construction site).
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}
3345
3346impl Iterator for PhysRecordBatchIter<'_> {
3347    type Item = RecordBatch;
3348
3349    fn next(&mut self) -> Option<Self::Item> {
3350        let arrs = self
3351            .arr_iters
3352            .iter_mut()
3353            .map(|phys_iter| phys_iter.next().cloned())
3354            .collect::<Option<Vec<_>>>()?;
3355
3356        let length = arrs.first().map_or(0, |arr| arr.len());
3357        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3358    }
3359
3360    fn size_hint(&self) -> (usize, Option<usize>) {
3361        if let Some(iter) = self.arr_iters.first() {
3362            iter.size_hint()
3363        } else {
3364            (0, None)
3365        }
3366    }
3367}
3368
3369impl Default for DataFrame {
3370    fn default() -> Self {
3371        DataFrame::empty()
3372    }
3373}
3374
3375impl From<DataFrame> for Vec<Column> {
3376    fn from(df: DataFrame) -> Self {
3377        df.columns
3378    }
3379}
3380
3381// utility to test if we can vstack/extend the columns
3382fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3383    polars_ensure!(
3384        left.name() == right.name(),
3385        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3386        left.name(), right.name(),
3387    );
3388    Ok(())
3389}
3390
#[cfg(test)]
mod test {
    use super::*;

    /// Small two-column fixture (`days`: integers, `temp`: floats) with three rows.
    fn create_frame() -> DataFrame {
        let days = Column::new("days".into(), [0, 1, 2].as_ref());
        let temp = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![days, temp]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!("foo" => [1, 2, 3, 4, 5]).unwrap();
        let mut batches = df.iter_chunks(CompatLevel::newest(), false);

        // A single-chunk frame yields exactly one batch holding all rows.
        let first = batches.next().unwrap();
        assert_eq!(5, first.len());
        assert!(batches.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        let days = df.column("days").unwrap();
        let is_one = days.as_series().unwrap().equal(1).unwrap();
        assert_eq!(is_one.sum(), Some(1));
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let values = vec!["test".to_string()];
        let column = Column::new(PlSmallStr::from_str(col_name), values);
        let df = DataFrame::new(vec![column]).unwrap();

        // Mask that selects nothing.
        let mask = df
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .equal("")
            .unwrap();
        let filtered = df.filter(&mask).unwrap();

        let n_chunks = filtered
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .n_chunks();
        assert_eq!(n_chunks, 1);
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let inner = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let lists: ListChunked = [&inner].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let filtered = lists.filter(&mask).unwrap();

        assert_eq!(filtered.chunks.len(), 1);
        assert_eq!(filtered.len(), 0);
    }

    #[test]
    fn slice() {
        let sliced = create_frame().slice(0, 2);
        assert_eq!(sliced.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let needs_rechunk = create_frame().should_rechunk();
        assert!(!needs_rechunk);
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Build a series made of more than one chunk by appending to it.
        let mut chunked = Series::new("foo".into(), 0..2);
        let tail = Series::new("bar".into(), 0..1);
        chunked.append(&tail)?;

        // Adding it to the single-chunk frame leaves the columns with differing chunk layouts.
        let combined = base.with_column(chunked)?;

        assert!(combined.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();

        // Adding a column under an existing name must succeed (the column is replaced).
        let replaced = df
            .with_column(Series::new("foo".into(), &[1, 2, 3]))
            .is_ok();
        assert!(replaced);

        // Adding a column under a new name must succeed as well, and be retrievable afterwards.
        let added = df
            .with_column(Series::new("bar".into(), &[1, 2, 3]))
            .is_ok();
        assert!(added);
        assert!(df.column("bar").is_ok());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let input = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let deduplicated = input
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap();
        let sorted = deduplicated
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();

        let expected = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(sorted.equals(&expected));
    }

    #[test]
    fn test_vstack() {
        // Stacking must not accidentally rechunk: the appended rows stay a separate chunk.
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let head = df.slice(0, 3);
        df.vstack_mut(&head).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2);
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut target = DataFrame::empty();

        let data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        target.vstack_mut(&data).unwrap();
        assert_eq!(target.height, 6);
    }

    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // The added column must be stored under the requested name "c", not under the name of
        // the series that was passed in ("bar").
        let incoming = Series::new("bar".into(), [1, 2, 3]);
        df.replace_or_add("c".into(), incoming)?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }
}