polars_core/frame/mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::UnitVec;
10use polars_utils::itertools::Itertools;
11use rayon::prelude::*;
12
13use crate::chunked_array::flags::StatisticsFlags;
14#[cfg(feature = "algorithm_group_by")]
15use crate::chunked_array::ops::unique::is_unique_helper;
16use crate::prelude::*;
17#[cfg(feature = "row_hash")]
18use crate::utils::split_df;
19use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
20use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
21
22#[cfg(feature = "dataframe_arithmetic")]
23mod arithmetic;
24pub mod builder;
25mod chunks;
26pub use chunks::chunk_df_for_writing;
27pub mod column;
28pub mod explode;
29mod from;
30#[cfg(feature = "algorithm_group_by")]
31pub mod group_by;
32pub(crate) mod horizontal;
33#[cfg(feature = "proptest")]
34pub mod proptest;
35#[cfg(any(feature = "rows", feature = "object"))]
36pub mod row;
37mod top_k;
38mod upstream_traits;
39mod validation;
40
41use arrow::record_batch::{RecordBatch, RecordBatchT};
42use polars_utils::pl_str::PlSmallStr;
43#[cfg(feature = "serde")]
44use serde::{Deserialize, Serialize};
45use strum_macros::IntoStaticStr;
46
47use crate::POOL;
48#[cfg(feature = "row_hash")]
49use crate::hashing::_df_rows_to_hashes_threaded_vertical;
50use crate::prelude::sort::arg_sort;
51use crate::series::IsSorted;
52
53#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
54#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
55#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
56#[strum(serialize_all = "snake_case")]
57pub enum UniqueKeepStrategy {
58    /// Keep the first unique row.
59    First,
60    /// Keep the last unique row.
61    Last,
62    /// Keep none of the rows that are duplicated.
63    None,
64    /// Keep any of the unique rows.
65    /// This allows more optimizations.
66    #[default]
67    Any,
68}
69
70fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
71where
72    F: for<'a> FnMut(&'a T) -> &'a str,
73{
74    // Always unique.
75    if items.len() <= 1 {
76        return Ok(());
77    }
78
79    if items.len() <= 4 {
80        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
81        for i in 0..items.len() - 1 {
82            let name = get_name(&items[i]);
83            for other in items.iter().skip(i + 1) {
84                if name == get_name(other) {
85                    polars_bail!(duplicate = name);
86                }
87            }
88        }
89    } else {
90        let mut names = PlHashSet::with_capacity(items.len());
91        for item in items {
92            let name = get_name(item);
93            if !names.insert(name) {
94                polars_bail!(duplicate = name);
95            }
96        }
97    }
98    Ok(())
99}
100
101/// A contiguous growable collection of `Series` that have the same length.
102///
103/// ## Use declarations
104///
105/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
106///
107/// ```rust
108/// use polars_core::prelude::*; // if the crate polars-core is used directly
109/// // use polars::prelude::*;      if the crate polars is used
110/// ```
111///
112/// # Initialization
113/// ## Default
114///
115/// A `DataFrame` can be initialized empty:
116///
117/// ```rust
118/// # use polars_core::prelude::*;
119/// let df = DataFrame::default();
120/// assert!(df.is_empty());
121/// ```
122///
123/// ## Wrapping a `Vec<Column>`
124///
125/// A `DataFrame` is built upon a `Vec<Column>` where the `Column`s have the same length.
126///
127/// ```rust
128/// # use polars_core::prelude::*;
129/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
130/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
131///
132/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
133/// ```
134///
135/// ## Using a macro
136///
137/// The [`df!`] macro is a convenient method:
138///
139/// ```rust
140/// # use polars_core::prelude::*;
141/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
142///                                       "Color" => ["Red", "Yellow", "Green"]);
143/// ```
144///
145/// ## Using a CSV file
146///
147/// See the `polars_io::csv::CsvReader`.
148///
149/// # Indexing
150/// ## By a number
151///
152/// The `Index<usize>` is implemented for the `DataFrame`.
153///
154/// ```rust
155/// # use polars_core::prelude::*;
156/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
157///              "Color" => ["Red", "Yellow", "Green"])?;
158///
159/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
160/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
161/// # Ok::<(), PolarsError>(())
162/// ```
163///
164/// ## By a `Series` name
165///
166/// ```rust
167/// # use polars_core::prelude::*;
168/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
169///              "Color" => ["Red", "Yellow", "Green"])?;
170///
171/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
172/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
173/// # Ok::<(), PolarsError>(())
174/// ```
175#[derive(Clone)]
176pub struct DataFrame {
177    height: usize,
178    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
179    pub(crate) columns: Vec<Column>,
180
181    /// A cached schema. This might not give correct results if the DataFrame was modified in place
182    /// between caching the schema and reading it.
183    cached_schema: OnceLock<SchemaRef>,
184}
185
186impl DataFrame {
187    pub fn clear_schema(&mut self) {
188        self.cached_schema = OnceLock::new();
189    }
190
191    #[inline]
192    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
193        self.columns.iter()
194    }
195
196    #[inline]
197    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
198        self.columns.iter().map(Column::as_materialized_series)
199    }
200
201    #[inline]
202    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
203        self.columns.par_iter().map(Column::as_materialized_series)
204    }
205
206    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
207    ///
208    /// # Implementation
209    /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
210    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not necessarily the
211    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
212    ///
213    /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
214    /// However, this function will yield a smaller number. This is because this function returns
215    /// the visible size of the buffer, not its total capacity.
216    ///
217    /// FFI buffers are included in this estimation.
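    ///
    /// A minimal illustrative sketch (exact numbers depend on the platform and internal layout, so
    /// only a loose lower bound is asserted):
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1i64, 2, 3, 4])?;
    /// // Four i64 values need at least 4 * 8 bytes of buffer space.
    /// assert!(df.estimated_size() >= 4 * std::mem::size_of::<i64>());
    /// # Ok::<(), PolarsError>(())
    /// ```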
218    pub fn estimated_size(&self) -> usize {
219        self.columns.iter().map(Column::estimated_size).sum()
220    }
221
222    // Reduce monomorphization.
223    fn try_apply_columns(
224        &self,
225        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
226    ) -> PolarsResult<Vec<Column>> {
227        self.columns.iter().map(func).collect()
228    }
229    // Reduce monomorphization.
230    pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
231        self.columns.iter().map(func).collect()
232    }
233    // Reduce monomorphization.
234    fn try_apply_columns_par(
235        &self,
236        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
237    ) -> PolarsResult<Vec<Column>> {
238        POOL.install(|| self.columns.par_iter().map(func).collect())
239    }
240    // Reduce monomorphization.
241    pub fn _apply_columns_par(
242        &self,
243        func: &(dyn Fn(&Column) -> Column + Send + Sync),
244    ) -> Vec<Column> {
245        POOL.install(|| self.columns.par_iter().map(func).collect())
246    }
247
248    /// Get the index of the column.
249    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
250        self.get_column_index(name)
251            .ok_or_else(|| polars_err!(col_not_found = name))
252    }
253
254    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
255        polars_ensure!(
256            self.columns.iter().all(|s| s.name().as_str() != name),
257            Duplicate: "column with name {:?} is already present in the DataFrame", name
258        );
259        Ok(())
260    }
261
262    /// Reserve capacity for `additional` chunks in each series-backed column.
263    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
264        for s in &mut self.columns {
265            if let Column::Series(s) = s {
266                // SAFETY:
267                // do not modify the data, simply resize.
268                unsafe { s.chunks_mut().reserve(additional) }
269            }
270        }
271    }
272
273    /// Create a DataFrame from a vector of `Column`s.
274    ///
275    /// Errors if column names are not unique, or if heights are not all equal.
276    ///
277    /// # Example
278    ///
279    /// ```
280    /// # use polars_core::prelude::*;
281    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
282    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
283    ///
284    /// let df = DataFrame::new(vec![s0, s1])?;
285    /// # Ok::<(), PolarsError>(())
286    /// ```
287    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
288        DataFrame::validate_columns_slice(&columns)
289            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
290        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
291    }
292
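    /// Create a DataFrame from a vector of `Column`s, requiring every column to have the given `height`.
    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
    ///
    /// let df = DataFrame::new_with_height(3, vec![s0, s1])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```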
293    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
294        for col in &columns {
295            polars_ensure!(
296                col.len() == height,
297                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while the given height is {}",
298                col.name(), col.len(), height
299            );
300        }
301
302        ensure_names_unique(&columns, |s| s.name().as_str())?;
303
304        Ok(DataFrame {
305            height,
306            columns,
307            cached_schema: OnceLock::new(),
308        })
309    }
310
311    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
312    /// columns to match the other columns.
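    ///
    /// A minimal illustrative example (the unit-length column is repeated to match the longest column):
    /// ```
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("x".into(), [1, 2, 3]);
    /// let s1 = Column::new("y".into(), [0]);
    ///
    /// let df = DataFrame::new_with_broadcast(vec![s0, s1])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// assert_eq!(df["y"], Column::new("y".into(), [0, 0, 0]));
    /// # Ok::<(), PolarsError>(())
    /// ```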
313    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
314        // The length of the longest non-unit length column determines the
315        // broadcast length. If all columns are unit-length the broadcast length
316        // is one.
317        let broadcast_len = columns
318            .iter()
319            .map(|s| s.len())
320            .filter(|l| *l != 1)
321            .max()
322            .unwrap_or(1);
323        Self::new_with_broadcast_len(columns, broadcast_len)
324    }
325
326    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
327    /// columns to broadcast_len.
328    pub fn new_with_broadcast_len(
329        columns: Vec<Column>,
330        broadcast_len: usize,
331    ) -> PolarsResult<Self> {
332        ensure_names_unique(&columns, |s| s.name().as_str())?;
333        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
334    }
335
336    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
337    /// columns to match the other columns.
338    ///
339    /// # Safety
340    /// Does not check that the column names are unique (which they must be).
341    pub unsafe fn new_with_broadcast_no_namecheck(
342        mut columns: Vec<Column>,
343        broadcast_len: usize,
344    ) -> PolarsResult<Self> {
345        for i in 0..columns.len() {
346            // A length other than the broadcast length either needs broadcasting (if 1) or is an error.
347            let len = columns[i].len();
348            if len != broadcast_len {
349                if len != 1 {
350                    let name = columns[i].name().to_owned();
351                    let extra_info =
352                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
353                            format!(" (matching column '{}')", c.name())
354                        } else {
355                            String::new()
356                        };
357                    polars_bail!(
358                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
359                    );
360                }
361                columns[i] = columns[i].new_from_index(0, broadcast_len);
362            }
363        }
364
365        let length = if columns.is_empty() { 0 } else { broadcast_len };
366
367        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
368    }
369
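    /// Create a new `DataFrame` by taking the value at `index` from every column and repeating it
    /// `height` times.
    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1, 2, 3], "y" => ["a", "b", "c"])?;
    ///
    /// let repeated = df.new_from_index(1, 2);
    /// assert_eq!(repeated.shape(), (2, 2));
    /// assert_eq!(repeated["x"], Column::new("x".into(), [2, 2]));
    /// # Ok::<(), PolarsError>(())
    /// ```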
370    pub fn new_from_index(&self, index: usize, height: usize) -> Self {
371        let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
372        unsafe { Self::new_no_checks(height, cols.collect()) }
373    }
374
375    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
376    ///
377    /// # Example
378    ///
379    /// ```rust
380    /// use polars_core::prelude::DataFrame;
381    /// static EMPTY: DataFrame = DataFrame::empty();
382    /// ```
383    pub const fn empty() -> Self {
384        Self::empty_with_height(0)
385    }
386
387    /// Creates an empty `DataFrame` with a specific `height`.
388    pub const fn empty_with_height(height: usize) -> Self {
389        DataFrame {
390            height,
391            columns: vec![],
392            cached_schema: OnceLock::new(),
393        }
394    }
395
396    /// Create an empty `DataFrame` with empty columns as per the `schema`.
397    pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
398        let mut df = Self::empty_with_schema(&schema);
399        df.cached_schema = OnceLock::from(schema);
400        df
401    }
402
403    /// Create an empty `DataFrame` with empty columns as per the `schema`.
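    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![
    ///     Field::new("id".into(), DataType::Int64),
    ///     Field::new("name".into(), DataType::String),
    /// ]);
    ///
    /// let df = DataFrame::empty_with_schema(&schema);
    /// assert_eq!(df.shape(), (0, 2));
    /// assert_eq!(df.dtypes(), &[DataType::Int64, DataType::String]);
    /// ```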
404    pub fn empty_with_schema(schema: &Schema) -> Self {
405        let cols = schema
406            .iter()
407            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
408            .collect();
409        unsafe { DataFrame::new_no_checks(0, cols) }
410    }
411
412    /// Create an empty `DataFrame` with empty columns as per the `schema`.
413    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
414        let cols = schema
415            .iter_values()
416            .map(|fld| {
417                Column::from(Series::new_empty(
418                    fld.name.clone(),
419                    &(DataType::from_arrow_field(fld)),
420                ))
421            })
422            .collect();
423        unsafe { DataFrame::new_no_checks(0, cols) }
424    }
425
426    /// Create a new `DataFrame` with the given schema, only containing nulls.
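    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
    ///
    /// let df = DataFrame::full_null(&schema, 3);
    /// assert_eq!(df.shape(), (3, 1));
    /// assert_eq!(df.column("x")?.null_count(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```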
427    pub fn full_null(schema: &Schema, height: usize) -> Self {
428        let columns = schema
429            .iter_fields()
430            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
431            .collect();
432        unsafe { DataFrame::new_no_checks(height, columns) }
433    }
434
435    /// Ensure this DataFrame matches the given schema. Casts null columns to
436    /// the expected schema if necessary (but nothing else).
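    ///
    /// A minimal illustrative sketch, assuming a fully-null column (dtype `Null`) is cast to the
    /// dtype expected by the schema:
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = DataFrame::new(vec![Column::full_null("x".into(), 2, &DataType::Null)])?;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
    ///
    /// df.ensure_matches_schema(&schema)?;
    /// assert_eq!(df.column("x")?.dtype(), &DataType::Int64);
    /// # Ok::<(), PolarsError>(())
    /// ```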
437    pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
438        let mut any_needed_cast = false;
439        for (col, (name, dt)) in self.columns.iter_mut().zip(schema.iter()) {
440            polars_ensure!(
441                col.name() == name,
442                SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
443                name,
444                col.name()
445            );
446
447            let needs_cast = col.dtype().matches_schema_type(dt)?;
448            any_needed_cast |= needs_cast;
449            if needs_cast {
450                *col = col.cast(dt)?;
451            }
452        }
453        if any_needed_cast {
454            self.clear_schema();
455        }
456        Ok(())
457    }
458
459    /// Removes the last `Column` from the `DataFrame` and returns it, or [`None`] if it is empty.
460    ///
461    /// # Example
462    ///
463    /// ```rust
464    /// # use polars_core::prelude::*;
465    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
466    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
467    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
468    ///
469    /// assert_eq!(df.pop(), Some(s2));
470    /// assert_eq!(df.pop(), Some(s1));
471    /// assert_eq!(df.pop(), None);
472    /// assert!(df.is_empty());
473    /// # Ok::<(), PolarsError>(())
474    /// ```
475    pub fn pop(&mut self) -> Option<Column> {
476        self.clear_schema();
477
478        self.columns.pop()
479    }
480
481    /// Add a new column at index 0 that counts the rows.
482    ///
483    /// # Example
484    ///
485    /// ```
486    /// # use polars_core::prelude::*;
487    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
488    /// assert_eq!(df1.shape(), (4, 1));
489    ///
490    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
491    /// assert_eq!(df2.shape(), (4, 2));
492    /// println!("{}", df2);
493    ///
494    /// # Ok::<(), PolarsError>(())
495    /// ```
496    ///
497    /// Output:
498    ///
499    /// ```text
500    ///  shape: (4, 2)
501    ///  +-----+----------+
502    ///  | Id  | Name     |
503    ///  | --- | ---      |
504    ///  | u32 | str      |
505    ///  +=====+==========+
506    ///  | 0   | James    |
507    ///  +-----+----------+
508    ///  | 1   | Mary     |
509    ///  +-----+----------+
510    ///  | 2   | John     |
511    ///  +-----+----------+
512    ///  | 3   | Patricia |
513    ///  +-----+----------+
514    /// ```
515    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
516        let mut columns = Vec::with_capacity(self.columns.len() + 1);
517        let offset = offset.unwrap_or(0);
518
519        let col = Column::new_row_index(name, offset, self.height())?;
520        columns.push(col);
521        columns.extend_from_slice(&self.columns);
522        DataFrame::new(columns)
523    }
524
525    /// Add a row index column in place.
526    ///
527    /// # Safety
528    /// The caller should ensure the DataFrame does not already contain a column with the given name.
529    ///
530    /// # Panics
531    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
532    pub unsafe fn with_row_index_mut(
533        &mut self,
534        name: PlSmallStr,
535        offset: Option<IdxSize>,
536    ) -> &mut Self {
538        debug_assert!(
539            self.columns.iter().all(|c| c.name() != &name),
540            "with_row_index_mut(): column with name {} already exists",
541            &name
542        );
543
544        let offset = offset.unwrap_or(0);
545        let col = Column::new_row_index(name, offset, self.height()).unwrap();
546
547        self.clear_schema();
548        self.columns.insert(0, col);
549        self
550    }
551
552    /// Create a new `DataFrame` without checking the lengths of the `Series` or whether their
553    /// names are unique.
554    ///
555    /// Calculates the height from the first column or `0` if no columns are given.
556    ///
557    /// # Safety
558    ///
559    /// It is the caller's responsibility to uphold the contract that all `Series`
560    /// have equal lengths and unique names; if not, this may panic down the line.
561    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
562        let height = columns.first().map_or(0, Column::len);
563        unsafe { Self::new_no_checks(height, columns) }
564    }
565
566    /// Create a new `DataFrame` without checking the lengths of the `Series` or whether their
567    /// names are unique.
568    ///
569    /// It is advised to use [DataFrame::new] in favor of this method.
570    ///
571    /// # Safety
572    ///
573    /// It is the caller's responsibility to uphold the contract that all `Series`
574    /// have equal lengths and unique names; if not, this may panic down the line.
575    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
576        if cfg!(debug_assertions) {
577            DataFrame::validate_columns_slice(&columns).unwrap();
578        }
579
580        unsafe { Self::_new_no_checks_impl(height, columns) }
581    }
582
583    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
584    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
585    /// constructed with this method is generally highly unsafe and should not be long-lived.
586    #[allow(clippy::missing_safety_doc)]
587    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
588        DataFrame {
589            height,
590            columns,
591            cached_schema: OnceLock::new(),
592        }
593    }
594
595    /// Shrink the capacity of this DataFrame to fit its length.
596    pub fn shrink_to_fit(&mut self) {
597        // Don't parallelize this. Memory overhead
598        for s in &mut self.columns {
599            s.shrink_to_fit();
600        }
601    }
602
603    /// Aggregate all the chunks in the DataFrame to a single chunk.
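    ///
    /// A minimal illustrative example (vertically stacking adds chunks; rechunking merges them back
    /// into one):
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2])?;
    /// df.vstack_mut(&df!("x" => [3, 4])?)?;
    /// assert_eq!(df.first_col_n_chunks(), 2);
    ///
    /// df.as_single_chunk();
    /// assert_eq!(df.first_col_n_chunks(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```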
604    pub fn as_single_chunk(&mut self) -> &mut Self {
605        // Don't parallelize this. Memory overhead
606        for s in &mut self.columns {
607            *s = s.rechunk();
608        }
609        self
610    }
611
612    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
613    /// This may lead to more peak memory consumption.
614    pub fn as_single_chunk_par(&mut self) -> &mut Self {
615        if self.columns.iter().any(|c| c.n_chunks() > 1) {
616            self.columns = self._apply_columns_par(&|s| s.rechunk());
617        }
618        self
619    }
620
621    /// Rechunks all columns to only have a single chunk.
622    pub fn rechunk_mut(&mut self) {
623        // SAFETY: We never adjust the length or names of the columns.
624        let columns = unsafe { self.get_columns_mut() };
625
626        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
627            *col = col.rechunk();
628        }
629    }
630
631    pub fn _deshare_views_mut(&mut self) {
632        // SAFETY: We never adjust the length or names of the columns.
633        unsafe {
634            let columns = self.get_columns_mut();
635            for col in columns {
636                let Column::Series(s) = col else { continue };
637
638                if let Ok(ca) = s.binary() {
639                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
640                    *col = Column::from(gc_ca.into_series());
641                } else if let Ok(ca) = s.str() {
642                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
643                    *col = Column::from(gc_ca.into_series());
644                }
645            }
646        }
647    }
648
649    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
650    pub fn rechunk_to_record_batch(
651        self,
652        compat_level: CompatLevel,
653    ) -> RecordBatchT<Box<dyn Array>> {
654        let height = self.height();
655
656        let (schema, arrays) = self
657            .columns
658            .into_iter()
659            .map(|col| {
660                let mut series = col.take_materialized_series();
661                // Rechunk to one chunk if necessary
662                if series.n_chunks() > 1 {
663                    series = series.rechunk();
664                }
665                (
666                    series.field().to_arrow(compat_level),
667                    series.to_arrow(0, compat_level),
668                )
669            })
670            .collect();
671
672        RecordBatchT::new(height, Arc::new(schema), arrays)
673    }
674
675    /// Returns true if the chunks of the columns do not align and re-chunking should be done
676    pub fn should_rechunk(&self) -> bool {
677        // Fast check. It is also needed for correctness, as code below doesn't check if the number
678        // of chunks is equal.
679        if !self
680            .get_columns()
681            .iter()
682            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
683            .all_equal()
684        {
685            return true;
686        }
687
688        // From here we check chunk lengths.
689        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
690        match chunk_lengths.next() {
691            None => false,
692            Some(first_column_chunk_lengths) => {
693                // Fast Path for single Chunk Series
694                if first_column_chunk_lengths.size_hint().0 == 1 {
695                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
696                }
697                // Always rechunk if we have more chunks than rows.
698                // except when we have an empty df containing a single chunk
699                let height = self.height();
700                let n_chunks = first_column_chunk_lengths.size_hint().0;
701                if n_chunks > height && !(height == 0 && n_chunks == 1) {
702                    return true;
703                }
704                // Slow Path for multi Chunk series
705                let v: Vec<_> = first_column_chunk_lengths.collect();
706                for cl in chunk_lengths {
707                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
708                        return true;
709                    }
710                }
711                false
712            },
713        }
714    }
715
716    /// Ensure all the chunks in the [`DataFrame`] are aligned.
717    pub fn align_chunks_par(&mut self) -> &mut Self {
718        if self.should_rechunk() {
719            self.as_single_chunk_par()
720        } else {
721            self
722        }
723    }
724
725    pub fn align_chunks(&mut self) -> &mut Self {
726        if self.should_rechunk() {
727            self.as_single_chunk()
728        } else {
729            self
730        }
731    }
732
733    /// Get the [`DataFrame`] schema.
734    ///
735    /// # Example
736    ///
737    /// ```rust
738    /// # use polars_core::prelude::*;
739    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
740    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
741    ///
742    /// let f1: Field = Field::new("Thing".into(), DataType::String);
743    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
744    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
745    ///
746    /// assert_eq!(&**df.schema(), &sc);
747    /// # Ok::<(), PolarsError>(())
748    /// ```
749    pub fn schema(&self) -> &SchemaRef {
750        let out = self.cached_schema.get_or_init(|| {
751            Arc::new(
752                self.columns
753                    .iter()
754                    .map(|x| (x.name().clone(), x.dtype().clone()))
755                    .collect(),
756            )
757        });
758
759        debug_assert_eq!(out.len(), self.width());
760
761        out
762    }
763
764    /// Get a reference to the [`DataFrame`] columns.
765    ///
766    /// # Example
767    ///
768    /// ```rust
769    /// # use polars_core::prelude::*;
770    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
771    ///                         "Symbol" => ["A", "C", "G", "T"])?;
772    /// let columns: &[Column] = df.get_columns();
773    ///
774    /// assert_eq!(columns[0].name(), "Name");
775    /// assert_eq!(columns[1].name(), "Symbol");
776    /// # Ok::<(), PolarsError>(())
777    /// ```
778    #[inline]
779    pub fn get_columns(&self) -> &[Column] {
780        &self.columns
781    }
782
783    #[inline]
784    /// Get mutable access to the underlying columns.
785    ///
786    /// # Safety
787    ///
788    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
789    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
790    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
791    /// calling [`DataFrame::clear_schema`].
792    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
793        &mut self.columns
794    }
795
796    #[inline]
797    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
798    pub fn clear_columns(&mut self) {
799        unsafe { self.get_columns_mut() }.clear();
800        self.clear_schema();
801    }
802
803    #[inline]
804    /// Extend the columns without checking for name collisions or height.
805    ///
806    /// # Safety
807    ///
808    /// The caller needs to ensure that:
809    /// - Column names are unique within the resulting [`DataFrame`].
810    /// - The length of each appended column matches the height of the [`DataFrame`]. For
811    ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
812    ///   with [`DataFrame::set_height`].
813    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
814        unsafe { self.get_columns_mut() }.extend(iter);
815        self.clear_schema();
816    }
817
818    /// Take ownership of the underlying columns vec.
819    pub fn take_columns(self) -> Vec<Column> {
820        self.columns
821    }
822
823    /// Iterator over the columns as [`Series`].
824    ///
825    /// # Example
826    ///
827    /// ```rust
828    /// # use polars_core::prelude::*;
829    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
830    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
831    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
832    ///
833    /// let mut iterator = df.iter();
834    ///
835    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
836    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
837    /// assert_eq!(iterator.next(), None);
838    /// # Ok::<(), PolarsError>(())
839    /// ```
840    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
841        self.materialized_column_iter()
842    }
843
844    /// # Example
845    ///
846    /// ```rust
847    /// # use polars_core::prelude::*;
848    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
849    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
850    ///
851    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
852    /// # Ok::<(), PolarsError>(())
853    /// ```
854    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
855        self.columns.iter().map(|s| s.name()).collect()
856    }
857
858    /// Get the [`Vec<PlSmallStr>`] representing the column names.
859    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
860        self.columns.iter().map(|s| s.name().clone()).collect()
861    }
862
863    pub fn get_column_names_str(&self) -> Vec<&str> {
864        self.columns.iter().map(|s| s.name().as_str()).collect()
865    }
866
867    /// Set the column names.
868    /// # Example
869    ///
870    /// ```rust
871    /// # use polars_core::prelude::*;
872    /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
873    /// df.set_column_names(["Set"])?;
874    ///
875    /// assert_eq!(df.get_column_names(), &["Set"]);
876    /// # Ok::<(), PolarsError>(())
877    /// ```
878    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
879    where
880        I: IntoIterator<Item = S>,
881        S: Into<PlSmallStr>,
882    {
883        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
884        self._set_column_names_impl(names.as_slice())
885    }
886
887    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
888        polars_ensure!(
889            names.len() == self.width(),
890            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
891            names.len(), self.width()
892        );
893        ensure_names_unique(names, |s| s.as_str())?;
894
895        let columns = mem::take(&mut self.columns);
896        self.columns = columns
897            .into_iter()
898            .zip(names)
899            .map(|(s, name)| {
900                let mut s = s;
901                s.rename(name.clone());
902                s
903            })
904            .collect();
905        self.clear_schema();
906        Ok(())
907    }
908
909    /// Get the data types of the columns in the [`DataFrame`].
910    ///
911    /// # Example
912    ///
913    /// ```rust
914    /// # use polars_core::prelude::*;
915    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
916    ///                                "Fraction" => [0.965, 0.035])?;
917    ///
918    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
919    /// # Ok::<(), PolarsError>(())
920    /// ```
921    pub fn dtypes(&self) -> Vec<DataType> {
922        self.columns.iter().map(|s| s.dtype().clone()).collect()
923    }
924
925    pub(crate) fn first_series_column(&self) -> Option<&Series> {
926        self.columns.iter().find_map(|col| col.as_series())
927    }
928
929    /// The number of chunks for the first column.
930    pub fn first_col_n_chunks(&self) -> usize {
931        match self.first_series_column() {
932            None if self.columns.is_empty() => 0,
933            None => 1,
934            Some(s) => s.n_chunks(),
935        }
936    }
937
938    /// The highest number of chunks for any column.
939    pub fn max_n_chunks(&self) -> usize {
940        self.columns
941            .iter()
942            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
943            .max()
944            .unwrap_or(0)
945    }
946
947    /// Get the schema fields of the [`DataFrame`].
948    ///
949    /// # Example
950    ///
951    /// ```rust
952    /// # use polars_core::prelude::*;
953    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
954    ///                            "Fraction" => [0.708, 0.292])?;
955    ///
956    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
957    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
958    ///
959    /// assert_eq!(earth.fields(), &[f1, f2]);
960    /// # Ok::<(), PolarsError>(())
961    /// ```
962    pub fn fields(&self) -> Vec<Field> {
963        self.columns
964            .iter()
965            .map(|s| s.field().into_owned())
966            .collect()
967    }
968
969    /// Get (height, width) of the [`DataFrame`].
970    ///
971    /// # Example
972    ///
973    /// ```rust
974    /// # use polars_core::prelude::*;
975    /// let df0: DataFrame = DataFrame::default();
976    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
977    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
978    ///                          "2" => [1, 2, 3, 4, 5])?;
979    ///
980    /// assert_eq!(df0.shape(), (0, 0));
981    /// assert_eq!(df1.shape(), (5, 1));
982    /// assert_eq!(df2.shape(), (5, 2));
983    /// # Ok::<(), PolarsError>(())
984    /// ```
985    pub fn shape(&self) -> (usize, usize) {
986        (self.height, self.columns.len())
987    }
988
989    /// Get the width of the [`DataFrame`] which is the number of columns.
990    ///
991    /// # Example
992    ///
993    /// ```rust
994    /// # use polars_core::prelude::*;
995    /// let df0: DataFrame = DataFrame::default();
996    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
997    /// let df2: DataFrame = df!("Series 1" => [0; 0],
998    ///                          "Series 2" => [0; 0])?;
999    ///
1000    /// assert_eq!(df0.width(), 0);
1001    /// assert_eq!(df1.width(), 1);
1002    /// assert_eq!(df2.width(), 2);
1003    /// # Ok::<(), PolarsError>(())
1004    /// ```
1005    pub fn width(&self) -> usize {
1006        self.columns.len()
1007    }
1008
1009    /// Get the height of the [`DataFrame`] which is the number of rows.
1010    ///
1011    /// # Example
1012    ///
1013    /// ```rust
1014    /// # use polars_core::prelude::*;
1015    /// let df0: DataFrame = DataFrame::default();
1016    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
1017    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
1018    ///
1019    /// assert_eq!(df0.height(), 0);
1020    /// assert_eq!(df1.height(), 2);
1021    /// assert_eq!(df2.height(), 5);
1022    /// # Ok::<(), PolarsError>(())
1023    /// ```
1024    pub fn height(&self) -> usize {
1025        self.height
1026    }
1027
1028    /// Returns the size as number of rows * number of columns
1029    pub fn size(&self) -> usize {
1030        let s = self.shape();
1031        s.0 * s.1
1032    }
1033
1034    /// Returns `true` if the [`DataFrame`] contains no rows.
1035    ///
1036    /// # Example
1037    ///
1038    /// ```rust
1039    /// # use polars_core::prelude::*;
1040    /// let df1: DataFrame = DataFrame::default();
1041    /// assert!(df1.is_empty());
1042    ///
1043    /// let df2: DataFrame = df!("First name" => ["Forever"],
1044    ///                          "Last name" => ["Alone"])?;
1045    /// assert!(!df2.is_empty());
1046    /// # Ok::<(), PolarsError>(())
1047    /// ```
1048    pub fn is_empty(&self) -> bool {
1049        matches!(self.shape(), (0, _) | (_, 0))
1050    }
1051
1052    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1053    ///
1054    /// # Safety
1055    ///
1056    /// This needs to be equal to the length of all the columns.
1057    pub unsafe fn set_height(&mut self, height: usize) {
1058        self.height = height;
1059    }
1060
1061    /// Add multiple [`Column`]s to a [`DataFrame`].
1062    /// The added `Column`s are required to have the same length.
1063    ///
1064    /// # Example
1065    ///
1066    /// ```rust
1067    /// # use polars_core::prelude::*;
1068    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1069    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1070    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1071    ///
1072    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1073    /// assert_eq!(df2.shape(), (3, 3));
1074    /// println!("{}", df2);
1075    /// # Ok::<(), PolarsError>(())
1076    /// ```
1077    ///
1078    /// Output:
1079    ///
1080    /// ```text
1081    /// shape: (3, 3)
1082    /// +---------+--------+----------+
1083    /// | Element | Proton | Electron |
1084    /// | ---     | ---    | ---      |
1085    /// | str     | i32    | i32      |
1086    /// +=========+========+==========+
1087    /// | Copper  | 29     | 29       |
1088    /// +---------+--------+----------+
1089    /// | Silver  | 47     | 47       |
1090    /// +---------+--------+----------+
1091    /// | Gold    | 79     | 79       |
1092    /// +---------+--------+----------+
1093    /// ```
1094    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1095        let mut new_cols = self.columns.clone();
1096        new_cols.extend_from_slice(columns);
1097        DataFrame::new(new_cols)
1098    }
1099
1100    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return the result as a newly allocated [`DataFrame`].
1101    ///
1102    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1103    ///
1104    /// # Example
1105    ///
1106    /// ```rust
1107    /// # use polars_core::prelude::*;
1108    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1109    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1110    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1111    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1112    ///
1113    /// let df3: DataFrame = df1.vstack(&df2)?;
1114    ///
1115    /// assert_eq!(df3.shape(), (5, 2));
1116    /// println!("{}", df3);
1117    /// # Ok::<(), PolarsError>(())
1118    /// ```
1119    ///
1120    /// Output:
1121    ///
1122    /// ```text
1123    /// shape: (5, 2)
1124    /// +-----------+-------------------+
1125    /// | Element   | Melting Point (K) |
1126    /// | ---       | ---               |
1127    /// | str       | f64               |
1128    /// +===========+===================+
1129    /// | Copper    | 1357.77           |
1130    /// +-----------+-------------------+
1131    /// | Silver    | 1234.93           |
1132    /// +-----------+-------------------+
1133    /// | Gold      | 1337.33           |
1134    /// +-----------+-------------------+
1135    /// | Platinum  | 2041.4            |
1136    /// +-----------+-------------------+
1137    /// | Palladium | 1828.05           |
1138    /// +-----------+-------------------+
1139    /// ```
1140    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1141        let mut df = self.clone();
1142        df.vstack_mut(other)?;
1143        Ok(df)
1144    }
1145
1146    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1147    ///
1148    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1149    ///
1150    /// # Example
1151    ///
1152    /// ```rust
1153    /// # use polars_core::prelude::*;
1154    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1155    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1156    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1157    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1158    ///
1159    /// df1.vstack_mut(&df2)?;
1160    ///
1161    /// assert_eq!(df1.shape(), (5, 2));
1162    /// println!("{}", df1);
1163    /// # Ok::<(), PolarsError>(())
1164    /// ```
1165    ///
1166    /// Output:
1167    ///
1168    /// ```text
1169    /// shape: (5, 2)
1170    /// +-----------+-------------------+
1171    /// | Element   | Melting Point (K) |
1172    /// | ---       | ---               |
1173    /// | str       | f64               |
1174    /// +===========+===================+
1175    /// | Copper    | 1357.77           |
1176    /// +-----------+-------------------+
1177    /// | Silver    | 1234.93           |
1178    /// +-----------+-------------------+
1179    /// | Gold      | 1337.33           |
1180    /// +-----------+-------------------+
1181    /// | Platinum  | 2041.4            |
1182    /// +-----------+-------------------+
1183    /// | Palladium | 1828.05           |
1184    /// +-----------+-------------------+
1185    /// ```
1186    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1187        if self.width() != other.width() {
1188            polars_ensure!(
1189                self.width() == 0,
1190                ShapeMismatch:
1191                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1192                self.width(), other.width(),
1193            );
1194            self.columns.clone_from(&other.columns);
1195            self.height = other.height;
1196            return Ok(self);
1197        }
1198
1199        self.columns
1200            .iter_mut()
1201            .zip(other.columns.iter())
1202            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1203                ensure_can_extend(&*left, right)?;
1204                left.append(right).map_err(|e| {
1205                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1206                })?;
1207                Ok(())
1208            })?;
1209        self.height += other.height;
1210        Ok(self)
1211    }
1212
1213    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1214        if self.width() != other.width() {
1215            polars_ensure!(
1216                self.width() == 0,
1217                ShapeMismatch:
1218                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1219                self.width(), other.width(),
1220            );
1221            self.columns = other.columns;
1222            self.height = other.height;
1223            return Ok(self);
1224        }
1225
1226        self.columns
1227            .iter_mut()
1228            .zip(other.columns.into_iter())
1229            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1230                ensure_can_extend(&*left, &right)?;
1231                let right_name = right.name().clone();
1232                left.append_owned(right).map_err(|e| {
1233                    e.context(format!("failed to vstack column '{right_name}'").into())
1234                })?;
1235                Ok(())
1236            })?;
1237        self.height += other.height;
1238        Ok(self)
1239    }
1240
1241    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1242    ///
1243    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1244    ///
1245    /// # Panics
1246    /// Panics if the schemas don't match.
1247    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1248        self.columns
1249            .iter_mut()
1250            .zip(other.columns.iter())
1251            .for_each(|(left, right)| {
1252                left.append(right)
1253                    .map_err(|e| {
1254                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1255                    })
1256                    .expect("should not fail");
1257            });
1258        self.height += other.height;
1259    }
1260
1261    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1262    ///
1263    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1264    ///
1265    /// # Panics
1266    /// Panics if the schemas don't match.
1267    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1268        self.columns
1269            .iter_mut()
1270            .zip(other.columns)
1271            .for_each(|(left, right)| {
1272                left.append_owned(right).expect("should not fail");
1273            });
1274        self.height += other.height;
1275    }
1276
1277    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1278    ///
1279    /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1280    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1281    ///
1282    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1283    /// and thus will yield faster queries.
1284    ///
1285    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1286    /// online operations where you add `n` rows and rerun a query.
1287    ///
1288    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1289    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1290    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
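    ///
    /// A minimal illustrative example (in contrast to `vstack`, extending does not add a new chunk here):
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df1 = df!("x" => [1, 2])?;
    /// let df2 = df!("x" => [3, 4])?;
    ///
    /// df1.extend(&df2)?;
    /// assert_eq!(df1.shape(), (4, 1));
    /// assert_eq!(df1.first_col_n_chunks(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```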
1291    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1292        polars_ensure!(
1293            self.width() == other.width(),
1294            ShapeMismatch:
1295            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1296            self.width(), other.width(),
1297        );
1298
1299        self.columns
1300            .iter_mut()
1301            .zip(other.columns.iter())
1302            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1303                ensure_can_extend(&*left, right)?;
1304                left.extend(right).map_err(|e| {
1305                    e.context(format!("failed to extend column '{}'", right.name()).into())
1306                })?;
1307                Ok(())
1308            })?;
1309        self.height += other.height;
1310        self.clear_schema();
1311        Ok(())
1312    }
1313
1314    /// Remove a column by name and return the column removed.
1315    ///
1316    /// # Example
1317    ///
1318    /// ```rust
1319    /// # use polars_core::prelude::*;
1320    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1321    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1322    ///
1323    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1324    /// assert!(s1.is_err());
1325    ///
1326    /// let s2: Column = df.drop_in_place("Animal")?;
1327    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1328    /// # Ok::<(), PolarsError>(())
1329    /// ```
1330    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1331        let idx = self.check_name_to_idx(name)?;
1332        self.clear_schema();
1333        Ok(self.columns.remove(idx))
1334    }
1335
1336    /// Return a new [`DataFrame`] where all rows containing null values are dropped.
1337    ///
1338    /// # Example
1339    ///
1340    /// ```no_run
1341    /// # use polars_core::prelude::*;
1342    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1343    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1344    /// assert_eq!(df1.shape(), (3, 2));
1345    ///
1346    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1347    /// assert_eq!(df2.shape(), (1, 2));
1348    /// println!("{}", df2);
1349    /// # Ok::<(), PolarsError>(())
1350    /// ```
1351    ///
1352    /// Output:
1353    ///
1354    /// ```text
1355    /// shape: (1, 2)
1356    /// +---------+---------------------+
1357    /// | Country | Tax revenue (% GDP) |
1358    /// | ---     | ---                 |
1359    /// | str     | f64                 |
1360    /// +=========+=====================+
1361    /// | Malta   | 32.7                |
1362    /// +---------+---------------------+
1363    /// ```
1364    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1365    where
1366        for<'a> &'a S: Into<PlSmallStr>,
1367    {
1368        if let Some(v) = subset {
1369            let v = self.select_columns(v)?;
1370            self._drop_nulls_impl(v.as_slice())
1371        } else {
1372            self._drop_nulls_impl(self.columns.as_slice())
1373        }
1374    }
1375
1376    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1377        // fast path for no nulls in df
1378        if subset.iter().all(|s| !s.has_nulls()) {
1379            return Ok(self.clone());
1380        }
1381
1382        let mut iter = subset.iter();
1383
1384        let mask = iter
1385            .next()
1386            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1387        let mut mask = mask.is_not_null();
1388
1389        for c in iter {
1390            mask = mask & c.is_not_null();
1391        }
1392        self.filter(&mask)
1393    }
1394
1395    /// Drop a column by name.
1396    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1397    /// the current one in place.
1398    ///
1399    /// # Example
1400    ///
1401    /// ```rust
1402    /// # use polars_core::prelude::*;
1403    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1404    /// let df2: DataFrame = df1.drop("Ray type")?;
1405    ///
1406    /// assert!(df2.is_empty());
1407    /// # Ok::<(), PolarsError>(())
1408    /// ```
1409    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1410        let idx = self.check_name_to_idx(name)?;
1411        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1412
1413        self.columns.iter().enumerate().for_each(|(i, s)| {
1414            if i != idx {
1415                new_cols.push(s.clone())
1416            }
1417        });
1418
1419        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1420    }
1421
1422    /// Drop columns that are in `names`.
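    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
    ///
    /// let dropped = df.drop_many(["a", "c"]);
    /// assert_eq!(dropped.get_column_names(), &["b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```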
1423    pub fn drop_many<I, S>(&self, names: I) -> Self
1424    where
1425        I: IntoIterator<Item = S>,
1426        S: Into<PlSmallStr>,
1427    {
1428        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1429        self.drop_many_amortized(&names)
1430    }
1431
1432    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1433    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1434        if names.is_empty() {
1435            return self.clone();
1436        }
1437        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1438        self.columns.iter().for_each(|s| {
1439            if !names.contains(s.name()) {
1440                new_cols.push(s.clone())
1441            }
1442        });
1443
1444        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1445    }
1446
1447    /// Insert a new column at a given index without checking for duplicates.
1448    /// This can leave the [`DataFrame`] in an invalid state.
1449    fn insert_column_no_name_check(
1450        &mut self,
1451        index: usize,
1452        column: Column,
1453    ) -> PolarsResult<&mut Self> {
1454        polars_ensure!(
1455            self.width() == 0 || column.len() == self.height(),
1456            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1457            column.len(), self.height(),
1458        );
1459
1460        if self.width() == 0 {
1461            self.height = column.len();
1462        }
1463
1464        self.columns.insert(index, column);
1465        self.clear_schema();
1466        Ok(self)
1467    }
1468
1469    /// Insert a new column at a given index.
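    ///
    /// A minimal illustrative example:
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
    ///
    /// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
    /// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
    /// # Ok::<(), PolarsError>(())
    /// ```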
1470    pub fn insert_column<S: IntoColumn>(
1471        &mut self,
1472        index: usize,
1473        column: S,
1474    ) -> PolarsResult<&mut Self> {
1475        let column = column.into_column();
1476        self.check_already_present(column.name().as_str())?;
1477        self.insert_column_no_name_check(index, column)
1478    }
1479
1480    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1481        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1482            self.replace_column(idx, column)?;
1483        } else {
1484            if self.width() == 0 {
1485                self.height = column.len();
1486            }
1487
1488            self.columns.push(column);
1489            self.clear_schema();
1490        }
1491        Ok(())
1492    }
1493
1494    /// Add a new column to this [`DataFrame`] or replace an existing one.
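    ///
    /// A minimal illustrative example (a unit-length column is broadcast to the DataFrame height,
    /// and a column with an existing name replaces the old column):
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2, 3])?;
    ///
    /// df.with_column(Column::new("y".into(), ["a"]))?;
    /// df.with_column(Column::new("x".into(), [10, 20, 30]))?;
    ///
    /// assert_eq!(df.shape(), (3, 2));
    /// assert_eq!(df["y"], Column::new("y".into(), ["a", "a", "a"]));
    /// # Ok::<(), PolarsError>(())
    /// ```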
1495    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1496        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1497            let height = df.height();
1498            if column.len() == 1 && height > 1 {
1499                column = column.new_from_index(0, height);
1500            }
1501
1502            if column.len() == height || df.get_columns().is_empty() {
1503                df.add_column_by_search(column)?;
1504                Ok(df)
1505            }
1506            // special case for literals
1507            else if height == 0 && column.len() == 1 {
1508                let s = column.clear();
1509                df.add_column_by_search(s)?;
1510                Ok(df)
1511            } else {
1512                polars_bail!(
1513                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1514                    column.len(), height,
1515                );
1516            }
1517        }
1518        let column = column.into_column();
1519        inner(self, column)
1520    }
1521
1522    /// Adds a column to the [`DataFrame`] without doing any checks
1523    /// on length or duplicates.
1524    ///
1525    /// # Safety
1526    /// The caller must ensure `self.width() == 0 || column.len() == self.height()`.
1527    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1528        debug_assert!(self.width() == 0 || self.height() == column.len());
1529        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1530
1531        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1532        // properly for `width` == 0.
1533        if self.width() == 0 {
1534            unsafe { self.set_height(column.len()) };
1535        }
1536        unsafe { self.get_columns_mut() }.push(column);
1537        self.clear_schema();
1538
1539        self
1540    }
1541
1542    // Note: the schema can be either an input or an output schema.
1543    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1544        let name = c.name();
1545        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1546            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1547                // Given schema is output_schema and we can push.
1548                if idx == self.columns.len() {
1549                    if self.width() == 0 {
1550                        self.height = c.len();
1551                    }
1552
1553                    self.columns.push(c);
1554                    self.clear_schema();
1555                }
1556                // Schema is incorrect, fall back to search.
1557                else {
1558                    debug_assert!(false);
1559                    self.add_column_by_search(c)?;
1560                }
1561            } else {
1562                self.replace_column(idx, c)?;
1563            }
1564        } else {
1565            if self.width() == 0 {
1566                self.height = c.len();
1567            }
1568
1569            self.columns.push(c);
1570            self.clear_schema();
1571        }
1572
1573        Ok(())
1574    }
1575
1576    // Note: the schema can be either an input or an output schema.
1577    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1578        for (i, s) in series.into_iter().enumerate() {
1579            // we need to branch here
1580            // because users can add multiple columns with the same name
1581            if i == 0 || schema.get(s.name().as_str()).is_some() {
1582                self.with_column_and_schema(s.into_column(), schema)?;
1583            } else {
1584                self.with_column(s.clone().into_column())?;
1585            }
1586        }
1587        Ok(())
1588    }
1589
1590    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1591        for (i, s) in columns.into_iter().enumerate() {
1592            // we need to branch here
1593            // because users can add multiple columns with the same name
1594            if i == 0 || schema.get(s.name().as_str()).is_some() {
1595                self.with_column_and_schema(s, schema)?;
1596            } else {
1597                self.with_column(s.clone())?;
1598            }
1599        }
1600
1601        Ok(())
1602    }
1603
1604    /// Add a new column to this [`DataFrame`] or replace an existing one.
1605    /// Uses an existing schema to amortize lookups.
1606    /// If the schema is incorrect, we fall back to a linear search.
1607    ///
1608    /// Note: the schema can be either an input or an output schema.
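    ///
    /// # Example
    ///
    /// A minimal sketch that reuses the frame's own schema for the lookup (names and
    /// values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2], "b" => [3, 4])?;
    /// let schema = df.schema().clone();
    /// // Replace "a"; its position is looked up in the provided schema.
    /// df.with_column_and_schema(Column::new("a".into(), [10, 20]), &schema)?;
    /// assert_eq!(df.shape(), (2, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```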
1609    pub fn with_column_and_schema<C: IntoColumn>(
1610        &mut self,
1611        column: C,
1612        schema: &Schema,
1613    ) -> PolarsResult<&mut Self> {
1614        let mut column = column.into_column();
1615
1616        let height = self.height();
1617        if column.len() == 1 && height > 1 {
1618            column = column.new_from_index(0, height);
1619        }
1620
1621        if column.len() == height || self.columns.is_empty() {
1622            self.add_column_by_schema(column, schema)?;
1623            Ok(self)
1624        }
1625        // special case for literals
1626        else if height == 0 && column.len() == 1 {
1627            let s = column.clear();
1628            self.add_column_by_schema(s, schema)?;
1629            Ok(self)
1630        } else {
1631            polars_bail!(
1632                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1633                column.len(), height,
1634            );
1635        }
1636    }
1637
1638    /// Get a row in the [`DataFrame`]. Beware this is slow.
1639    ///
1640    /// # Example
1641    ///
1642    /// ```
1643    /// # use polars_core::prelude::*;
1644    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1645    ///     df.get(idx)
1646    /// }
1647    /// ```
1648    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1649        match self.columns.first() {
1650            Some(s) => {
1651                if s.len() <= idx {
1652                    return None;
1653                }
1654            },
1655            None => return None,
1656        }
1657        // SAFETY: we just checked bounds
1658        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1659    }
1660
1661    /// Select a [`Series`] by index.
1662    ///
1663    /// # Example
1664    ///
1665    /// ```rust
1666    /// # use polars_core::prelude::*;
1667    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1668    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1669    ///
1670    /// let s1: Option<&Column> = df.select_at_idx(0);
1671    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1672    ///
1673    /// assert_eq!(s1, Some(&s2));
1674    /// # Ok::<(), PolarsError>(())
1675    /// ```
1676    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1677        self.columns.get(idx)
1678    }
1679
1680    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1681    ///
1682    /// # Examples
1683    ///
1684    /// ```rust
1685    /// # use polars_core::prelude::*;
1686    /// let df = df! {
1687    ///     "0" => [0, 0, 0],
1688    ///     "1" => [1, 1, 1],
1689    ///     "2" => [2, 2, 2]
1690    /// }?;
1691    ///
1692    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1693    /// assert!(df.equals(&df.select_by_range(..)?));
1694    /// # Ok::<(), PolarsError>(())
1695    /// ```
1696    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1697    where
1698        R: ops::RangeBounds<usize>,
1699    {
1700        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1701        // because it is a nightly-only feature. We should switch to the std version once it is stabilized.
1702        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1703        where
1704            R: ops::RangeBounds<usize>,
1705        {
1706            let len = bounds.end;
1707
1708            let start: ops::Bound<&usize> = range.start_bound();
1709            let start = match start {
1710                ops::Bound::Included(&start) => start,
1711                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1712                    panic!("attempted to index slice from after maximum usize");
1713                }),
1714                ops::Bound::Unbounded => 0,
1715            };
1716
1717            let end: ops::Bound<&usize> = range.end_bound();
1718            let end = match end {
1719                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1720                    panic!("attempted to index slice up to maximum usize");
1721                }),
1722                ops::Bound::Excluded(&end) => end,
1723                ops::Bound::Unbounded => len,
1724            };
1725
1726            if start > end {
1727                panic!("slice index starts at {start} but ends at {end}");
1728            }
1729            if end > len {
1730                panic!("range end index {end} out of range for slice of length {len}",);
1731            }
1732
1733            ops::Range { start, end }
1734        }
1735
1736        let colnames = self.get_column_names_owned();
1737        let range = get_range(range, ..colnames.len());
1738
1739        self._select_impl(&colnames[range])
1740    }
1741
1742    /// Get column index of a [`Series`] by name.
1743    /// # Example
1744    ///
1745    /// ```rust
1746    /// # use polars_core::prelude::*;
1747    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1748    ///                         "Health" => [100, 200, 500],
1749    ///                         "Mana" => [250, 100, 0],
1750    ///                         "Strength" => [30, 150, 300])?;
1751    ///
1752    /// assert_eq!(df.get_column_index("Name"), Some(0));
1753    /// assert_eq!(df.get_column_index("Health"), Some(1));
1754    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1755    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1756    /// assert_eq!(df.get_column_index("Haste"), None);
1757    /// # Ok::<(), PolarsError>(())
1758    /// ```
1759    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1760        let schema = self.schema();
1761        if let Some(idx) = schema.index_of(name) {
1762            if self
1763                .get_columns()
1764                .get(idx)
1765                .is_some_and(|c| c.name() == name)
1766            {
1767                return Some(idx);
1768            }
1769        }
1770
1771        self.columns.iter().position(|s| s.name().as_str() == name)
1772    }
1773
1774    /// Get the column index of a [`Series`] by name, or an error if the column is not found.
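    ///
    /// # Example
    ///
    /// A minimal sketch (column names are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Name" => ["foo"], "Value" => [1])?;
    /// assert!(df.try_get_column_index("Value").is_ok());
    /// assert!(df.try_get_column_index("Missing").is_err());
    /// # Ok::<(), PolarsError>(())
    /// ```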
1775    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1776        self.get_column_index(name)
1777            .ok_or_else(|| polars_err!(col_not_found = name))
1778    }
1779
1780    /// Select a single column by name.
1781    ///
1782    /// # Example
1783    ///
1784    /// ```rust
1785    /// # use polars_core::prelude::*;
1786    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1787    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1788    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1789    ///
1790    /// assert_eq!(df.column("Password")?, &s1);
1791    /// # Ok::<(), PolarsError>(())
1792    /// ```
1793    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1794        let idx = self.try_get_column_index(name)?;
1795        Ok(self.select_at_idx(idx).unwrap())
1796    }
1797
1798    /// Select multiple columns by name.
1799    ///
1800    /// # Example
1801    ///
1802    /// ```rust
1803    /// # use polars_core::prelude::*;
1804    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1805    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1806    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1807    ///
1808    /// assert_eq!(&df[0], sv[0]);
1809    /// assert_eq!(&df[1], sv[1]);
1810    /// # Ok::<(), PolarsError>(())
1811    /// ```
1812    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1813    where
1814        I: IntoIterator<Item = S>,
1815        S: AsRef<str>,
1816    {
1817        names
1818            .into_iter()
1819            .map(|name| self.column(name.as_ref()))
1820            .collect()
1821    }
1822
1823    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1824    ///
1825    /// # Examples
1826    ///
1827    /// ```
1828    /// # use polars_core::prelude::*;
1829    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1830    ///     df.select(["foo", "bar"])
1831    /// }
1832    /// ```
1833    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1834    where
1835        I: IntoIterator<Item = S>,
1836        S: Into<PlSmallStr>,
1837    {
1838        let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1839        self._select_impl(cols.as_slice())
1840    }
1841
1842    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1843        ensure_names_unique(cols, |s| s.as_str())?;
1844        self._select_impl_unchecked(cols)
1845    }
1846
1847    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1848        let selected = self.select_columns_impl(cols)?;
1849        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1850    }
1851
1852    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1853    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1854    where
1855        I: IntoIterator<Item = S>,
1856        S: Into<PlSmallStr>,
1857    {
1858        let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1859        self._select_with_schema_impl(&cols, schema, true)
1860    }
1861
1862    /// Select with a known schema without checking for duplicates in `selection`.
1863    /// The schema names must match the column names of this DataFrame.
1864    pub fn select_with_schema_unchecked<I, S>(
1865        &self,
1866        selection: I,
1867        schema: &Schema,
1868    ) -> PolarsResult<Self>
1869    where
1870        I: IntoIterator<Item = S>,
1871        S: Into<PlSmallStr>,
1872    {
1873        let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1874        self._select_with_schema_impl(&cols, schema, false)
1875    }
1876
1877    /// The schema names must match the column names of this DataFrame.
1878    pub fn _select_with_schema_impl(
1879        &self,
1880        cols: &[PlSmallStr],
1881        schema: &Schema,
1882        check_duplicates: bool,
1883    ) -> PolarsResult<Self> {
1884        if check_duplicates {
1885            ensure_names_unique(cols, |s| s.as_str())?;
1886        }
1887
1888        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1889        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1890    }
1891
1892    /// A non generic implementation to reduce compiler bloat.
1893    fn select_columns_impl_with_schema(
1894        &self,
1895        cols: &[PlSmallStr],
1896        schema: &Schema,
1897    ) -> PolarsResult<Vec<Column>> {
1898        if cfg!(debug_assertions) {
1899            ensure_matching_schema_names(schema, self.schema())?;
1900        }
1901
1902        cols.iter()
1903            .map(|name| {
1904                let index = schema.try_get_full(name.as_str())?.0;
1905                Ok(self.columns[index].clone())
1906            })
1907            .collect()
1908    }
1909
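    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`] with the
    /// selected columns converted to their physical representation.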
1910    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1911    where
1912        I: IntoIterator<Item = S>,
1913        S: Into<PlSmallStr>,
1914    {
1915        let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1916        self.select_physical_impl(&cols)
1917    }
1918
1919    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1920        ensure_names_unique(cols, |s| s.as_str())?;
1921        let selected = self.select_columns_physical_impl(cols)?;
1922        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1923    }
1924
1925    /// # Safety
1926    /// Dtypes must match, as the provided schema becomes the cached schema of the result.
1927    pub unsafe fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1928        let mut df = unsafe { self.project_names(to.iter_names())? };
1929        df.cached_schema = to.into();
1930        Ok(df)
1931    }
1932
1933    /// # Safety
1934    /// This does not check for duplicates on names.
1935    pub unsafe fn project_names(
1936        &self,
1937        names: impl IntoIterator<Item = impl AsRef<str>>,
1938    ) -> PolarsResult<Self> {
1939        let from = self.schema();
1940        let columns = names
1941            .into_iter()
1942            .map(|name| Ok(self.columns[from.try_index_of(name.as_ref())?].clone()))
1943            .collect::<PolarsResult<_>>()?;
1944        let df = unsafe { Self::new_no_checks(self.height(), columns) };
1945        Ok(df)
1946    }
1947
1948    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1949    ///
1950    /// # Example
1951    ///
1952    /// ```rust
1953    /// # use polars_core::prelude::*;
1954    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1955    ///                         "Carbon" => [1, 2, 3],
1956    ///                         "Hydrogen" => [4, 6, 8])?;
1957    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1958    ///
1959    /// assert_eq!(df["Carbon"], sv[0]);
1960    /// assert_eq!(df["Hydrogen"], sv[1]);
1961    /// # Ok::<(), PolarsError>(())
1962    /// ```
1963    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1964        let cols = selection.into_vec();
1965        self.select_columns_impl(&cols)
1966    }
1967
1968    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1969        self.columns
1970            .iter()
1971            .enumerate()
1972            .map(|(i, s)| (s.name().as_str(), i))
1973            .collect()
1974    }
1975
1976    /// A non generic implementation to reduce compiler bloat.
1977    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1978        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1979            let name_to_idx = self._names_to_idx_map();
1980            cols.iter()
1981                .map(|name| {
1982                    let idx = *name_to_idx
1983                        .get(name.as_str())
1984                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1985                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1986                })
1987                .collect::<PolarsResult<Vec<_>>>()?
1988        } else {
1989            cols.iter()
1990                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1991                .collect::<PolarsResult<Vec<_>>>()?
1992        };
1993
1994        Ok(selected)
1995    }
1996
1997    /// A non generic implementation to reduce compiler bloat.
1998    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1999        let selected = if cols.len() > 1 && self.columns.len() > 10 {
2000            // We hash because there are users that have millions of columns.
2001            // # https://github.com/pola-rs/polars/issues/1023
2002            let name_to_idx = self._names_to_idx_map();
2003
2004            cols.iter()
2005                .map(|name| {
2006                    let idx = *name_to_idx
2007                        .get(name.as_str())
2008                        .ok_or_else(|| polars_err!(col_not_found = name))?;
2009                    Ok(self.select_at_idx(idx).unwrap().clone())
2010                })
2011                .collect::<PolarsResult<Vec<_>>>()?
2012        } else {
2013            cols.iter()
2014                .map(|c| self.column(c.as_str()).cloned())
2015                .collect::<PolarsResult<Vec<_>>>()?
2016        };
2017
2018        Ok(selected)
2019    }
2020
2021    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
2022        // If there is a filtered column, just take its length as the new height.
2023        if let Some(fst) = filtered.first() {
2024            return fst.len();
2025        }
2026
2027        // Otherwise, count the number of values that the mask keeps and return that as the height.
2028        let num_trues = mask.num_trues();
2029        if mask.len() == self.height() {
2030            num_trues
2031        } else {
2032            // This is for broadcasting masks
2033            debug_assert!(num_trues == 0 || num_trues == 1);
2034            self.height() * num_trues
2035        }
2036    }
2037
2038    /// Take the [`DataFrame`] rows by a boolean mask.
2039    ///
2040    /// # Example
2041    ///
2042    /// ```
2043    /// # use polars_core::prelude::*;
2044    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2045    ///     let mask = df.column("sepal_width")?.is_not_null();
2046    ///     df.filter(&mask)
2047    /// }
2048    /// ```
2049    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2050        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2051        let height = self.filter_height(&new_col, mask);
2052
2053        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2054    }
2055
2056    /// Same as `filter` but does not parallelize.
2057    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2058        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2059        let height = self.filter_height(&new_col, mask);
2060
2061        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2062    }
2063
2064    /// Take [`DataFrame`] rows by index values.
2065    ///
2066    /// # Example
2067    ///
2068    /// ```
2069    /// # use polars_core::prelude::*;
2070    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2071    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2072    ///     df.take(&idx)
2073    /// }
2074    /// ```
2075    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2076        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2077
2078        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2079    }
2080
2081    /// # Safety
2082    /// The indices must be in-bounds.
2083    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2084        self.take_unchecked_impl(idx, true)
2085    }
2086
2087    /// # Safety
2088    /// The indices must be in-bounds.
2089    pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
2090        match group {
2091            GroupsIndicator::Idx((_, indices)) => unsafe {
2092                self.take_slice_unchecked_impl(indices.as_slice(), false)
2093            },
2094            GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
2095        }
2096    }
2097
2098    /// # Safety
2099    /// The indices must be in-bounds.
2100    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2101        let cols = if allow_threads && POOL.current_num_threads() > 1 {
2102            POOL.install(|| {
2103                if POOL.current_num_threads() > self.width() {
2104                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2105                    if self.len() / stride >= 2 {
2106                        self._apply_columns_par(&|c| {
2107                            // Nested types initiate a rechunk in their take_unchecked implementation.
2108                            // If we do not rechunk, it will result in rechunk storms downstream.
2109                            let c = if c.dtype().is_nested() {
2110                                &c.rechunk()
2111                            } else {
2112                                c
2113                            };
2114
2115                            (0..idx.len().div_ceil(stride))
2116                                .into_par_iter()
2117                                .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
2118                                .reduce(
2119                                    || Column::new_empty(c.name().clone(), c.dtype()),
2120                                    |mut a, b| {
2121                                        a.append_owned(b).unwrap();
2122                                        a
2123                                    },
2124                                )
2125                        })
2126                    } else {
2127                        self._apply_columns_par(&|c| c.take_unchecked(idx))
2128                    }
2129                } else {
2130                    self._apply_columns_par(&|c| c.take_unchecked(idx))
2131                }
2132            })
2133        } else {
2134            self._apply_columns(&|s| s.take_unchecked(idx))
2135        };
2136        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2137    }
2138
2139    /// # Safety
2140    /// The indices must be in-bounds.
2141    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2142        self.take_slice_unchecked_impl(idx, true)
2143    }
2144
2145    /// # Safety
2146    /// The indices must be in-bounds.
2147    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2148        let cols = if allow_threads && POOL.current_num_threads() > 1 {
2149            POOL.install(|| {
2150                if POOL.current_num_threads() > self.width() {
2151                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2152                    if self.len() / stride >= 2 {
2153                        self._apply_columns_par(&|c| {
2154                            // Nested types initiate a rechunk in their take_unchecked implementation.
2155                            // If we do not rechunk, it will result in rechunk storms downstream.
2156                            let c = if c.dtype().is_nested() {
2157                                &c.rechunk()
2158                            } else {
2159                                c
2160                            };
2161
2162                            (0..idx.len().div_ceil(stride))
2163                                .into_par_iter()
2164                                .map(|i| {
2165                                    let idx = &idx[i * stride..];
2166                                    let idx = &idx[..idx.len().min(stride)];
2167                                    c.take_slice_unchecked(idx)
2168                                })
2169                                .reduce(
2170                                    || Column::new_empty(c.name().clone(), c.dtype()),
2171                                    |mut a, b| {
2172                                        a.append_owned(b).unwrap();
2173                                        a
2174                                    },
2175                                )
2176                        })
2177                    } else {
2178                        self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2179                    }
2180                } else {
2181                    self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2182                }
2183            })
2184        } else {
2185            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2186        };
2187        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2188    }
2189
2190    /// Rename a column in the [`DataFrame`].
2191    ///
2192    /// Should not be called in a loop as that can lead to quadratic behavior.
2193    ///
2194    /// # Example
2195    ///
2196    /// ```
2197    /// # use polars_core::prelude::*;
2198    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2199    ///     let original_name = "foo";
2200    ///     let new_name = "bar";
2201    ///     df.rename(original_name, new_name.into())
2202    /// }
2203    /// ```
2204    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2205        if column == name.as_str() {
2206            return Ok(self);
2207        }
2208        polars_ensure!(
2209            !self.schema().contains(&name),
2210            Duplicate: "column rename attempted with already existing name \"{name}\""
2211        );
2212
2213        self.get_column_index(column)
2214            .and_then(|idx| self.columns.get_mut(idx))
2215            .ok_or_else(|| polars_err!(col_not_found = column))
2216            .map(|c| c.rename(name))?;
2217        self.clear_schema();
2218
2219        Ok(self)
2220    }
2221
2222    pub fn rename_many<'a>(
2223        &mut self,
2224        renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
2225    ) -> PolarsResult<&mut Self> {
2226        let mut schema = self.schema().as_ref().clone();
2227        self.clear_schema();
2228
2229        for (from, to) in renames {
2230            if from == to.as_str() {
2231                continue;
2232            }
2233
2234            polars_ensure!(
2235                !schema.contains(&to),
2236                Duplicate: "column rename attempted with already existing name \"{to}\""
2237            );
2238
2239            match schema.get_full(from) {
2240                None => polars_bail!(col_not_found = from),
2241                Some((idx, _, _)) => {
2242                    let (n, _) = schema.get_at_index_mut(idx).unwrap();
2243                    *n = to.clone();
2244                    self.columns.get_mut(idx).unwrap().rename(to);
2245                },
2246            }
2247        }
2248
2249        self.cached_schema = OnceLock::from(Arc::new(schema));
2250        Ok(self)
2251    }
2252
2253    /// Sort [`DataFrame`] in place.
2254    ///
2255    /// See [`DataFrame::sort`] for more details.
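    ///
    /// # Example
    ///
    /// A minimal sketch with default sort options (the column name and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [3, 1, 2])?;
    /// df.sort_in_place(["a"], Default::default())?;
    /// assert!(df.equals(&df!("a" => [1, 2, 3])?));
    /// # Ok::<(), PolarsError>(())
    /// ```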
2256    pub fn sort_in_place(
2257        &mut self,
2258        by: impl IntoVec<PlSmallStr>,
2259        sort_options: SortMultipleOptions,
2260    ) -> PolarsResult<&mut Self> {
2261        let by_column = self.select_columns(by)?;
2262        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2263        Ok(self)
2264    }
2265
2266    #[doc(hidden)]
2267    /// This is the dispatch of `Self::sort`, and exists to reduce compile-time bloat from monomorphization.
2268    pub fn sort_impl(
2269        &self,
2270        by_column: Vec<Column>,
2271        sort_options: SortMultipleOptions,
2272        slice: Option<(i64, usize)>,
2273    ) -> PolarsResult<Self> {
2274        if by_column.is_empty() {
2275            // If no columns selected, any order (including original order) is correct.
2276            return if let Some((offset, len)) = slice {
2277                Ok(self.slice(offset, len))
2278            } else {
2279                Ok(self.clone())
2280            };
2281        }
2282
2283        // Note that the by_column argument may also contain evaluated expressions from
2284        // polars-lazy that are not even present in this dataframe. Therefore, when we
2285        // try to set the first column as sorted, we ignore the error, as the expressions
2286        // may not be present (they are renamed to `_POLARS_SORT_COLUMN_i`).
2287        let first_descending = sort_options.descending[0];
2288        let first_by_column = by_column[0].name().to_string();
2289
2290        let set_sorted = |df: &mut DataFrame| {
2291            // Mark the first sort column as sorted; if the column does not exist it
2292            // is ok, because we sorted by an expression not present in the dataframe
2293            let _ = df.apply(&first_by_column, |s| {
2294                let mut s = s.clone();
2295                if first_descending {
2296                    s.set_sorted_flag(IsSorted::Descending)
2297                } else {
2298                    s.set_sorted_flag(IsSorted::Ascending)
2299                }
2300                s
2301            });
2302        };
2303        if self.is_empty() {
2304            let mut out = self.clone();
2305            set_sorted(&mut out);
2306            return Ok(out);
2307        }
2308
2309        if let Some((0, k)) = slice {
2310            if k < self.len() {
2311                return self.bottom_k_impl(k, by_column, sort_options);
2312            }
2313        }
2314        // Check if the required column is already sorted; if so we can exit early
2315        // We can do so when there is only one column to sort by, for multiple columns
2316        // it will be complicated to do so
2317        #[cfg(feature = "dtype-categorical")]
2318        let is_not_categorical_enum =
2319            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2320                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2321
2322        #[cfg(not(feature = "dtype-categorical"))]
2323        #[allow(non_upper_case_globals)]
2324        const is_not_categorical_enum: bool = true;
2325
2326        if by_column.len() == 1 && is_not_categorical_enum {
2327            let required_sorting = if sort_options.descending[0] {
2328                IsSorted::Descending
2329            } else {
2330                IsSorted::Ascending
2331            };
2332            // If the null count is 0, then nulls_last doesn't matter.
2333            // It is safe to get the value at the last position since the dataframe is not empty (handled above).
2334            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2335                && ((by_column[0].null_count() == 0)
2336                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2337                        == sort_options.nulls_last[0]);
2338
2339            if no_sorting_required {
2340                return if let Some((offset, len)) = slice {
2341                    Ok(self.slice(offset, len))
2342                } else {
2343                    Ok(self.clone())
2344                };
2345            }
2346        }
2347
2348        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2349        let allow_threads = sort_options.multithreaded;
2350
2351        // a lot of indirection in both sorting and take
2352        let mut df = self.clone();
2353        let df = df.as_single_chunk_par();
2354        let mut take = match (by_column.len(), has_nested) {
2355            (1, false) => {
2356                let s = &by_column[0];
2357                let options = SortOptions {
2358                    descending: sort_options.descending[0],
2359                    nulls_last: sort_options.nulls_last[0],
2360                    multithreaded: sort_options.multithreaded,
2361                    maintain_order: sort_options.maintain_order,
2362                    limit: sort_options.limit,
2363                };
2364                // fast path for a frame with a single series
2365                // no need to compute the sort indices and then take by these indices
2366                // simply sort and return as frame
2367                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2368                    let mut out = s.sort_with(options)?;
2369                    if let Some((offset, len)) = slice {
2370                        out = out.slice(offset, len);
2371                    }
2372                    return Ok(out.into_frame());
2373                }
2374                s.arg_sort(options)
2375            },
2376            _ => arg_sort(&by_column, sort_options)?,
2377        };
2378
2379        if let Some((offset, len)) = slice {
2380            take = take.slice(offset, len);
2381        }
2382
2383        // SAFETY:
2384        // the created indices are in bounds
2385        let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
2386        set_sorted(&mut df);
2387        Ok(df)
2388    }
2389
2390    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2391    ///
2392    /// This dataframe does not necessarily have a specified schema and may be changed at any
2393    /// point. It is primarily used for debugging.
2394    pub fn _to_metadata(&self) -> DataFrame {
2395        let num_columns = self.columns.len();
2396
2397        let mut column_names =
2398            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2399        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2400        let mut sorted_asc_ca =
2401            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2402        let mut sorted_dsc_ca =
2403            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2404        let mut fast_explode_list_ca =
2405            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2406        let mut materialized_at_ca =
2407            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2408
2409        for col in &self.columns {
2410            let flags = col.get_flags();
2411
2412            let (repr, materialized_at) = match col {
2413                Column::Series(s) => ("series", s.materialized_at()),
2414                Column::Scalar(_) => ("scalar", None),
2415            };
2416            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2417            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2418            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2419
2420            column_names.append_value(col.name().clone());
2421            repr_ca.append_value(repr);
2422            sorted_asc_ca.append_value(sorted_asc);
2423            sorted_dsc_ca.append_value(sorted_dsc);
2424            fast_explode_list_ca.append_value(fast_explode_list);
2425            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2426        }
2427
2428        unsafe {
2429            DataFrame::new_no_checks(
2430                self.width(),
2431                vec![
2432                    column_names.finish().into_column(),
2433                    repr_ca.finish().into_column(),
2434                    sorted_asc_ca.finish().into_column(),
2435                    sorted_dsc_ca.finish().into_column(),
2436                    fast_explode_list_ca.finish().into_column(),
2437                    materialized_at_ca.finish().into_column(),
2438                ],
2439            )
2440        }
2441    }
2442
2443    /// Return a sorted clone of this [`DataFrame`].
2444    ///
2445    /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2446    /// # Example
2447    ///
2448    /// Sort by a single column with default options:
2449    /// ```
2450    /// # use polars_core::prelude::*;
2451    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2452    ///     df.sort(["sepal_width"], Default::default())
2453    /// }
2454    /// ```
2455    /// Sort by a single column with specific order:
2456    /// ```
2457    /// # use polars_core::prelude::*;
2458    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2459    ///     df.sort(
2460    ///         ["sepal_width"],
2461    ///         SortMultipleOptions::new()
2462    ///             .with_order_descending(descending)
2463    ///     )
2464    /// }
2465    /// ```
2466    /// Sort by multiple columns, specifying the order for each column:
2467    /// ```
2468    /// # use polars_core::prelude::*;
2469    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2470    ///     df.sort(
2471    ///         ["sepal_width", "sepal_length"],
2472    ///         SortMultipleOptions::new()
2473    ///             .with_order_descending_multi([false, true])
2474    ///     )
2475    /// }
2476    /// ```
2477    /// See [`SortMultipleOptions`] for more options.
2478    ///
2479    /// Also see [`DataFrame::sort_in_place`].
2480    pub fn sort(
2481        &self,
2482        by: impl IntoVec<PlSmallStr>,
2483        sort_options: SortMultipleOptions,
2484    ) -> PolarsResult<Self> {
2485        let mut df = self.clone();
2486        df.sort_in_place(by, sort_options)?;
2487        Ok(df)
2488    }
2489
2490    /// Replace a column with a [`Series`].
2491    ///
2492    /// # Example
2493    ///
2494    /// ```rust
2495    /// # use polars_core::prelude::*;
2496    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2497    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
2498    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2499    ///
2500    /// assert!(df.replace("Nation", s.clone()).is_err());
2501    /// assert!(df.replace("Country", s).is_ok());
2502    /// # Ok::<(), PolarsError>(())
2503    /// ```
2504    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2505        self.apply(column, |_| new_col.into_series())
2506    }
2507
2508    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2509    /// is that the `column` argument determines the name of the column and not the name
2510    /// of the `Series` passed to this method.
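    ///
    /// # Example
    ///
    /// A minimal sketch (names and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2])?;
    /// // The data ends up under the given name, not under the Series' own name.
    /// df.replace_or_add("b".into(), Series::new("ignored".into(), [10, 20]))?;
    /// assert!(df.column("b").is_ok());
    /// assert_eq!(df.shape(), (2, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```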
2511    pub fn replace_or_add<S: IntoSeries>(
2512        &mut self,
2513        column: PlSmallStr,
2514        new_col: S,
2515    ) -> PolarsResult<&mut Self> {
2516        let mut new_col = new_col.into_series();
2517        new_col.rename(column);
2518        self.with_column(new_col)
2519    }
2520
2521    /// Replace column at index `idx` with a [`Series`].
2522    ///
2523    /// # Example
2524    ///
2525    /// ```ignore
2526    /// # use polars_core::prelude::*;
2527    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2528    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2529    /// let mut df = DataFrame::new(vec![s0, s1])?;
2530    ///
2531    /// // Add 32 to get lowercase ascii values
2532    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2533    /// # Ok::<(), PolarsError>(())
2534    /// ```
2535    pub fn replace_column<C: IntoColumn>(
2536        &mut self,
2537        index: usize,
2538        new_column: C,
2539    ) -> PolarsResult<&mut Self> {
2540        polars_ensure!(
2541            index < self.width(),
2542            ShapeMismatch:
2543            "unable to replace at index {}, the DataFrame has only {} columns",
2544            index, self.width(),
2545        );
2546        let mut new_column = new_column.into_column();
2547        polars_ensure!(
2548            new_column.len() == self.height(),
2549            ShapeMismatch:
2550            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2551            new_column.len(), self.height(),
2552        );
2553        let old_col = &mut self.columns[index];
2554        mem::swap(old_col, &mut new_column);
2555        self.clear_schema();
2556        Ok(self)
2557    }
2558
2559    /// Apply a closure to a column. This is the recommended way to do in-place modification.
2560    ///
2561    /// # Example
2562    ///
2563    /// ```rust
2564    /// # use polars_core::prelude::*;
2565    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2566    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2567    /// let mut df = DataFrame::new(vec![s0, s1])?;
2568    ///
2569    /// fn str_to_len(str_val: &Column) -> Column {
2570    ///     str_val.str()
2571    ///         .unwrap()
2572    ///         .into_iter()
2573    ///         .map(|opt_name: Option<&str>| {
2574    ///             opt_name.map(|name: &str| name.len() as u32)
2575    ///          })
2576    ///         .collect::<UInt32Chunked>()
2577    ///         .into_column()
2578    /// }
2579    ///
2580    /// // Replace the names column by the length of the names.
2581    /// df.apply("names", str_to_len);
2582    /// # Ok::<(), PolarsError>(())
2583    /// ```
2584    /// Results in:
2585    ///
2586    /// ```text
2587    /// +--------+-------+
2588    /// | foo    | names |
2589    /// | ---    | ---   |
2590    /// | str    | u32   |
2591    /// +========+=======+
2592    /// | "ham"  | 4     |
2593    /// +--------+-------+
2594    /// | "spam" | 6     |
2595    /// +--------+-------+
2596    /// | "egg"  | 3     |
2597    /// +--------+-------+
2598    /// ```
2599    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2600    where
2601        F: FnOnce(&Column) -> C,
2602        C: IntoColumn,
2603    {
2604        let idx = self.check_name_to_idx(name)?;
2605        self.apply_at_idx(idx, f)?;
2606        Ok(self)
2607    }
2608
2609    /// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
2610    /// modification.
2611    ///
2612    /// # Example
2613    ///
2614    /// ```rust
2615    /// # use polars_core::prelude::*;
2616    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2617    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2618    /// let mut df = DataFrame::new(vec![s0, s1])?;
2619    ///
2620    /// // Add 32 to get lowercase ascii values
2621    /// df.apply_at_idx(1, |s| s + 32);
2622    /// # Ok::<(), PolarsError>(())
2623    /// ```
2624    /// Results in:
2625    ///
2626    /// ```text
2627    /// +--------+-------+
2628    /// | foo    | ascii |
2629    /// | ---    | ---   |
2630    /// | str    | i32   |
2631    /// +========+=======+
2632    /// | "ham"  | 102   |
2633    /// +--------+-------+
2634    /// | "spam" | 111   |
2635    /// +--------+-------+
2636    /// | "egg"  | 111   |
2637    /// +--------+-------+
2638    /// ```
2639    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2640    where
2641        F: FnOnce(&Column) -> C,
2642        C: IntoColumn,
2643    {
2644        let df_height = self.height();
2645        let width = self.width();
2646        let col = self.columns.get_mut(idx).ok_or_else(|| {
2647            polars_err!(
2648                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2649                idx, width
2650            )
2651        })?;
2652        let name = col.name().clone();
2653        let dtype_before = col.dtype().clone();
2654        let new_col = f(col).into_column();
2655        match new_col.len() {
2656            1 => {
2657                let new_col = new_col.new_from_index(0, df_height);
2658                let _ = mem::replace(col, new_col);
2659            },
2660            len if (len == df_height) => {
2661                let _ = mem::replace(col, new_col);
2662            },
2663            len => polars_bail!(
2664                ShapeMismatch:
2665                "resulting Series has length {} while the DataFrame has height {}",
2666                len, df_height
2667            ),
2668        }
2669
2670        // make sure the name remains the same after applying the closure
2671        unsafe {
2672            let col = self.columns.get_unchecked_mut(idx);
2673            col.rename(name);
2674
2675            if col.dtype() != &dtype_before {
2676                self.clear_schema();
2677            }
2678        }
2679        Ok(self)
2680    }
2681
2682    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in-place
2683    /// modification.
2684    ///
2685    /// # Example
2686    ///
2687    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2688    ///
2689    /// ```rust
2690    /// # use polars_core::prelude::*;
2691    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2692    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2693    /// let mut df = DataFrame::new(vec![s0, s1])?;
2694    ///
2695    /// let idx = vec![0, 1, 4];
2696    ///
2697    /// df.try_apply("foo", |c| {
2698    ///     c.str()?
2699    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2700    /// });
2701    /// # Ok::<(), PolarsError>(())
2702    /// ```
2703    /// Results in:
2704    ///
2705    /// ```text
2706    /// +---------------------+--------+
2707    /// | foo                 | values |
2708    /// | ---                 | ---    |
2709    /// | str                 | i32    |
2710    /// +=====================+========+
2711    /// | "ham-is-modified"   | 1      |
2712    /// +---------------------+--------+
2713    /// | "spam-is-modified"  | 2      |
2714    /// +---------------------+--------+
2715    /// | "egg"               | 3      |
2716    /// +---------------------+--------+
2717    /// | "bacon"             | 4      |
2718    /// +---------------------+--------+
2719    /// | "quack-is-modified" | 5      |
2720    /// +---------------------+--------+
2721    /// ```
2722    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2723    where
2724        F: FnOnce(&Column) -> PolarsResult<C>,
2725        C: IntoColumn,
2726    {
2727        let width = self.width();
2728        let col = self.columns.get_mut(idx).ok_or_else(|| {
2729            polars_err!(
2730                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2731                idx, width
2732            )
2733        })?;
2734        let name = col.name().clone();
2735
2736        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2737
2738        // make sure the name remains the same after applying the closure
2739        unsafe {
2740            let col = self.columns.get_unchecked_mut(idx);
2741            col.rename(name);
2742        }
2743        Ok(self)
2744    }
2745
2746    /// Apply a closure that may fail to a column. This is the recommended way to do in-place
2747    /// modification.
2748    ///
2749    /// # Example
2750    ///
2751    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2752    ///
2753    /// ```rust
2754    /// # use polars_core::prelude::*;
2755    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2756    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2757    /// let mut df = DataFrame::new(vec![s0, s1])?;
2758    ///
2759    /// // create a mask
2760    /// let values = df.column("values")?.as_materialized_series();
2761    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2762    ///
2763    /// df.try_apply("foo", |c| {
2764    ///     c.str()?
2765    ///     .set(&mask, Some("not_within_bounds"))
2766    /// });
2767    /// # Ok::<(), PolarsError>(())
2768    /// ```
2769    /// Results in:
2770    ///
2771    /// ```text
2772    /// +---------------------+--------+
2773    /// | foo                 | values |
2774    /// | ---                 | ---    |
2775    /// | str                 | i32    |
2776    /// +=====================+========+
2777    /// | "not_within_bounds" | 1      |
2778    /// +---------------------+--------+
2779    /// | "spam"              | 2      |
2780    /// +---------------------+--------+
2781    /// | "egg"               | 3      |
2782    /// +---------------------+--------+
2783    /// | "bacon"             | 4      |
2784    /// +---------------------+--------+
2785    /// | "not_within_bounds" | 5      |
2786    /// +---------------------+--------+
2787    /// ```
2788    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2789    where
2790        F: FnOnce(&Series) -> PolarsResult<C>,
2791        C: IntoColumn,
2792    {
2793        let idx = self.try_get_column_index(column)?;
2794        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2795    }
2796
2797    /// Slice the [`DataFrame`] along the rows.
2798    ///
2799    /// # Example
2800    ///
2801    /// ```rust
2802    /// # use polars_core::prelude::*;
2803    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2804    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2805    /// let sl: DataFrame = df.slice(2, 3);
2806    ///
2807    /// assert_eq!(sl.shape(), (3, 2));
2808    /// println!("{}", sl);
2809    /// # Ok::<(), PolarsError>(())
2810    /// ```
2811    /// Output:
2812    /// ```text
2813    /// shape: (3, 2)
2814    /// +-------+-------+
2815    /// | Fruit | Color |
2816    /// | ---   | ---   |
2817    /// | str   | str   |
2818    /// +=======+=======+
2819    /// | Grape | White |
2820    /// +-------+-------+
2821    /// | Fig   | White |
2822    /// +-------+-------+
2823    /// | Fig   | Red   |
2824    /// +-------+-------+
2825    /// ```
2826    #[must_use]
2827    pub fn slice(&self, offset: i64, length: usize) -> Self {
2828        if offset == 0 && length == self.height() {
2829            return self.clone();
2830        }
2831        if length == 0 {
2832            return self.clear();
2833        }
2834        let cols = self
2835            .columns
2836            .iter()
2837            .map(|s| s.slice(offset, length))
2838            .collect::<Vec<_>>();
2839
2840        let height = if let Some(fst) = cols.first() {
2841            fst.len()
2842        } else {
2843            let (_, length) = slice_offsets(offset, length, self.height());
2844            length
2845        };
2846
2847        unsafe { DataFrame::new_no_checks(height, cols) }
2848    }
2849
2850    /// Split [`DataFrame`] at the given `offset`.
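    ///
    /// # Example
    ///
    /// A minimal sketch (values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3, 4])?;
    /// let (left, right) = df.split_at(1);
    /// assert_eq!(left.height(), 1);
    /// assert_eq!(right.height(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```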
2851    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2852        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2853
2854        let (idx, _) = slice_offsets(offset, 0, self.height());
2855
2856        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2857        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2858        (a, b)
2859    }
2860
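    /// Create an empty copy of this [`DataFrame`]: same column names and dtypes, zero rows.
    ///
    /// # Example
    ///
    /// A minimal sketch (names and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let empty = df.clear();
    /// assert_eq!(empty.shape(), (0, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```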
2861    #[must_use]
2862    pub fn clear(&self) -> Self {
2863        let cols = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2864        unsafe { DataFrame::new_no_checks(0, cols) }
2865    }
2866
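    /// Like [`DataFrame::slice`], but slices the columns in parallel.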
2867    #[must_use]
2868    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2869        if offset == 0 && length == self.height() {
2870            return self.clone();
2871        }
2872        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2873        unsafe { DataFrame::new_no_checks(length, columns) }
2874    }
2875
2876    #[must_use]
2877    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2878        if offset == 0 && length == self.height() {
2879            return self.clone();
2880        }
2881        // @scalar-opt
2882        let columns = self._apply_columns(&|s| {
2883            let mut out = s.slice(offset, length);
2884            out.shrink_to_fit();
2885            out
2886        });
2887        unsafe { DataFrame::new_no_checks(length, columns) }
2888    }
2889
2890    /// Get the head of the [`DataFrame`].
2891    ///
2892    /// # Example
2893    ///
2894    /// ```rust
2895    /// # use polars_core::prelude::*;
2896    /// let countries: DataFrame =
2897    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2898    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2899    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2900    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2901    /// assert_eq!(countries.shape(), (5, 4));
2902    ///
2903    /// println!("{}", countries.head(Some(3)));
2904    /// # Ok::<(), PolarsError>(())
2905    /// ```
2906    ///
2907    /// Output:
2908    ///
2909    /// ```text
2910    /// shape: (3, 4)
2911    /// +--------------------+---------------+---------------+------------+
2912    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2913    /// | ---                | ---           | ---           | ---        |
2914    /// | i32                | str           | str           | str        |
2915    /// +====================+===============+===============+============+
2916    /// | 1                  | North America | United States | Washington |
2917    /// +--------------------+---------------+---------------+------------+
2918    /// | 2                  | Asia          | China         | Beijing    |
2919    /// +--------------------+---------------+---------------+------------+
2920    /// | 3                  | Asia          | Japan         | Tokyo      |
2921    /// +--------------------+---------------+---------------+------------+
2922    /// ```
2923    #[must_use]
2924    pub fn head(&self, length: Option<usize>) -> Self {
2925        let cols = self
2926            .columns
2927            .iter()
2928            .map(|c| c.head(length))
2929            .collect::<Vec<_>>();
2930
2931        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2932        let height = usize::min(height, self.height());
2933        unsafe { DataFrame::new_no_checks(height, cols) }
2934    }
2935
2936    /// Get the tail of the [`DataFrame`].
2937    ///
2938    /// # Example
2939    ///
2940    /// ```rust
2941    /// # use polars_core::prelude::*;
2942    /// let countries: DataFrame =
2943    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2944    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2945    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2946    /// assert_eq!(countries.shape(), (5, 3));
2947    ///
2948    /// println!("{}", countries.tail(Some(2)));
2949    /// # Ok::<(), PolarsError>(())
2950    /// ```
2951    ///
2952    /// Output:
2953    ///
2954    /// ```text
2955    /// shape: (2, 3)
2956    /// +-------------+--------------------+---------+
2957    /// | Rank (2021) | Apple Price (€/kg) | Country |
2958    /// | ---         | ---                | ---     |
2959    /// | i32         | f64                | str     |
2960    /// +=============+====================+=========+
2961    /// | 108         | 0.65               | Syria   |
2962    /// +-------------+--------------------+---------+
2963    /// | 109         | 0.52               | Turkey  |
2964    /// +-------------+--------------------+---------+
2965    /// ```
2966    #[must_use]
2967    pub fn tail(&self, length: Option<usize>) -> Self {
2968        let cols = self
2969            .columns
2970            .iter()
2971            .map(|c| c.tail(length))
2972            .collect::<Vec<_>>();
2973
2974        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2975        let height = usize::min(height, self.height());
2976        unsafe { DataFrame::new_no_checks(height, cols) }
2977    }
2978
2979    /// Iterator over the chunks of this [`DataFrame`], as Arrow [`RecordBatch`]es.
2980    ///
2981    /// # Panics
2982    ///
2983    /// Panics if this [`DataFrame`] is not rechunked, i.e. if the chunks of its columns are not aligned.
2984    ///
2985    /// Rechunking is left to the caller: we don't want to take a mutable reference here, and
2986    /// rechunking is costly, so doing it once up front benefits the caller as well.
2987    ///
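    /// # Example
    ///
    /// A minimal sketch with illustrative values (rechunk first if the chunks are not aligned):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
    /// for batch in df.iter_chunks(CompatLevel::newest(), false) {
    ///     println!("record batch with {} rows", batch.len());
    /// }
    /// # Ok::<(), PolarsError>(())
    /// ```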
2988    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2989        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2990        // If any of the columns is binview and we must convert for `compat_level`, we allow
2991        // parallelism, as we then have to allocate and copy into arrow strings/binaries.
2992        let must_convert = compat_level.0 == 0;
2993        let parallel = parallel
2994            && must_convert
2995            && self.columns.len() > 1
2996            && self
2997                .columns
2998                .iter()
2999                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
3000
3001        RecordBatchIter {
3002            columns: &self.columns,
3003            schema: Arc::new(
3004                self.columns
3005                    .iter()
3006                    .map(|c| c.field().to_arrow(compat_level))
3007                    .collect(),
3008            ),
3009            idx: 0,
3010            n_chunks: self.first_col_n_chunks(),
3011            compat_level,
3012            parallel,
3013        }
3014    }
3015
3016    /// Iterator over the chunks of this [`DataFrame`], as Arrow [`RecordBatch`]es of physical values.
3017    ///
3018    /// # Panics
3019    ///
3020    /// Panics if this [`DataFrame`] is not rechunked, i.e. if the chunks of its columns are not aligned.
3021    ///
3022    /// Rechunking is left to the caller: we don't want to take a mutable reference here, and
3023    /// rechunking is costly, so doing it once up front benefits the caller as well.
3024    ///
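    /// # Example
    ///
    /// A minimal sketch with illustrative values:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// for batch in df.iter_chunks_physical() {
    ///     println!("physical record batch with {} rows", batch.len());
    /// }
    /// # Ok::<(), PolarsError>(())
    /// ```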
3025    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
3026        debug_assert!(!self.should_rechunk());
3027        PhysRecordBatchIter {
3028            schema: Arc::new(
3029                self.get_columns()
3030                    .iter()
3031                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
3032                    .collect(),
3033            ),
3034            arr_iters: self
3035                .materialized_column_iter()
3036                .map(|s| s.chunks().iter())
3037                .collect(),
3038        }
3039    }
3040
3041    /// Get a [`DataFrame`] with the rows in reversed order.
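    ///
    /// # Example
    ///
    /// A quick sketch with illustrative values:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let reversed = df.reverse();
    /// // "a" is now [3, 2, 1]
    /// # Ok::<(), PolarsError>(())
    /// ```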
3042    #[must_use]
3043    pub fn reverse(&self) -> Self {
3044        let cols = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
3045        unsafe { DataFrame::new_no_checks(self.height(), cols) }
3046    }
3047
3048    /// Shift the values by a given number of periods and fill the slots that become empty due to
3049    /// this operation with `None`.
3050    ///
3051    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
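    ///
    /// # Example
    ///
    /// A short sketch with illustrative values:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let shifted = df.shift(1);
    /// // "a" is now [null, 1, 2]; the last original value is dropped.
    /// # Ok::<(), PolarsError>(())
    /// ```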
3052    #[must_use]
3053    pub fn shift(&self, periods: i64) -> Self {
3054        let col = self._apply_columns_par(&|s| s.shift(periods));
3055        unsafe { DataFrame::new_no_checks(self.height(), col) }
3056    }
3057
3058    /// Replace None values with one of the following strategies:
3059    /// * Forward fill (replace None with the previous value)
3060    /// * Backward fill (replace None with the next value)
3061    /// * Mean fill (replace None with the mean of the whole array)
3062    /// * Min fill (replace None with the minimum of the whole array)
3063    /// * Max fill (replace None with the maximum of the whole array)
3064    ///
3065    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
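    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative values, using the min-fill strategy:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1), None, Some(3)])?;
    /// let filled = df.fill_null(FillNullStrategy::Min)?;
    /// // "a" is now [1, 1, 3]
    /// # Ok::<(), PolarsError>(())
    /// ```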
3066    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3067        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3068
3069        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3070    }
3071
3072    /// Chain functions or closures that operate on a [`DataFrame`].
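    ///
    /// # Example
    ///
    /// A small sketch with a hypothetical closure and illustrative values:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let stacked = df.pipe(|df| df.vstack(&df.clone()))?;
    /// assert_eq!(stacked.height(), 6);
    /// # Ok::<(), PolarsError>(())
    /// ```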
3073    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3074    where
3075        F: Fn(DataFrame) -> PolarsResult<B>,
3076    {
3077        f(self)
3078    }
3079
3080    /// Chain functions or closures that operate on a mutable [`DataFrame`].
3081    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3082    where
3083        F: Fn(&mut DataFrame) -> PolarsResult<B>,
3084    {
3085        f(self)
3086    }
3087
3088    /// Chain functions or closures that operate on a [`DataFrame`], passing extra arguments.
3089    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3090    where
3091        F: Fn(DataFrame, Args) -> PolarsResult<B>,
3092    {
3093        f(self, args)
3094    }
3095
3096    /// Drop duplicate rows from a [`DataFrame`].
3097    /// *This fails when the [`DataFrame`] contains a column of type `List`.*
3098    ///
3099    /// Stable means that the row order is maintained. This has a higher cost than an unstable distinct.
3100    ///
3101    /// # Example
3102    ///
3103    /// ```no_run
3104    /// # use polars_core::prelude::*;
3105    /// let df = df! {
3106    ///               "flt" => [1., 1., 2., 2., 3., 3.],
3107    ///               "int" => [1, 1, 2, 2, 3, 3, ],
3108    ///               "str" => ["a", "a", "b", "b", "c", "c"]
3109    ///           }?;
3110    ///
3111    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3112    /// # Ok::<(), PolarsError>(())
3113    /// ```
3114    /// Returns
3115    ///
3116    /// ```text
3117    /// +-----+-----+-----+
3118    /// | flt | int | str |
3119    /// | --- | --- | --- |
3120    /// | f64 | i32 | str |
3121    /// +=====+=====+=====+
3122    /// | 1   | 1   | "a" |
3123    /// +-----+-----+-----+
3124    /// | 2   | 2   | "b" |
3125    /// +-----+-----+-----+
3126    /// | 3   | 3   | "c" |
3127    /// +-----+-----+-----+
3128    /// ```
3129    #[cfg(feature = "algorithm_group_by")]
3130    pub fn unique_stable(
3131        &self,
3132        subset: Option<&[String]>,
3133        keep: UniqueKeepStrategy,
3134        slice: Option<(i64, usize)>,
3135    ) -> PolarsResult<DataFrame> {
3136        self.unique_impl(
3137            true,
3138            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3139            keep,
3140            slice,
3141        )
3142    }
3143
3144    /// Unstable distinct. See [`DataFrame::unique_stable`].
3145    #[cfg(feature = "algorithm_group_by")]
3146    pub fn unique<I, S>(
3147        &self,
3148        subset: Option<&[String]>,
3149        keep: UniqueKeepStrategy,
3150        slice: Option<(i64, usize)>,
3151    ) -> PolarsResult<DataFrame> {
3152        self.unique_impl(
3153            false,
3154            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3155            keep,
3156            slice,
3157        )
3158    }
3159
3160    #[cfg(feature = "algorithm_group_by")]
3161    pub fn unique_impl(
3162        &self,
3163        maintain_order: bool,
3164        subset: Option<Vec<PlSmallStr>>,
3165        keep: UniqueKeepStrategy,
3166        slice: Option<(i64, usize)>,
3167    ) -> PolarsResult<Self> {
3168        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3169        let mut df = self.clone();
3170        // `take` on multiple chunks is expensive, so rechunk first.
3171        df.as_single_chunk_par();
3172
3173        let columns = match (keep, maintain_order) {
3174            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3175                let gb = df.group_by_stable(names)?;
3176                let groups = gb.get_groups();
3177                let (offset, len) = slice.unwrap_or((0, groups.len()));
3178                let groups = groups.slice(offset, len);
3179                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3180            },
3181            (UniqueKeepStrategy::Last, true) => {
3182                // To keep the last value while maintaining order, the stable groups cannot be used
3183                // directly (they are ordered by first occurrence), so gather the last index per group.
3184                let gb = df.group_by_stable(names)?;
3185                let groups = gb.get_groups();
3186
3187                let last_idx: NoNull<IdxCa> = groups
3188                    .iter()
3189                    .map(|g| match g {
3190                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3191                        GroupsIndicator::Slice([first, len]) => first + len - 1,
3192                    })
3193                    .collect();
3194
3195                let mut last_idx = last_idx.into_inner().sort(false);
3196
3197                if let Some((offset, len)) = slice {
3198                    last_idx = last_idx.slice(offset, len);
3199                }
3200
3201                let last_idx = NoNull::new(last_idx);
3202                let out = unsafe { df.take_unchecked(&last_idx) };
3203                return Ok(out);
3204            },
3205            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3206                let gb = df.group_by(names)?;
3207                let groups = gb.get_groups();
3208                let (offset, len) = slice.unwrap_or((0, groups.len()));
3209                let groups = groups.slice(offset, len);
3210                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3211            },
3212            (UniqueKeepStrategy::Last, false) => {
3213                let gb = df.group_by(names)?;
3214                let groups = gb.get_groups();
3215                let (offset, len) = slice.unwrap_or((0, groups.len()));
3216                let groups = groups.slice(offset, len);
3217                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3218            },
3219            (UniqueKeepStrategy::None, _) => {
3220                let df_part = df.select(names)?;
3221                let mask = df_part.is_unique()?;
3222                let mut filtered = df.filter(&mask)?;
3223
3224                if let Some((offset, len)) = slice {
3225                    filtered = filtered.slice(offset, len);
3226                }
3227                return Ok(filtered);
3228            },
3229        };
3230        let height = Self::infer_height(&columns);
3231        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3232    }
3233
3234    /// Get a mask of all the unique rows in the [`DataFrame`].
3235    ///
3236    /// # Example
3237    ///
3238    /// ```no_run
3239    /// # use polars_core::prelude::*;
3240    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3241    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3242    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3243    ///
3244    /// assert!(ca.all());
3245    /// # Ok::<(), PolarsError>(())
3246    /// ```
3247    #[cfg(feature = "algorithm_group_by")]
3248    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3249        let gb = self.group_by(self.get_column_names_owned())?;
3250        let groups = gb.get_groups();
3251        Ok(is_unique_helper(
3252            groups,
3253            self.height() as IdxSize,
3254            true,
3255            false,
3256        ))
3257    }
3258
3259    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3260    ///
3261    /// # Example
3262    ///
3263    /// ```no_run
3264    /// # use polars_core::prelude::*;
3265    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3266    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3267    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3268    ///
3269    /// assert!(!ca.all());
3270    /// # Ok::<(), PolarsError>(())
3271    /// ```
3272    #[cfg(feature = "algorithm_group_by")]
3273    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3274        let gb = self.group_by(self.get_column_names_owned())?;
3275        let groups = gb.get_groups();
3276        Ok(is_unique_helper(
3277            groups,
3278            self.height() as IdxSize,
3279            false,
3280            true,
3281        ))
3282    }
3283
3284    /// Create a new [`DataFrame`] that shows the null counts per column.
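    ///
    /// # Example
    ///
    /// A quick sketch with illustrative values:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [Some(1), None, Some(3)], "b" => [None::<i32>, None, Some(4)])?;
    /// let counts = df.null_count();
    /// // shape (1, 2): "a" = 1, "b" = 2
    /// # Ok::<(), PolarsError>(())
    /// ```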
3285    #[must_use]
3286    pub fn null_count(&self) -> Self {
3287        let cols = self
3288            .columns
3289            .iter()
3290            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3291            .collect();
3292        unsafe { Self::new_no_checks(1, cols) }
3293    }
3294
3295    /// Hash and combine the row values.
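    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative values (requires the `row_hash` feature):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// let hashes = df.hash_rows(None)?;
    /// assert_eq!(hashes.len(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```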
3296    #[cfg(feature = "row_hash")]
3297    pub fn hash_rows(
3298        &mut self,
3299        hasher_builder: Option<PlSeedableRandomStateQuality>,
3300    ) -> PolarsResult<UInt64Chunked> {
3301        let dfs = split_df(self, POOL.current_num_threads(), false);
3302        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3303
3304        let mut iter = cas.into_iter();
3305        let mut acc_ca = iter.next().unwrap();
3306        for ca in iter {
3307            acc_ca.append(&ca)?;
3308        }
3309        Ok(acc_ca.rechunk().into_owned())
3310    }
3311
3312    /// Get the supertype of the columns in this [`DataFrame`].
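    ///
    /// # Example
    ///
    /// A sketch with illustrative values: an `Int32` column and a `Float64` column have the
    /// supertype `Float64`.
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("i" => [1i32, 2, 3], "f" => [1.0f64, 2.0, 3.0])?;
    /// let supertype = df.get_supertype().unwrap()?;
    /// assert_eq!(supertype, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```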
3313    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3314        self.columns
3315            .iter()
3316            .map(|s| Ok(s.dtype().clone()))
3317            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3318    }
3319
3320    /// Take by the index values given in the slice `idx`.
3321    /// # Warning
3322    /// Be careful with allowing threads when calling this in a large hot loop:
3323    /// every thread split may run on the rayon stack and lead to a stack overflow.
3324    #[doc(hidden)]
3325    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3326        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3327    }
3328
3329    /// Take by the index values given in the slice `idx`. Use this over `_take_unchecked_slice`
3330    /// if the index values in `idx` are sorted, as it maintains the sorted flags.
3331    ///
3332    /// # Warning
3333    /// Be careful with allowing threads when calling this in a large hot loop:
3334    /// every thread split may run on the rayon stack and lead to a stack overflow.
3335    #[doc(hidden)]
3336    pub unsafe fn _take_unchecked_slice_sorted(
3337        &self,
3338        idx: &[IdxSize],
3339        allow_threads: bool,
3340        sorted: IsSorted,
3341    ) -> Self {
3342        #[cfg(debug_assertions)]
3343        {
3344            if idx.len() > 2 {
3345                match sorted {
3346                    IsSorted::Ascending => {
3347                        assert!(idx[0] <= idx[idx.len() - 1]);
3348                    },
3349                    IsSorted::Descending => {
3350                        assert!(idx[0] >= idx[idx.len() - 1]);
3351                    },
3352                    _ => {},
3353                }
3354            }
3355        }
3356        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3357        ca.set_sorted_flag(sorted);
3358        self.take_unchecked_impl(&ca, allow_threads)
3359    }
3360
3361    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3362    #[doc(hidden)]
3363    pub fn _partition_by_impl(
3364        &self,
3365        cols: &[PlSmallStr],
3366        stable: bool,
3367        include_key: bool,
3368        parallel: bool,
3369    ) -> PolarsResult<Vec<DataFrame>> {
3370        let selected_keys = self.select_columns(cols.iter().cloned())?;
3371        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3372        let groups = groups.into_groups();
3373
3374        // drop key columns prior to calculation if requested
3375        let df = if include_key {
3376            self.clone()
3377        } else {
3378            self.drop_many(cols.iter().cloned())
3379        };
3380
3381        if parallel {
3382            // Don't allow threads inside the takes below: take already parallelizes a lot
3383            // and nesting that parallelism may easily lead to a stack overflow.
3384            POOL.install(|| {
3385                match groups.as_ref() {
3386                    GroupsType::Idx(idx) => {
3387                        // Rechunk up front, otherwise the gather may rechunk for every group (#17562).
3388                        let mut df = df.clone();
3389                        df.as_single_chunk_par();
3390                        Ok(idx
3391                            .into_par_iter()
3392                            .map(|(_, group)| {
3393                                // groups are in bounds
3394                                unsafe {
3395                                    df._take_unchecked_slice_sorted(
3396                                        group,
3397                                        false,
3398                                        IsSorted::Ascending,
3399                                    )
3400                                }
3401                            })
3402                            .collect())
3403                    },
3404                    GroupsType::Slice { groups, .. } => Ok(groups
3405                        .into_par_iter()
3406                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
3407                        .collect()),
3408                }
3409            })
3410        } else {
3411            match groups.as_ref() {
3412                GroupsType::Idx(idx) => {
3413                    // Rechunk up front, otherwise the gather may rechunk for every group (#17562).
3414                    let mut df = df;
3415                    df.as_single_chunk();
3416                    Ok(idx
3417                        .into_iter()
3418                        .map(|(_, group)| {
3419                            // groups are in bounds
3420                            unsafe {
3421                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3422                            }
3423                        })
3424                        .collect())
3425                },
3426                GroupsType::Slice { groups, .. } => Ok(groups
3427                    .iter()
3428                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3429                    .collect()),
3430            }
3431        }
3432    }
3433
3434    /// Split into multiple [`DataFrame`]s, partitioned by groups.
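    ///
    /// # Example
    ///
    /// A minimal sketch with illustrative values (requires the `partition_by` feature):
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("key" => ["a", "a", "b"], "val" => [1, 2, 3])?;
    /// let parts = df.partition_by(["key"], true)?;
    /// assert_eq!(parts.len(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```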
3435    #[cfg(feature = "partition_by")]
3436    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3437    where
3438        I: IntoIterator<Item = S>,
3439        S: Into<PlSmallStr>,
3440    {
3441        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
3442        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3443    }
3444
3445    /// Split into multiple [`DataFrame`]s, partitioned by groups.
3446    /// The order of the groups is maintained.
3447    #[cfg(feature = "partition_by")]
3448    pub fn partition_by_stable<I, S>(
3449        &self,
3450        cols: I,
3451        include_key: bool,
3452    ) -> PolarsResult<Vec<DataFrame>>
3453    where
3454        I: IntoIterator<Item = S>,
3455        S: Into<PlSmallStr>,
3456    {
3457        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
3458        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3459    }
3460
3461    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3462    /// inserted as columns.
3463    #[cfg(feature = "dtype-struct")]
3464    pub fn unnest<I: IntoVec<PlSmallStr>>(
3465        &self,
3466        cols: I,
3467        separator: Option<&str>,
3468    ) -> PolarsResult<DataFrame> {
3469        let cols = cols.into_vec();
3470        self.unnest_impl(cols.into_iter().collect(), separator)
3471    }
3472
3473    #[cfg(feature = "dtype-struct")]
3474    fn unnest_impl(
3475        &self,
3476        cols: PlHashSet<PlSmallStr>,
3477        separator: Option<&str>,
3478    ) -> PolarsResult<DataFrame> {
3479        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3480        let mut count = 0;
3481        for s in &self.columns {
3482            if cols.contains(s.name()) {
3483                let ca = s.struct_()?.clone();
3484                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3485                    if let Some(separator) = &separator {
3486                        f.rename(polars_utils::format_pl_smallstr!(
3487                            "{}{}{}",
3488                            s.name(),
3489                            separator,
3490                            f.name()
3491                        ));
3492                    }
3493                    Column::from(f)
3494                }));
3495                count += 1;
3496            } else {
3497                new_cols.push(s.clone())
3498            }
3499        }
3500        if count != cols.len() {
3501            // one or more columns not found
3502            // the code below will return an error with the missing name
3503            let schema = self.schema();
3504            for col in cols {
3505                let _ = schema
3506                    .get(col.as_str())
3507                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3508            }
3509        }
3510        DataFrame::new(new_cols)
3511    }
3512
3513    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3514        cols.first().map_or(0, Column::len)
3515    }
3516
3517    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3518        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3519        // `append_chunk` or something like it. It is just quite difficult to make that safe.
3520        let df = DataFrame::from(rb);
3521        polars_ensure!(
3522            self.schema() == df.schema(),
3523            SchemaMismatch: "cannot append record batch with different schema\ngot: {:?}\nexpected: {:?}",
3524            df.schema(), self.schema(),
3525        );
3526        self.vstack_mut_owned_unchecked(df);
3527        Ok(())
3528    }
3529
3530    pub fn into_columns(self) -> Vec<Column> {
3531        self.columns
3532    }
3533}
3534
3535pub struct RecordBatchIter<'a> {
3536    columns: &'a Vec<Column>,
3537    schema: ArrowSchemaRef,
3538    idx: usize,
3539    n_chunks: usize,
3540    compat_level: CompatLevel,
3541    parallel: bool,
3542}
3543
3544impl Iterator for RecordBatchIter<'_> {
3545    type Item = RecordBatch;
3546
3547    fn next(&mut self) -> Option<Self::Item> {
3548        if self.idx >= self.n_chunks {
3549            return None;
3550        }
3551
3552        // Create a batch from the columns' chunks at the current chunk index.
3553        let batch_cols: Vec<ArrayRef> = if self.parallel {
3554            let iter = self
3555                .columns
3556                .par_iter()
3557                .map(Column::as_materialized_series)
3558                .map(|s| s.to_arrow(self.idx, self.compat_level));
3559            POOL.install(|| iter.collect())
3560        } else {
3561            self.columns
3562                .iter()
3563                .map(Column::as_materialized_series)
3564                .map(|s| s.to_arrow(self.idx, self.compat_level))
3565                .collect()
3566        };
3567        self.idx += 1;
3568
3569        let length = batch_cols.first().map_or(0, |arr| arr.len());
3570        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3571    }
3572
3573    fn size_hint(&self) -> (usize, Option<usize>) {
3574        let n = self.n_chunks - self.idx;
3575        (n, Some(n))
3576    }
3577}
3578
3579pub struct PhysRecordBatchIter<'a> {
3580    schema: ArrowSchemaRef,
3581    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3582}
3583
3584impl Iterator for PhysRecordBatchIter<'_> {
3585    type Item = RecordBatch;
3586
3587    fn next(&mut self) -> Option<Self::Item> {
3588        let arrs = self
3589            .arr_iters
3590            .iter_mut()
3591            .map(|phys_iter| phys_iter.next().cloned())
3592            .collect::<Option<Vec<_>>>()?;
3593
3594        let length = arrs.first().map_or(0, |arr| arr.len());
3595        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3596    }
3597
3598    fn size_hint(&self) -> (usize, Option<usize>) {
3599        if let Some(iter) = self.arr_iters.first() {
3600            iter.size_hint()
3601        } else {
3602            (0, None)
3603        }
3604    }
3605}
3606
3607impl Default for DataFrame {
3608    fn default() -> Self {
3609        DataFrame::empty()
3610    }
3611}
3612
3613impl From<DataFrame> for Vec<Column> {
3614    fn from(df: DataFrame) -> Self {
3615        df.columns
3616    }
3617}
3618
3619// utility to test if we can vstack/extend the columns
3620fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3621    polars_ensure!(
3622        left.name() == right.name(),
3623        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3624        left.name(), right.name(),
3625    );
3626    Ok(())
3627}
3628
3629#[cfg(test)]
3630mod test {
3631    use super::*;
3632
3633    fn create_frame() -> DataFrame {
3634        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3635        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3636        DataFrame::new(vec![s0, s1]).unwrap()
3637    }
3638
3639    #[test]
3640    #[cfg_attr(miri, ignore)]
3641    fn test_recordbatch_iterator() {
3642        let df = df!(
3643            "foo" => [1, 2, 3, 4, 5]
3644        )
3645        .unwrap();
3646        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3647        assert_eq!(5, iter.next().unwrap().len());
3648        assert!(iter.next().is_none());
3649    }
3650
3651    #[test]
3652    #[cfg_attr(miri, ignore)]
3653    fn test_select() {
3654        let df = create_frame();
3655        assert_eq!(
3656            df.column("days")
3657                .unwrap()
3658                .as_series()
3659                .unwrap()
3660                .equal(1)
3661                .unwrap()
3662                .sum(),
3663            Some(1)
3664        );
3665    }
3666
3667    #[test]
3668    #[cfg_attr(miri, ignore)]
3669    fn test_filter_broadcast_on_string_col() {
3670        let col_name = "some_col";
3671        let v = vec!["test".to_string()];
3672        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3673        let mut df = DataFrame::new(vec![s0]).unwrap();
3674
3675        df = df
3676            .filter(
3677                &df.column(col_name)
3678                    .unwrap()
3679                    .as_materialized_series()
3680                    .equal("")
3681                    .unwrap(),
3682            )
3683            .unwrap();
3684        assert_eq!(
3685            df.column(col_name)
3686                .unwrap()
3687                .as_materialized_series()
3688                .n_chunks(),
3689            1
3690        );
3691    }
3692
3693    #[test]
3694    #[cfg_attr(miri, ignore)]
3695    fn test_filter_broadcast_on_list_col() {
3696        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3697        let ll: ListChunked = [&s1].iter().copied().collect();
3698
3699        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3700        let new = ll.filter(&mask).unwrap();
3701
3702        assert_eq!(new.chunks.len(), 1);
3703        assert_eq!(new.len(), 0);
3704    }
3705
3706    #[test]
3707    fn slice() {
3708        let df = create_frame();
3709        let sliced_df = df.slice(0, 2);
3710        assert_eq!(sliced_df.shape(), (2, 2));
3711    }
3712
3713    #[test]
3714    fn rechunk_false() {
3715        let df = create_frame();
3716        assert!(!df.should_rechunk())
3717    }
3718
3719    #[test]
3720    fn rechunk_true() -> PolarsResult<()> {
3721        let mut base = df!(
3722            "a" => [1, 2, 3],
3723            "b" => [1, 2, 3]
3724        )?;
3725
3726        // Create a series with multiple chunks
3727        let mut s = Series::new("foo".into(), 0..2);
3728        let s2 = Series::new("bar".into(), 0..1);
3729        s.append(&s2)?;
3730
3731        // Append series to frame
3732        let out = base.with_column(s)?;
3733
3734        // Now we should rechunk
3735        assert!(out.should_rechunk());
3736        Ok(())
3737    }
3738
3739    #[test]
3740    fn test_duplicate_column() {
3741        let mut df = df! {
3742            "foo" => [1, 2, 3]
3743        }
3744        .unwrap();
3745        // check if column is replaced
3746        assert!(
3747            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3748                .is_ok()
3749        );
3750        assert!(
3751            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3752                .is_ok()
3753        );
3754        assert!(df.column("bar").is_ok())
3755    }
3756
3757    #[test]
3758    #[cfg_attr(miri, ignore)]
3759    fn distinct() {
3760        let df = df! {
3761            "flt" => [1., 1., 2., 2., 3., 3.],
3762            "int" => [1, 1, 2, 2, 3, 3, ],
3763            "str" => ["a", "a", "b", "b", "c", "c"]
3764        }
3765        .unwrap();
3766        let df = df
3767            .unique_stable(None, UniqueKeepStrategy::First, None)
3768            .unwrap()
3769            .sort(["flt"], SortMultipleOptions::default())
3770            .unwrap();
3771        let valid = df! {
3772            "flt" => [1., 2., 3.],
3773            "int" => [1, 2, 3],
3774            "str" => ["a", "b", "c"]
3775        }
3776        .unwrap();
3777        assert!(df.equals(&valid));
3778    }
3779
3780    #[test]
3781    fn test_vstack() {
3782        // check that it does not accidentally rechunk
3783        let mut df = df! {
3784            "flt" => [1., 1., 2., 2., 3., 3.],
3785            "int" => [1, 1, 2, 2, 3, 3, ],
3786            "str" => ["a", "a", "b", "b", "c", "c"]
3787        }
3788        .unwrap();
3789
3790        df.vstack_mut(&df.slice(0, 3)).unwrap();
3791        assert_eq!(df.first_col_n_chunks(), 2)
3792    }
3793
3794    #[test]
3795    fn test_vstack_on_empty_dataframe() {
3796        let mut df = DataFrame::empty();
3797
3798        let df_data = df! {
3799            "flt" => [1., 1., 2., 2., 3., 3.],
3800            "int" => [1, 1, 2, 2, 3, 3, ],
3801            "str" => ["a", "a", "b", "b", "c", "c"]
3802        }
3803        .unwrap();
3804
3805        df.vstack_mut(&df_data).unwrap();
3806        assert_eq!(df.height, 6)
3807    }
3808
3809    #[test]
3810    fn test_replace_or_add() -> PolarsResult<()> {
3811        let mut df = df!(
3812            "a" => [1, 2, 3],
3813            "b" => [1, 2, 3]
3814        )?;
3815
3816        // check that the new column is "c" and not "bar".
3817        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3818
3819        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3820        Ok(())
3821    }
3822
3823    #[test]
3824    fn test_unique_keep_none_with_slice() {
3825        let df = df! {
3826            "x" => [1, 2, 3, 2, 1]
3827        }
3828        .unwrap();
3829        let out = df
3830            .unique_stable(
3831                Some(&["x".to_string()][..]),
3832                UniqueKeepStrategy::None,
3833                Some((0, 2)),
3834            )
3835            .unwrap();
3836        let expected = df! {
3837            "x" => [3]
3838        }
3839        .unwrap();
3840        assert!(out.equals(&expected));
3841    }
3842
3843    #[test]
3844    #[cfg(feature = "dtype-i8")]
3845    fn test_apply_result_schema() {
3846        let mut df = df! {
3847            "x" => [1, 2, 3, 2, 1]
3848        }
3849        .unwrap();
3850
3851        let schema_before = df.schema().clone();
3852        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
3853        assert_ne!(&schema_before, df.schema());
3854    }
3855}