polars_core/frame/
mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
/// Strategy describing which row is kept out of a set of duplicate rows.
///
/// The default strategy is [`UniqueKeepStrategy::Any`].
// `IntoStaticStr` together with `serialize_all = "snake_case"` yields snake_case static names.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    // NOTE(review): presumably every row that has a duplicate is dropped — confirm against the
    // implementation of the unique operation.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations.
    #[default]
    Any,
}
66
67fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
68where
69    F: for<'a> FnMut(&'a T) -> &'a str,
70{
71    // Always unique.
72    if items.len() <= 1 {
73        return Ok(());
74    }
75
76    if items.len() <= 4 {
77        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
78        for i in 0..items.len() - 1 {
79            let name = get_name(&items[i]);
80            for other in items.iter().skip(i + 1) {
81                if name == get_name(other) {
82                    polars_bail!(duplicate = name);
83                }
84            }
85        }
86    } else {
87        let mut names = PlHashSet::with_capacity(items.len());
88        for item in items {
89            let name = get_name(item);
90            if !names.insert(name) {
91                polars_bail!(duplicate = name);
92            }
93        }
94    }
95    Ok(())
96}
97
/// A contiguous growable collection of `Column`s that have the same length.
99///
100/// ## Use declarations
101///
102/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
103///
104/// ```rust
105/// use polars_core::prelude::*; // if the crate polars-core is used directly
106/// // use polars::prelude::*;      if the crate polars is used
107/// ```
108///
109/// # Initialization
110/// ## Default
111///
112/// A `DataFrame` can be initialized empty:
113///
114/// ```rust
115/// # use polars_core::prelude::*;
116/// let df = DataFrame::default();
117/// assert!(df.is_empty());
118/// ```
119///
/// ## Wrapping a `Vec<Column>`
///
/// A `DataFrame` is built upon a `Vec<Column>` where the `Column`s have the same length.
123///
124/// ```rust
125/// # use polars_core::prelude::*;
126/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
127/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
128///
129/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
130/// ```
131///
132/// ## Using a macro
133///
134/// The [`df!`] macro is a convenient method:
135///
136/// ```rust
137/// # use polars_core::prelude::*;
138/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
139///                                       "Color" => ["Red", "Yellow", "Green"]);
140/// ```
141///
142/// ## Using a CSV file
143///
144/// See the `polars_io::csv::CsvReader`.
145///
146/// # Indexing
147/// ## By a number
148///
149/// The `Index<usize>` is implemented for the `DataFrame`.
150///
151/// ```rust
152/// # use polars_core::prelude::*;
153/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
154///              "Color" => ["Red", "Yellow", "Green"])?;
155///
156/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
157/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
158/// # Ok::<(), PolarsError>(())
159/// ```
160///
161/// ## By a `Series` name
162///
163/// ```rust
164/// # use polars_core::prelude::*;
165/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
166///              "Color" => ["Red", "Yellow", "Green"])?;
167///
168/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
169/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
170/// # Ok::<(), PolarsError>(())
171/// ```
#[derive(Clone)]
pub struct DataFrame {
    /// Number of rows. Stored separately from the columns, so it is defined even when the
    /// `DataFrame` holds no columns.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between schema and reading.
    cached_schema: OnceLock<SchemaRef>,
}
182
183impl DataFrame {
184    pub fn clear_schema(&mut self) {
185        self.cached_schema = OnceLock::new();
186    }
187
188    #[inline]
189    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
190        self.columns.iter()
191    }
192
193    #[inline]
194    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
195        self.columns.iter().map(Column::as_materialized_series)
196    }
197
198    #[inline]
199    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
200        self.columns.par_iter().map(Column::as_materialized_series)
201    }
202
203    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
204    ///
205    /// # Implementation
206    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
207    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
208    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
209    ///
210    /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
211    /// However, this function will yield a smaller number. This is because this function returns
212    /// the visible size of the buffer, not its total capacity.
213    ///
214    /// FFI buffers are included in this estimation.
215    pub fn estimated_size(&self) -> usize {
216        self.columns.iter().map(Column::estimated_size).sum()
217    }
218
219    // Reduce monomorphization.
220    fn try_apply_columns(
221        &self,
222        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
223    ) -> PolarsResult<Vec<Column>> {
224        self.columns.iter().map(func).collect()
225    }
226    // Reduce monomorphization.
227    pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
228        self.columns.iter().map(func).collect()
229    }
230    // Reduce monomorphization.
231    fn try_apply_columns_par(
232        &self,
233        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
234    ) -> PolarsResult<Vec<Column>> {
235        POOL.install(|| self.columns.par_iter().map(func).collect())
236    }
237    // Reduce monomorphization.
238    pub fn _apply_columns_par(
239        &self,
240        func: &(dyn Fn(&Column) -> Column + Send + Sync),
241    ) -> Vec<Column> {
242        POOL.install(|| self.columns.par_iter().map(func).collect())
243    }
244
245    /// Get the index of the column.
246    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
247        self.get_column_index(name)
248            .ok_or_else(|| polars_err!(col_not_found = name))
249    }
250
251    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
252        polars_ensure!(
253            self.columns.iter().all(|s| s.name().as_str() != name),
254            Duplicate: "column with name {:?} is already present in the DataFrame", name
255        );
256        Ok(())
257    }
258
259    /// Reserve additional slots into the chunks of the series.
260    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
261        for s in &mut self.columns {
262            if let Column::Series(s) = s {
263                // SAFETY:
264                // do not modify the data, simply resize.
265                unsafe { s.chunks_mut().reserve(additional) }
266            }
267        }
268    }
269
270    /// Create a DataFrame from a Vector of Series.
271    ///
272    /// Errors if a column names are not unique, or if heights are not all equal.
273    ///
274    /// # Example
275    ///
276    /// ```
277    /// # use polars_core::prelude::*;
278    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
279    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
280    ///
281    /// let df = DataFrame::new(vec![s0, s1])?;
282    /// # Ok::<(), PolarsError>(())
283    /// ```
284    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
285        DataFrame::validate_columns_slice(&columns)
286            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
287        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
288    }
289
290    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
291        for col in &columns {
292            polars_ensure!(
293                col.len() == height,
294                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
295                columns[0].name(), height, col.name(), col.len()
296            );
297        }
298
299        ensure_names_unique(&columns, |s| s.name().as_str())?;
300
301        Ok(DataFrame {
302            height,
303            columns,
304            cached_schema: OnceLock::new(),
305        })
306    }
307
308    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
309    /// columns to match the other columns.
310    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
311        // The length of the longest non-unit length column determines the
312        // broadcast length. If all columns are unit-length the broadcast length
313        // is one.
314        let broadcast_len = columns
315            .iter()
316            .map(|s| s.len())
317            .filter(|l| *l != 1)
318            .max()
319            .unwrap_or(1);
320        Self::new_with_broadcast_len(columns, broadcast_len)
321    }
322
323    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
324    /// columns to broadcast_len.
325    pub fn new_with_broadcast_len(
326        columns: Vec<Column>,
327        broadcast_len: usize,
328    ) -> PolarsResult<Self> {
329        ensure_names_unique(&columns, |s| s.name().as_str())?;
330        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
331    }
332
333    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
334    /// columns to match the other columns.
335    ///  
336    /// # Safety
337    /// Does not check that the column names are unique (which they must be).
338    pub unsafe fn new_with_broadcast_no_namecheck(
339        mut columns: Vec<Column>,
340        broadcast_len: usize,
341    ) -> PolarsResult<Self> {
342        for col in &mut columns {
343            // Length not equal to the broadcast len, needs broadcast or is an error.
344            let len = col.len();
345            if len != broadcast_len {
346                if len != 1 {
347                    let name = col.name().to_owned();
348                    let extra_info =
349                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
350                            format!(" (matching column '{}')", c.name())
351                        } else {
352                            String::new()
353                        };
354                    polars_bail!(
355                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
356                    );
357                }
358                *col = col.new_from_index(0, broadcast_len);
359            }
360        }
361
362        let length = if columns.is_empty() { 0 } else { broadcast_len };
363
364        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
365    }
366
367    pub fn new_from_index(&self, index: usize, height: usize) -> Self {
368        let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
369        unsafe { Self::new_no_checks(height, cols.collect()) }
370    }
371
372    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
373    ///
374    /// # Example
375    ///
376    /// ```rust
377    /// use polars_core::prelude::DataFrame;
378    /// static EMPTY: DataFrame = DataFrame::empty();
379    /// ```
380    pub const fn empty() -> Self {
381        Self::empty_with_height(0)
382    }
383
384    /// Creates an empty `DataFrame` with a specific `height`.
385    pub const fn empty_with_height(height: usize) -> Self {
386        DataFrame {
387            height,
388            columns: vec![],
389            cached_schema: OnceLock::new(),
390        }
391    }
392
393    /// Create an empty `DataFrame` with empty columns as per the `schema`.
394    pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
395        let mut df = Self::empty_with_schema(&schema);
396        df.cached_schema = OnceLock::from(schema);
397        df
398    }
399
400    /// Create an empty `DataFrame` with empty columns as per the `schema`.
401    pub fn empty_with_schema(schema: &Schema) -> Self {
402        let cols = schema
403            .iter()
404            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
405            .collect();
406        unsafe { DataFrame::new_no_checks(0, cols) }
407    }
408
409    /// Create an empty `DataFrame` with empty columns as per the `schema`.
410    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
411        let cols = schema
412            .iter_values()
413            .map(|fld| {
414                Column::from(Series::new_empty(
415                    fld.name.clone(),
416                    &(DataType::from_arrow_field(fld)),
417                ))
418            })
419            .collect();
420        unsafe { DataFrame::new_no_checks(0, cols) }
421    }
422
423    /// Create a new `DataFrame` with the given schema, only containing nulls.
424    pub fn full_null(schema: &Schema, height: usize) -> Self {
425        let columns = schema
426            .iter_fields()
427            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
428            .collect();
429        unsafe { DataFrame::new_no_checks(height, columns) }
430    }
431
432    /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
433    ///
434    /// # Example
435    ///
436    /// ```rust
437    /// # use polars_core::prelude::*;
438    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
439    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
440    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
441    ///
442    /// assert_eq!(df.pop(), Some(s2));
443    /// assert_eq!(df.pop(), Some(s1));
444    /// assert_eq!(df.pop(), None);
445    /// assert!(df.is_empty());
446    /// # Ok::<(), PolarsError>(())
447    /// ```
448    pub fn pop(&mut self) -> Option<Column> {
449        self.clear_schema();
450
451        self.columns.pop()
452    }
453
454    /// Add a new column at index 0 that counts the rows.
455    ///
456    /// # Example
457    ///
458    /// ```
459    /// # use polars_core::prelude::*;
460    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
461    /// assert_eq!(df1.shape(), (4, 1));
462    ///
463    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
464    /// assert_eq!(df2.shape(), (4, 2));
465    /// println!("{}", df2);
466    ///
467    /// # Ok::<(), PolarsError>(())
468    /// ```
469    ///
470    /// Output:
471    ///
472    /// ```text
473    ///  shape: (4, 2)
474    ///  +-----+----------+
475    ///  | Id  | Name     |
476    ///  | --- | ---      |
477    ///  | u32 | str      |
478    ///  +=====+==========+
479    ///  | 0   | James    |
480    ///  +-----+----------+
481    ///  | 1   | Mary     |
482    ///  +-----+----------+
483    ///  | 2   | John     |
484    ///  +-----+----------+
485    ///  | 3   | Patricia |
486    ///  +-----+----------+
487    /// ```
488    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
489        let mut columns = Vec::with_capacity(self.columns.len() + 1);
490        let offset = offset.unwrap_or(0);
491
492        let col = Column::new_row_index(name, offset, self.height())?;
493        columns.push(col);
494        columns.extend_from_slice(&self.columns);
495        DataFrame::new(columns)
496    }
497
498    /// Add a row index column in place.
499    ///
500    /// # Safety
501    /// The caller should ensure the DataFrame does not already contain a column with the given name.
502    ///
503    /// # Panics
504    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
505    pub unsafe fn with_row_index_mut(
506        &mut self,
507        name: PlSmallStr,
508        offset: Option<IdxSize>,
509    ) -> &mut Self {
510        // TODO: Make this function unsafe
511        debug_assert!(
512            self.columns.iter().all(|c| c.name() != &name),
513            "with_row_index_mut(): column with name {} already exists",
514            &name
515        );
516
517        let offset = offset.unwrap_or(0);
518        let col = Column::new_row_index(name, offset, self.height()).unwrap();
519
520        self.clear_schema();
521        self.columns.insert(0, col);
522        self
523    }
524
525    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
526    /// `Series`.
527    ///
528    /// Calculates the height from the first column or `0` if no columns are given.
529    ///
530    /// # Safety
531    ///
532    /// It is the callers responsibility to uphold the contract of all `Series`
533    /// having an equal length and a unique name, if not this may panic down the line.
534    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
535        let height = columns.first().map_or(0, Column::len);
536        unsafe { Self::new_no_checks(height, columns) }
537    }
538
539    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
540    /// `Series`.
541    ///
542    /// It is advised to use [DataFrame::new] in favor of this method.
543    ///
544    /// # Safety
545    ///
546    /// It is the callers responsibility to uphold the contract of all `Series`
547    /// having an equal length and a unique name, if not this may panic down the line.
548    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
549        if cfg!(debug_assertions) {
550            DataFrame::validate_columns_slice(&columns).unwrap();
551        }
552
553        unsafe { Self::_new_no_checks_impl(height, columns) }
554    }
555
556    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
557    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
558    /// constructed with this method is generally highly unsafe and should not be long-lived.
559    #[allow(clippy::missing_safety_doc)]
560    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
561        DataFrame {
562            height,
563            columns,
564            cached_schema: OnceLock::new(),
565        }
566    }
567
568    /// Shrink the capacity of this DataFrame to fit its length.
569    pub fn shrink_to_fit(&mut self) {
570        // Don't parallelize this. Memory overhead
571        for s in &mut self.columns {
572            s.shrink_to_fit();
573        }
574    }
575
576    /// Aggregate all the chunks in the DataFrame to a single chunk.
577    pub fn as_single_chunk(&mut self) -> &mut Self {
578        // Don't parallelize this. Memory overhead
579        for s in &mut self.columns {
580            *s = s.rechunk();
581        }
582        self
583    }
584
585    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
586    /// This may lead to more peak memory consumption.
587    pub fn as_single_chunk_par(&mut self) -> &mut Self {
588        if self.columns.iter().any(|c| c.n_chunks() > 1) {
589            self.columns = self._apply_columns_par(&|s| s.rechunk());
590        }
591        self
592    }
593
594    /// Rechunks all columns to only have a single chunk.
595    pub fn rechunk_mut(&mut self) {
596        // SAFETY: We never adjust the length or names of the columns.
597        let columns = unsafe { self.get_columns_mut() };
598
599        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
600            *col = col.rechunk();
601        }
602    }
603
604    pub fn _deshare_views_mut(&mut self) {
605        // SAFETY: We never adjust the length or names of the columns.
606        unsafe {
607            let columns = self.get_columns_mut();
608            for col in columns {
609                let Column::Series(s) = col else { continue };
610
611                if let Ok(ca) = s.binary() {
612                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
613                    *col = Column::from(gc_ca.into_series());
614                } else if let Ok(ca) = s.str() {
615                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
616                    *col = Column::from(gc_ca.into_series());
617                }
618            }
619        }
620    }
621
622    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
623    pub fn rechunk_to_record_batch(
624        self,
625        compat_level: CompatLevel,
626    ) -> RecordBatchT<Box<dyn Array>> {
627        let height = self.height();
628
629        let (schema, arrays) = self
630            .columns
631            .into_iter()
632            .map(|col| {
633                let mut series = col.take_materialized_series();
634                // Rechunk to one chunk if necessary
635                if series.n_chunks() > 1 {
636                    series = series.rechunk();
637                }
638                (
639                    series.field().to_arrow(compat_level),
640                    series.to_arrow(0, compat_level),
641                )
642            })
643            .collect();
644
645        RecordBatchT::new(height, Arc::new(schema), arrays)
646    }
647
    /// Returns true if the chunks of the columns do not align and re-chunking should be done.
    ///
    /// Returns `false` for a `DataFrame` without columns.
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        // Only columns for which `as_series()` returns `Some` take part in this comparison.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns at all: nothing to align.
            None => false,
            Some(first_column_chunk_lengths) => {
                // The lower bound of `size_hint` is used as the number of chunks.
                // NOTE(review): assumes `chunk_lengths()` reports an exact size hint — confirm.
                // Fast Path for single Chunk Series
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare the chunk lengths of every remaining column, position by position,
                // against those of the first column.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
688
689    /// Ensure all the chunks in the [`DataFrame`] are aligned.
690    pub fn align_chunks_par(&mut self) -> &mut Self {
691        if self.should_rechunk() {
692            self.as_single_chunk_par()
693        } else {
694            self
695        }
696    }
697
698    pub fn align_chunks(&mut self) -> &mut Self {
699        if self.should_rechunk() {
700            self.as_single_chunk()
701        } else {
702            self
703        }
704    }
705
706    /// Get the [`DataFrame`] schema.
707    ///
708    /// # Example
709    ///
710    /// ```rust
711    /// # use polars_core::prelude::*;
712    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
713    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
714    ///
715    /// let f1: Field = Field::new("Thing".into(), DataType::String);
716    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
717    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
718    ///
719    /// assert_eq!(&**df.schema(), &sc);
720    /// # Ok::<(), PolarsError>(())
721    /// ```
722    pub fn schema(&self) -> &SchemaRef {
723        let out = self.cached_schema.get_or_init(|| {
724            Arc::new(
725                self.columns
726                    .iter()
727                    .map(|x| (x.name().clone(), x.dtype().clone()))
728                    .collect(),
729            )
730        });
731
732        debug_assert_eq!(out.len(), self.width());
733
734        out
735    }
736
737    /// Get a reference to the [`DataFrame`] columns.
738    ///
739    /// # Example
740    ///
741    /// ```rust
742    /// # use polars_core::prelude::*;
743    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
744    ///                         "Symbol" => ["A", "C", "G", "T"])?;
745    /// let columns: &[Column] = df.get_columns();
746    ///
747    /// assert_eq!(columns[0].name(), "Name");
748    /// assert_eq!(columns[1].name(), "Symbol");
749    /// # Ok::<(), PolarsError>(())
750    /// ```
751    #[inline]
752    pub fn get_columns(&self) -> &[Column] {
753        &self.columns
754    }
755
756    #[inline]
757    /// Get mutable access to the underlying columns.
758    ///
759    /// # Safety
760    ///
761    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
762    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
763    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
764    /// calling [`DataFrame::clear_schema`].
765    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
766        &mut self.columns
767    }
768
769    #[inline]
770    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
771    pub fn clear_columns(&mut self) {
772        unsafe { self.get_columns_mut() }.clear();
773        self.clear_schema();
774    }
775
776    #[inline]
777    /// Extend the columns without checking for name collisions or height.
778    ///
779    /// # Safety
780    ///
781    /// The caller needs to ensure that:
782    /// - Column names are unique within the resulting [`DataFrame`].
783    /// - The length of each appended column matches the height of the [`DataFrame`]. For
784    ///   `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
785    ///   with [`DataFrame::set_height`].
786    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
787        unsafe { self.get_columns_mut() }.extend(iter);
788        self.clear_schema();
789    }
790
791    /// Take ownership of the underlying columns vec.
792    pub fn take_columns(self) -> Vec<Column> {
793        self.columns
794    }
795
796    /// Iterator over the columns as [`Series`].
797    ///
798    /// # Example
799    ///
800    /// ```rust
801    /// # use polars_core::prelude::*;
802    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
803    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
804    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
805    ///
806    /// let mut iterator = df.iter();
807    ///
808    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
809    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
810    /// assert_eq!(iterator.next(), None);
811    /// # Ok::<(), PolarsError>(())
812    /// ```
813    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
814        self.materialized_column_iter()
815    }
816
817    /// # Example
818    ///
819    /// ```rust
820    /// # use polars_core::prelude::*;
821    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
822    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
823    ///
824    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
825    /// # Ok::<(), PolarsError>(())
826    /// ```
827    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
828        self.columns.iter().map(|s| s.name()).collect()
829    }
830
831    /// Get the [`Vec<PlSmallStr>`] representing the column names.
832    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
833        self.columns.iter().map(|s| s.name().clone()).collect()
834    }
835
836    pub fn get_column_names_str(&self) -> Vec<&str> {
837        self.columns.iter().map(|s| s.name().as_str()).collect()
838    }
839
840    /// Set the column names.
841    /// # Example
842    ///
843    /// ```rust
844    /// # use polars_core::prelude::*;
845    /// let mut df: DataFrame = df!("Mathematical set" => ["ā„•", "ℤ", "š”»", "ā„š", "ā„", "ā„‚"])?;
846    /// df.set_column_names(["Set"])?;
847    ///
848    /// assert_eq!(df.get_column_names(), &["Set"]);
849    /// # Ok::<(), PolarsError>(())
850    /// ```
851    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
852    where
853        I: IntoIterator<Item = S>,
854        S: Into<PlSmallStr>,
855    {
856        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
857        self._set_column_names_impl(names.as_slice())
858    }
859
860    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
861        polars_ensure!(
862            names.len() == self.width(),
863            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
864            names.len(), self.width()
865        );
866        ensure_names_unique(names, |s| s.as_str())?;
867
868        let columns = mem::take(&mut self.columns);
869        self.columns = columns
870            .into_iter()
871            .zip(names)
872            .map(|(s, name)| {
873                let mut s = s;
874                s.rename(name.clone());
875                s
876            })
877            .collect();
878        self.clear_schema();
879        Ok(())
880    }
881
882    /// Get the data types of the columns in the [`DataFrame`].
883    ///
884    /// # Example
885    ///
886    /// ```rust
887    /// # use polars_core::prelude::*;
888    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
889    ///                                "Fraction" => [0.965, 0.035])?;
890    ///
891    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
892    /// # Ok::<(), PolarsError>(())
893    /// ```
894    pub fn dtypes(&self) -> Vec<DataType> {
895        self.columns.iter().map(|s| s.dtype().clone()).collect()
896    }
897
898    pub(crate) fn first_series_column(&self) -> Option<&Series> {
899        self.columns.iter().find_map(|col| col.as_series())
900    }
901
902    /// The number of chunks for the first column.
903    pub fn first_col_n_chunks(&self) -> usize {
904        match self.first_series_column() {
905            None if self.columns.is_empty() => 0,
906            None => 1,
907            Some(s) => s.n_chunks(),
908        }
909    }
910
911    /// The highest number of chunks for any column.
912    pub fn max_n_chunks(&self) -> usize {
913        self.columns
914            .iter()
915            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
916            .max()
917            .unwrap_or(0)
918    }
919
920    /// Get a reference to the schema fields of the [`DataFrame`].
921    ///
922    /// # Example
923    ///
924    /// ```rust
925    /// # use polars_core::prelude::*;
926    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
927    ///                            "Fraction" => [0.708, 0.292])?;
928    ///
929    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
930    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
931    ///
932    /// assert_eq!(earth.fields(), &[f1, f2]);
933    /// # Ok::<(), PolarsError>(())
934    /// ```
935    pub fn fields(&self) -> Vec<Field> {
936        self.columns
937            .iter()
938            .map(|s| s.field().into_owned())
939            .collect()
940    }
941
942    /// Get (height, width) of the [`DataFrame`].
943    ///
944    /// # Example
945    ///
946    /// ```rust
947    /// # use polars_core::prelude::*;
948    /// let df0: DataFrame = DataFrame::default();
949    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
950    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
951    ///                          "2" => [1, 2, 3, 4, 5])?;
952    ///
953    /// assert_eq!(df0.shape(), (0 ,0));
954    /// assert_eq!(df1.shape(), (5, 1));
955    /// assert_eq!(df2.shape(), (5, 2));
956    /// # Ok::<(), PolarsError>(())
957    /// ```
958    pub fn shape(&self) -> (usize, usize) {
959        (self.height, self.columns.len())
960    }
961
962    /// Get the width of the [`DataFrame`] which is the number of columns.
963    ///
964    /// # Example
965    ///
966    /// ```rust
967    /// # use polars_core::prelude::*;
968    /// let df0: DataFrame = DataFrame::default();
969    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
970    /// let df2: DataFrame = df!("Series 1" => [0; 0],
971    ///                          "Series 2" => [0; 0])?;
972    ///
973    /// assert_eq!(df0.width(), 0);
974    /// assert_eq!(df1.width(), 1);
975    /// assert_eq!(df2.width(), 2);
976    /// # Ok::<(), PolarsError>(())
977    /// ```
    pub fn width(&self) -> usize {
        // The width is simply the number of stored columns.
        self.columns.len()
    }
981
982    /// Get the height of the [`DataFrame`] which is the number of rows.
983    ///
984    /// # Example
985    ///
986    /// ```rust
987    /// # use polars_core::prelude::*;
988    /// let df0: DataFrame = DataFrame::default();
989    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
991    ///
992    /// assert_eq!(df0.height(), 0);
993    /// assert_eq!(df1.height(), 2);
994    /// assert_eq!(df2.height(), 5);
995    /// # Ok::<(), PolarsError>(())
996    /// ```
    pub fn height(&self) -> usize {
        // Read from the stored `height` field, not from any column's length.
        self.height
    }
1000
1001    /// Returns the size as number of rows * number of columns
1002    pub fn size(&self) -> usize {
1003        let s = self.shape();
1004        s.0 * s.1
1005    }
1006
    /// Returns `true` if the [`DataFrame`] contains no rows or no columns.
1008    ///
1009    /// # Example
1010    ///
1011    /// ```rust
1012    /// # use polars_core::prelude::*;
1013    /// let df1: DataFrame = DataFrame::default();
1014    /// assert!(df1.is_empty());
1015    ///
1016    /// let df2: DataFrame = df!("First name" => ["Forever"],
1017    ///                          "Last name" => ["Alone"])?;
1018    /// assert!(!df2.is_empty());
1019    /// # Ok::<(), PolarsError>(())
1020    /// ```
1021    pub fn is_empty(&self) -> bool {
1022        matches!(self.shape(), (0, _) | (_, 0))
1023    }
1024
    /// Set the height (i.e. number of rows) of this [`DataFrame`].
    ///
    /// Only the stored height is overwritten; the columns themselves are not touched.
    ///
    /// # Safety
    ///
    /// `height` needs to be equal to the length of all the columns.
    pub unsafe fn set_height(&mut self, height: usize) {
        self.height = height;
    }
1033
1034    /// Add multiple [`Series`] to a [`DataFrame`].
1035    /// The added `Series` are required to have the same length.
1036    ///
1037    /// # Example
1038    ///
1039    /// ```rust
1040    /// # use polars_core::prelude::*;
1041    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1042    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1043    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1044    ///
1045    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1046    /// assert_eq!(df2.shape(), (3, 3));
1047    /// println!("{}", df2);
1048    /// # Ok::<(), PolarsError>(())
1049    /// ```
1050    ///
1051    /// Output:
1052    ///
1053    /// ```text
1054    /// shape: (3, 3)
1055    /// +---------+--------+----------+
1056    /// | Element | Proton | Electron |
1057    /// | ---     | ---    | ---      |
1058    /// | str     | i32    | i32      |
1059    /// +=========+========+==========+
1060    /// | Copper  | 29     | 29       |
1061    /// +---------+--------+----------+
1062    /// | Silver  | 47     | 47       |
1063    /// +---------+--------+----------+
1064    /// | Gold    | 79     | 79       |
1065    /// +---------+--------+----------+
1066    /// ```
1067    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1068        let mut new_cols = self.columns.clone();
1069        new_cols.extend_from_slice(columns);
1070        DataFrame::new(new_cols)
1071    }
1072
1073    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1074    ///
1075    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1076    ///
1077    /// # Example
1078    ///
1079    /// ```rust
1080    /// # use polars_core::prelude::*;
1081    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1082    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1083    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1084    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1085    ///
1086    /// let df3: DataFrame = df1.vstack(&df2)?;
1087    ///
1088    /// assert_eq!(df3.shape(), (5, 2));
1089    /// println!("{}", df3);
1090    /// # Ok::<(), PolarsError>(())
1091    /// ```
1092    ///
1093    /// Output:
1094    ///
1095    /// ```text
1096    /// shape: (5, 2)
1097    /// +-----------+-------------------+
1098    /// | Element   | Melting Point (K) |
1099    /// | ---       | ---               |
1100    /// | str       | f64               |
1101    /// +===========+===================+
1102    /// | Copper    | 1357.77           |
1103    /// +-----------+-------------------+
1104    /// | Silver    | 1234.93           |
1105    /// +-----------+-------------------+
1106    /// | Gold      | 1337.33           |
1107    /// +-----------+-------------------+
1108    /// | Platinum  | 2041.4            |
1109    /// +-----------+-------------------+
1110    /// | Palladium | 1828.05           |
1111    /// +-----------+-------------------+
1112    /// ```
    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
        // Work on a clone so `self` is left untouched, also when stacking fails.
        let mut df = self.clone();
        df.vstack_mut(other)?;
        Ok(df)
    }
1118
1119    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1120    ///
1121    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1122    ///
1123    /// # Example
1124    ///
1125    /// ```rust
1126    /// # use polars_core::prelude::*;
1127    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1128    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1129    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1130    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1131    ///
1132    /// df1.vstack_mut(&df2)?;
1133    ///
1134    /// assert_eq!(df1.shape(), (5, 2));
1135    /// println!("{}", df1);
1136    /// # Ok::<(), PolarsError>(())
1137    /// ```
1138    ///
1139    /// Output:
1140    ///
1141    /// ```text
1142    /// shape: (5, 2)
1143    /// +-----------+-------------------+
1144    /// | Element   | Melting Point (K) |
1145    /// | ---       | ---               |
1146    /// | str       | f64               |
1147    /// +===========+===================+
1148    /// | Copper    | 1357.77           |
1149    /// +-----------+-------------------+
1150    /// | Silver    | 1234.93           |
1151    /// +-----------+-------------------+
1152    /// | Gold      | 1337.33           |
1153    /// +-----------+-------------------+
1154    /// | Platinum  | 2041.4            |
1155    /// +-----------+-------------------+
1156    /// | Palladium | 1828.05           |
1157    /// +-----------+-------------------+
1158    /// ```
1159    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1160        if self.width() != other.width() {
1161            polars_ensure!(
1162                self.width() == 0,
1163                ShapeMismatch:
1164                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1165                self.width(), other.width(),
1166            );
1167            self.columns.clone_from(&other.columns);
1168            self.height = other.height;
1169            return Ok(self);
1170        }
1171
1172        self.columns
1173            .iter_mut()
1174            .zip(other.columns.iter())
1175            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1176                ensure_can_extend(&*left, right)?;
1177                left.append(right).map_err(|e| {
1178                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1179                })?;
1180                Ok(())
1181            })?;
1182        self.height += other.height;
1183        Ok(self)
1184    }
1185
1186    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1187        if self.width() != other.width() {
1188            polars_ensure!(
1189                self.width() == 0,
1190                ShapeMismatch:
1191                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1192                self.width(), other.width(),
1193            );
1194            self.columns = other.columns;
1195            self.height = other.height;
1196            return Ok(self);
1197        }
1198
1199        self.columns
1200            .iter_mut()
1201            .zip(other.columns.into_iter())
1202            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1203                ensure_can_extend(&*left, &right)?;
1204                let right_name = right.name().clone();
1205                left.append_owned(right).map_err(|e| {
1206                    e.context(format!("failed to vstack column '{right_name}'").into())
1207                })?;
1208                Ok(())
1209            })?;
1210        self.height += other.height;
1211        Ok(self)
1212    }
1213
1214    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1215    ///
1216    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1217    ///
1218    /// # Panics
1219    /// Panics if the schema's don't match.
1220    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1221        self.columns
1222            .iter_mut()
1223            .zip(other.columns.iter())
1224            .for_each(|(left, right)| {
1225                left.append(right)
1226                    .map_err(|e| {
1227                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1228                    })
1229                    .expect("should not fail");
1230            });
1231        self.height += other.height;
1232    }
1233
1234    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1235    ///
1236    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1237    ///
1238    /// # Panics
1239    /// Panics if the schema's don't match.
1240    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1241        self.columns
1242            .iter_mut()
1243            .zip(other.columns)
1244            .for_each(|(left, right)| {
1245                left.append_owned(right).expect("should not fail");
1246            });
1247        self.height += other.height;
1248    }
1249
1250    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1251    ///
1252    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1253    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1254    ///
1255    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1256    /// and thus will yield faster queries.
1257    ///
1258    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1259    /// online operations where you add `n` rows and rerun a query.
1260    ///
1261    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1262    /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
1263    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1264    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1265        polars_ensure!(
1266            self.width() == other.width(),
1267            ShapeMismatch:
1268            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1269            self.width(), other.width(),
1270        );
1271
1272        self.columns
1273            .iter_mut()
1274            .zip(other.columns.iter())
1275            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1276                ensure_can_extend(&*left, right)?;
1277                left.extend(right).map_err(|e| {
1278                    e.context(format!("failed to extend column '{}'", right.name()).into())
1279                })?;
1280                Ok(())
1281            })?;
1282        self.height += other.height;
1283        self.clear_schema();
1284        Ok(())
1285    }
1286
1287    /// Remove a column by name and return the column removed.
1288    ///
1289    /// # Example
1290    ///
1291    /// ```rust
1292    /// # use polars_core::prelude::*;
1293    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1294    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1295    ///
1296    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1297    /// assert!(s1.is_err());
1298    ///
1299    /// let s2: Column = df.drop_in_place("Animal")?;
1300    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1301    /// # Ok::<(), PolarsError>(())
1302    /// ```
1303    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1304        let idx = self.check_name_to_idx(name)?;
1305        self.clear_schema();
1306        Ok(self.columns.remove(idx))
1307    }
1308
1309    /// Return a new [`DataFrame`] where all null values are dropped.
1310    ///
1311    /// # Example
1312    ///
1313    /// ```no_run
1314    /// # use polars_core::prelude::*;
1315    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1316    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1317    /// assert_eq!(df1.shape(), (3, 2));
1318    ///
1319    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1320    /// assert_eq!(df2.shape(), (1, 2));
1321    /// println!("{}", df2);
1322    /// # Ok::<(), PolarsError>(())
1323    /// ```
1324    ///
1325    /// Output:
1326    ///
1327    /// ```text
1328    /// shape: (1, 2)
1329    /// +---------+---------------------+
1330    /// | Country | Tax revenue (% GDP) |
1331    /// | ---     | ---                 |
1332    /// | str     | f64                 |
1333    /// +=========+=====================+
1334    /// | Malta   | 32.7                |
1335    /// +---------+---------------------+
1336    /// ```
1337    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1338    where
1339        for<'a> &'a S: Into<PlSmallStr>,
1340    {
1341        if let Some(v) = subset {
1342            let v = self.select_columns(v)?;
1343            self._drop_nulls_impl(v.as_slice())
1344        } else {
1345            self._drop_nulls_impl(self.columns.as_slice())
1346        }
1347    }
1348
1349    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1350        // fast path for no nulls in df
1351        if subset.iter().all(|s| !s.has_nulls()) {
1352            return Ok(self.clone());
1353        }
1354
1355        let mut iter = subset.iter();
1356
1357        let mask = iter
1358            .next()
1359            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1360        let mut mask = mask.is_not_null();
1361
1362        for c in iter {
1363            mask = mask & c.is_not_null();
1364        }
1365        self.filter(&mask)
1366    }
1367
1368    /// Drop a column by name.
1369    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1370    /// the current one in place.
1371    ///
1372    /// # Example
1373    ///
1374    /// ```rust
1375    /// # use polars_core::prelude::*;
1376    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1377    /// let df2: DataFrame = df1.drop("Ray type")?;
1378    ///
1379    /// assert!(df2.is_empty());
1380    /// # Ok::<(), PolarsError>(())
1381    /// ```
1382    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1383        let idx = self.check_name_to_idx(name)?;
1384        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1385
1386        self.columns.iter().enumerate().for_each(|(i, s)| {
1387            if i != idx {
1388                new_cols.push(s.clone())
1389            }
1390        });
1391
1392        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1393    }
1394
1395    /// Drop columns that are in `names`.
1396    pub fn drop_many<I, S>(&self, names: I) -> Self
1397    where
1398        I: IntoIterator<Item = S>,
1399        S: Into<PlSmallStr>,
1400    {
1401        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1402        self.drop_many_amortized(&names)
1403    }
1404
1405    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1406    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1407        if names.is_empty() {
1408            return self.clone();
1409        }
1410        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1411        self.columns.iter().for_each(|s| {
1412            if !names.contains(s.name()) {
1413                new_cols.push(s.clone())
1414            }
1415        });
1416
1417        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1418    }
1419
1420    /// Insert a new column at a given index without checking for duplicates.
1421    /// This can leave the [`DataFrame`] at an invalid state
1422    fn insert_column_no_name_check(
1423        &mut self,
1424        index: usize,
1425        column: Column,
1426    ) -> PolarsResult<&mut Self> {
1427        polars_ensure!(
1428            self.width() == 0 || column.len() == self.height(),
1429            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1430            column.len(), self.height(),
1431        );
1432
1433        if self.width() == 0 {
1434            self.height = column.len();
1435        }
1436
1437        self.columns.insert(index, column);
1438        self.clear_schema();
1439        Ok(self)
1440    }
1441
    /// Insert a new column at a given index.
    ///
    /// # Errors
    ///
    /// Propagates the error of `check_already_present` for the column's name (presumably
    /// raised when a column with that name already exists), and returns `ShapeMismatch` if
    /// the column's length differs from the height of a [`DataFrame`] that already has columns.
    pub fn insert_column<S: IntoColumn>(
        &mut self,
        index: usize,
        column: S,
    ) -> PolarsResult<&mut Self> {
        let column = column.into_column();
        // Reject the name before anything is modified.
        self.check_already_present(column.name().as_str())?;
        self.insert_column_no_name_check(index, column)
    }
1452
1453    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1454        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1455            self.replace_column(idx, column)?;
1456        } else {
1457            if self.width() == 0 {
1458                self.height = column.len();
1459            }
1460
1461            self.columns.push(column);
1462            self.clear_schema();
1463        }
1464        Ok(())
1465    }
1466
1467    /// Add a new column to this [`DataFrame`] or replace an existing one.
1468    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1469        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1470            let height = df.height();
1471            if column.len() == 1 && height > 1 {
1472                column = column.new_from_index(0, height);
1473            }
1474
1475            if column.len() == height || df.get_columns().is_empty() {
1476                df.add_column_by_search(column)?;
1477                Ok(df)
1478            }
1479            // special case for literals
1480            else if height == 0 && column.len() == 1 {
1481                let s = column.clear();
1482                df.add_column_by_search(s)?;
1483                Ok(df)
1484            } else {
1485                polars_bail!(
1486                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1487                    column.len(), height,
1488                );
1489            }
1490        }
1491        let column = column.into_column();
1492        inner(self, column)
1493    }
1494
    /// Adds a column to the [`DataFrame`] without doing any checks
    /// on length or duplicates.
    ///
    /// In debug builds both conditions are asserted; release builds skip the assertions.
    ///
    /// # Safety
    /// The caller must ensure `self.width() == 0 || column.len() == self.height()`.
    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
        // Debug-only sanity checks: matching length (or empty frame) and a fresh name.
        debug_assert!(self.width() == 0 || self.height() == column.len());
        debug_assert!(self.get_column_index(column.name().as_str()).is_none());

        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
        // properly for `width` == 0.
        if self.width() == 0 {
            unsafe { self.set_height(column.len()) };
        }
        unsafe { self.get_columns_mut() }.push(column);
        // The set of columns changed, so reset the schema.
        self.clear_schema();

        self
    }
1514
1515    // Note: Schema can be both input or output_schema
1516    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1517        let name = c.name();
1518        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1519            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1520                // Given schema is output_schema and we can push.
1521                if idx == self.columns.len() {
1522                    if self.width() == 0 {
1523                        self.height = c.len();
1524                    }
1525
1526                    self.columns.push(c);
1527                    self.clear_schema();
1528                }
1529                // Schema is incorrect fallback to search
1530                else {
1531                    debug_assert!(false);
1532                    self.add_column_by_search(c)?;
1533                }
1534            } else {
1535                self.replace_column(idx, c)?;
1536            }
1537        } else {
1538            if self.width() == 0 {
1539                self.height = c.len();
1540            }
1541
1542            self.columns.push(c);
1543            self.clear_schema();
1544        }
1545
1546        Ok(())
1547    }
1548
1549    // Note: Schema can be both input or output_schema
1550    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1551        for (i, s) in series.into_iter().enumerate() {
1552            // we need to branch here
1553            // because users can add multiple columns with the same name
1554            if i == 0 || schema.get(s.name().as_str()).is_some() {
1555                self.with_column_and_schema(s.into_column(), schema)?;
1556            } else {
1557                self.with_column(s.clone().into_column())?;
1558            }
1559        }
1560        Ok(())
1561    }
1562
1563    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1564        for (i, s) in columns.into_iter().enumerate() {
1565            // we need to branch here
1566            // because users can add multiple columns with the same name
1567            if i == 0 || schema.get(s.name().as_str()).is_some() {
1568                self.with_column_and_schema(s, schema)?;
1569            } else {
1570                self.with_column(s.clone())?;
1571            }
1572        }
1573
1574        Ok(())
1575    }
1576
1577    /// Add a new column to this [`DataFrame`] or replace an existing one.
1578    /// Uses an existing schema to amortize lookups.
1579    /// If the schema is incorrect, we will fallback to linear search.
1580    ///
1581    /// Note: Schema can be both input or output_schema
1582    pub fn with_column_and_schema<C: IntoColumn>(
1583        &mut self,
1584        column: C,
1585        schema: &Schema,
1586    ) -> PolarsResult<&mut Self> {
1587        let mut column = column.into_column();
1588
1589        let height = self.height();
1590        if column.len() == 1 && height > 1 {
1591            column = column.new_from_index(0, height);
1592        }
1593
1594        if column.len() == height || self.columns.is_empty() {
1595            self.add_column_by_schema(column, schema)?;
1596            Ok(self)
1597        }
1598        // special case for literals
1599        else if height == 0 && column.len() == 1 {
1600            let s = column.clear();
1601            self.add_column_by_schema(s, schema)?;
1602            Ok(self)
1603        } else {
1604            polars_bail!(
1605                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1606                column.len(), height,
1607            );
1608        }
1609    }
1610
1611    /// Get a row in the [`DataFrame`]. Beware this is slow.
1612    ///
1613    /// # Example
1614    ///
1615    /// ```
1616    /// # use polars_core::prelude::*;
1617    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1618    ///     df.get(idx)
1619    /// }
1620    /// ```
1621    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1622        match self.columns.first() {
1623            Some(s) => {
1624                if s.len() <= idx {
1625                    return None;
1626                }
1627            },
1628            None => return None,
1629        }
1630        // SAFETY: we just checked bounds
1631        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1632    }
1633
    /// Select a [`Column`] by index.
1635    ///
1636    /// # Example
1637    ///
1638    /// ```rust
1639    /// # use polars_core::prelude::*;
1640    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1641    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1642    ///
1643    /// let s1: Option<&Column> = df.select_at_idx(0);
1644    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1645    ///
1646    /// assert_eq!(s1, Some(&s2));
1647    /// # Ok::<(), PolarsError>(())
1648    /// ```
    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
        // Checked lookup: an out-of-range `idx` yields `None` rather than a panic.
        self.columns.get(idx)
    }
1652
1653    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1654    ///
1655    /// # Examples
1656    ///
1657    /// ```rust
1658    /// # use polars_core::prelude::*;
1659    /// let df = df! {
1660    ///     "0" => [0, 0, 0],
1661    ///     "1" => [1, 1, 1],
1662    ///     "2" => [2, 2, 2]
1663    /// }?;
1664    ///
1665    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1666    /// assert!(df.equals(&df.select_by_range(..)?));
1667    /// # Ok::<(), PolarsError>(())
1668    /// ```
1669    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1670    where
1671        R: ops::RangeBounds<usize>,
1672    {
1673        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1674        // because it is the nightly feature. We should change here if this function were stable.
1675        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1676        where
1677            R: ops::RangeBounds<usize>,
1678        {
1679            let len = bounds.end;
1680
1681            let start: ops::Bound<&usize> = range.start_bound();
1682            let start = match start {
1683                ops::Bound::Included(&start) => start,
1684                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1685                    panic!("attempted to index slice from after maximum usize");
1686                }),
1687                ops::Bound::Unbounded => 0,
1688            };
1689
1690            let end: ops::Bound<&usize> = range.end_bound();
1691            let end = match end {
1692                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1693                    panic!("attempted to index slice up to maximum usize");
1694                }),
1695                ops::Bound::Excluded(&end) => end,
1696                ops::Bound::Unbounded => len,
1697            };
1698
1699            if start > end {
1700                panic!("slice index starts at {start} but ends at {end}");
1701            }
1702            if end > len {
1703                panic!("range end index {end} out of range for slice of length {len}",);
1704            }
1705
1706            ops::Range { start, end }
1707        }
1708
1709        let colnames = self.get_column_names_owned();
1710        let range = get_range(range, ..colnames.len());
1711
1712        self._select_impl(&colnames[range])
1713    }
1714
1715    /// Get column index of a [`Series`] by name.
1716    /// # Example
1717    ///
1718    /// ```rust
1719    /// # use polars_core::prelude::*;
1720    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1721    ///                         "Health" => [100, 200, 500],
1722    ///                         "Mana" => [250, 100, 0],
1723    ///                         "Strength" => [30, 150, 300])?;
1724    ///
1725    /// assert_eq!(df.get_column_index("Name"), Some(0));
1726    /// assert_eq!(df.get_column_index("Health"), Some(1));
1727    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1728    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1729    /// assert_eq!(df.get_column_index("Haste"), None);
1730    /// # Ok::<(), PolarsError>(())
1731    /// ```
1732    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1733        let schema = self.schema();
1734        if let Some(idx) = schema.index_of(name) {
1735            if self
1736                .get_columns()
1737                .get(idx)
1738                .is_some_and(|c| c.name() == name)
1739            {
1740                return Some(idx);
1741            }
1742        }
1743
1744        self.columns.iter().position(|s| s.name().as_str() == name)
1745    }
1746
1747    /// Get column index of a [`Series`] by name.
1748    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1749        self.get_column_index(name)
1750            .ok_or_else(|| polars_err!(col_not_found = name))
1751    }
1752
1753    /// Select a single column by name.
1754    ///
1755    /// # Example
1756    ///
1757    /// ```rust
1758    /// # use polars_core::prelude::*;
1759    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1760    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1761    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1762    ///
1763    /// assert_eq!(df.column("Password")?, &s1);
1764    /// # Ok::<(), PolarsError>(())
1765    /// ```
1766    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1767        let idx = self.try_get_column_index(name)?;
1768        Ok(self.select_at_idx(idx).unwrap())
1769    }
1770
1771    /// Selected multiple columns by name.
1772    ///
1773    /// # Example
1774    ///
1775    /// ```rust
1776    /// # use polars_core::prelude::*;
1777    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1778    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1779    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1780    ///
1781    /// assert_eq!(&df[0], sv[0]);
1782    /// assert_eq!(&df[1], sv[1]);
1783    /// # Ok::<(), PolarsError>(())
1784    /// ```
1785    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1786    where
1787        I: IntoIterator<Item = S>,
1788        S: AsRef<str>,
1789    {
1790        names
1791            .into_iter()
1792            .map(|name| self.column(name.as_ref()))
1793            .collect()
1794    }
1795
1796    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1797    ///
1798    /// # Examples
1799    ///
1800    /// ```
1801    /// # use polars_core::prelude::*;
1802    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1803    ///     df.select(["foo", "bar"])
1804    /// }
1805    /// ```
1806    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1807    where
1808        I: IntoIterator<Item = S>,
1809        S: Into<PlSmallStr>,
1810    {
1811        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1812        self._select_impl(cols.as_slice())
1813    }
1814
1815    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1816        ensure_names_unique(cols, |s| s.as_str())?;
1817        self._select_impl_unchecked(cols)
1818    }
1819
1820    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1821        let selected = self.select_columns_impl(cols)?;
1822        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1823    }
1824
1825    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1826    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1827    where
1828        I: IntoIterator<Item = S>,
1829        S: Into<PlSmallStr>,
1830    {
1831        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1832        self._select_with_schema_impl(&cols, schema, true)
1833    }
1834
1835    /// Select with a known schema without checking for duplicates in `selection`.
1836    /// The schema names must match the column names of this DataFrame.
1837    pub fn select_with_schema_unchecked<I, S>(
1838        &self,
1839        selection: I,
1840        schema: &Schema,
1841    ) -> PolarsResult<Self>
1842    where
1843        I: IntoIterator<Item = S>,
1844        S: Into<PlSmallStr>,
1845    {
1846        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1847        self._select_with_schema_impl(&cols, schema, false)
1848    }
1849
1850    /// * The schema names must match the column names of this DataFrame.
1851    pub fn _select_with_schema_impl(
1852        &self,
1853        cols: &[PlSmallStr],
1854        schema: &Schema,
1855        check_duplicates: bool,
1856    ) -> PolarsResult<Self> {
1857        if check_duplicates {
1858            ensure_names_unique(cols, |s| s.as_str())?;
1859        }
1860
1861        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1862        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1863    }
1864
1865    /// A non generic implementation to reduce compiler bloat.
1866    fn select_columns_impl_with_schema(
1867        &self,
1868        cols: &[PlSmallStr],
1869        schema: &Schema,
1870    ) -> PolarsResult<Vec<Column>> {
1871        if cfg!(debug_assertions) {
1872            ensure_matching_schema_names(schema, self.schema())?;
1873        }
1874
1875        cols.iter()
1876            .map(|name| {
1877                let index = schema.try_get_full(name.as_str())?.0;
1878                Ok(self.columns[index].clone())
1879            })
1880            .collect()
1881    }
1882
1883    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1884    where
1885        I: IntoIterator<Item = S>,
1886        S: Into<PlSmallStr>,
1887    {
1888        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1889        self.select_physical_impl(&cols)
1890    }
1891
1892    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1893        ensure_names_unique(cols, |s| s.as_str())?;
1894        let selected = self.select_columns_physical_impl(cols)?;
1895        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1896    }
1897
1898    pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1899        let from = self.schema();
1900        let columns = to
1901            .iter_names()
1902            .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1903            .collect::<PolarsResult<Vec<_>>>()?;
1904        let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1905        df.cached_schema = to.into();
1906        Ok(df)
1907    }
1908
1909    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1910    ///
1911    /// # Example
1912    ///
1913    /// ```rust
1914    /// # use polars_core::prelude::*;
1915    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1916    ///                         "Carbon" => [1, 2, 3],
1917    ///                         "Hydrogen" => [4, 6, 8])?;
1918    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1919    ///
1920    /// assert_eq!(df["Carbon"], sv[0]);
1921    /// assert_eq!(df["Hydrogen"], sv[1]);
1922    /// # Ok::<(), PolarsError>(())
1923    /// ```
1924    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1925        let cols = selection.into_vec();
1926        self.select_columns_impl(&cols)
1927    }
1928
1929    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1930        self.columns
1931            .iter()
1932            .enumerate()
1933            .map(|(i, s)| (s.name().as_str(), i))
1934            .collect()
1935    }
1936
1937    /// A non generic implementation to reduce compiler bloat.
1938    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1939        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1940            let name_to_idx = self._names_to_idx_map();
1941            cols.iter()
1942                .map(|name| {
1943                    let idx = *name_to_idx
1944                        .get(name.as_str())
1945                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1946                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1947                })
1948                .collect::<PolarsResult<Vec<_>>>()?
1949        } else {
1950            cols.iter()
1951                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1952                .collect::<PolarsResult<Vec<_>>>()?
1953        };
1954
1955        Ok(selected)
1956    }
1957
1958    /// A non generic implementation to reduce compiler bloat.
1959    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1960        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1961            // we hash, because there are user that having millions of columns.
1962            // # https://github.com/pola-rs/polars/issues/1023
1963            let name_to_idx = self._names_to_idx_map();
1964
1965            cols.iter()
1966                .map(|name| {
1967                    let idx = *name_to_idx
1968                        .get(name.as_str())
1969                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1970                    Ok(self.select_at_idx(idx).unwrap().clone())
1971                })
1972                .collect::<PolarsResult<Vec<_>>>()?
1973        } else {
1974            cols.iter()
1975                .map(|c| self.column(c.as_str()).cloned())
1976                .collect::<PolarsResult<Vec<_>>>()?
1977        };
1978
1979        Ok(selected)
1980    }
1981
1982    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1983        // If there is a filtered column just see how many columns there are left.
1984        if let Some(fst) = filtered.first() {
1985            return fst.len();
1986        }
1987
1988        // Otherwise, count the number of values that would be filtered and return that height.
1989        let num_trues = mask.num_trues();
1990        if mask.len() == self.height() {
1991            num_trues
1992        } else {
1993            // This is for broadcasting masks
1994            debug_assert!(num_trues == 0 || num_trues == 1);
1995            self.height() * num_trues
1996        }
1997    }
1998
1999    /// Take the [`DataFrame`] rows by a boolean mask.
2000    ///
2001    /// # Example
2002    ///
2003    /// ```
2004    /// # use polars_core::prelude::*;
2005    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2006    ///     let mask = df.column("sepal_width")?.is_not_null();
2007    ///     df.filter(&mask)
2008    /// }
2009    /// ```
2010    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2011        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2012        let height = self.filter_height(&new_col, mask);
2013
2014        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2015    }
2016
2017    /// Same as `filter` but does not parallelize.
2018    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2019        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2020        let height = self.filter_height(&new_col, mask);
2021
2022        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2023    }
2024
2025    /// Take [`DataFrame`] rows by index values.
2026    ///
2027    /// # Example
2028    ///
2029    /// ```
2030    /// # use polars_core::prelude::*;
2031    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2032    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2033    ///     df.take(&idx)
2034    /// }
2035    /// ```
2036    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2037        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2038
2039        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2040    }
2041
2042    /// # Safety
2043    /// The indices must be in-bounds.
2044    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2045        self.take_unchecked_impl(idx, true)
2046    }
2047
2048    /// # Safety
2049    /// The indices must be in-bounds.
2050    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2051        let cols = if allow_threads && POOL.current_num_threads() > 1 {
2052            POOL.install(|| {
2053                if POOL.current_num_threads() > self.width() {
2054                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2055                    self._apply_columns_par(&|c| {
2056                        (0..idx.len().div_ceil(stride))
2057                            .into_par_iter()
2058                            .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
2059                            .reduce(
2060                                || Column::new_empty(c.name().clone(), c.dtype()),
2061                                |mut a, b| {
2062                                    a.append_owned(b).unwrap();
2063                                    a
2064                                },
2065                            )
2066                    })
2067                } else {
2068                    self._apply_columns_par(&|c| c.take_unchecked(idx))
2069                }
2070            })
2071        } else {
2072            self._apply_columns(&|s| s.take_unchecked(idx))
2073        };
2074        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2075    }
2076
2077    /// # Safety
2078    /// The indices must be in-bounds.
2079    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2080        self.take_slice_unchecked_impl(idx, true)
2081    }
2082
2083    /// # Safety
2084    /// The indices must be in-bounds.
2085    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2086        let cols = if allow_threads && POOL.current_num_threads() > 1 {
2087            POOL.install(|| {
2088                if POOL.current_num_threads() > self.width() {
2089                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2090                    self._apply_columns_par(&|c| {
2091                        (0..idx.len().div_ceil(stride))
2092                            .into_par_iter()
2093                            .map(|i| {
2094                                let idx = &idx[i * stride..];
2095                                let idx = &idx[..idx.len().min(stride)];
2096                                c.take_slice_unchecked(idx)
2097                            })
2098                            .reduce(
2099                                || Column::new_empty(c.name().clone(), c.dtype()),
2100                                |mut a, b| {
2101                                    a.append_owned(b).unwrap();
2102                                    a
2103                                },
2104                            )
2105                    })
2106                } else {
2107                    self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2108                }
2109            })
2110        } else {
2111            self._apply_columns(&|s| s.take_slice_unchecked(idx))
2112        };
2113        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2114    }
2115
2116    /// Rename a column in the [`DataFrame`].
2117    ///
2118    /// Should not be called in a loop as that can lead to quadratic behavior.
2119    ///
2120    /// # Example
2121    ///
2122    /// ```
2123    /// # use polars_core::prelude::*;
2124    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2125    ///     let original_name = "foo";
2126    ///     let new_name = "bar";
2127    ///     df.rename(original_name, new_name.into())
2128    /// }
2129    /// ```
2130    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2131        if column == name.as_str() {
2132            return Ok(self);
2133        }
2134        polars_ensure!(
2135            !self.schema().contains(&name),
2136            Duplicate: "column rename attempted with already existing name \"{name}\""
2137        );
2138
2139        self.get_column_index(column)
2140            .and_then(|idx| self.columns.get_mut(idx))
2141            .ok_or_else(|| polars_err!(col_not_found = column))
2142            .map(|c| c.rename(name))?;
2143        self.clear_schema();
2144
2145        Ok(self)
2146    }
2147
2148    pub fn rename_many<'a>(
2149        &mut self,
2150        renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
2151    ) -> PolarsResult<&mut Self> {
2152        let mut schema = self.schema().as_ref().clone();
2153        self.clear_schema();
2154
2155        for (from, to) in renames {
2156            if from == to.as_str() {
2157                continue;
2158            }
2159
2160            polars_ensure!(
2161                !schema.contains(&to),
2162                Duplicate: "column rename attempted with already existing name \"{to}\""
2163            );
2164
2165            match schema.get_full(from) {
2166                None => polars_bail!(col_not_found = from),
2167                Some((idx, _, _)) => {
2168                    let (n, _) = schema.get_at_index_mut(idx).unwrap();
2169                    *n = to.clone();
2170                    self.columns.get_mut(idx).unwrap().rename(to);
2171                },
2172            }
2173        }
2174
2175        self.cached_schema = OnceLock::from(Arc::new(schema));
2176        Ok(self)
2177    }
2178
2179    /// Sort [`DataFrame`] in place.
2180    ///
2181    /// See [`DataFrame::sort`] for more instruction.
2182    pub fn sort_in_place(
2183        &mut self,
2184        by: impl IntoVec<PlSmallStr>,
2185        sort_options: SortMultipleOptions,
2186    ) -> PolarsResult<&mut Self> {
2187        let by_column = self.select_columns(by)?;
2188        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2189        Ok(self)
2190    }
2191
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// * `by_column` - the already evaluated sort keys, most significant first. They do not
    ///   have to be columns of this frame.
    /// * `sort_options` - per-key `descending` / `nulls_last` settings plus the
    ///   `multithreaded`, `maintain_order` and `limit` options.
    /// * `slice` - optional `(offset, len)` window applied to the sorted result.
    ///
    /// Returns the sorted (and optionally sliced) frame. Where the first sort key is a column
    /// of the frame, that column is flagged as sorted in the returned frame on the paths that
    /// call `set_sorted` below.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        mut sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // Note that the `by_column` argument may also contain evaluated expressions from
        // polars-lazy that are not even present in this dataframe. Therefore, when we try
        // to set the first column as sorted, we ignore the error raised for expressions
        // that are not present (they are renamed to _POLARS_SORT_COLUMN_i).
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe.
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };
        // An empty frame is trivially sorted: only the sorted flag needs to be set.
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A slice that starts at offset 0 and is shorter than the frame is delegated to
        // `bottom_k_impl`.
        if let Some((0, k)) = slice {
            if k < self.len() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early.
        // We can do so when there is only one column to sort by; for multiple columns
        // it would be complicated to do so.
        // Categorical and Enum keys are excluded from this shortcut.
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If the null count is 0 then `nulls_last` doesn't matter.
            // It is safe to get the value at the last position since the dataframe is not
            // empty (taken care of above).
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.as_single_chunk_par();
        // Compute the row indices that put the frame into sorted order.
        let mut take = match (by_column.len(), has_nested) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // Fast path for a frame with a single series:
                // no need to compute the sort indices and then take by these indices,
                // simply sort and return as frame.
                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            _ => {
                // The row-format based arg-sort is used when every key has `nulls_last`
                // set, when any key is nested, or when the `POLARS_ROW_FMT_SORT`
                // environment variable is set; otherwise `arg_sort_multiple` is used.
                if sort_options.nulls_last.iter().all(|&x| x)
                    || has_nested
                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
                {
                    argsort_multiple_row_fmt(
                        &by_column,
                        sort_options.descending,
                        sort_options.nulls_last,
                        sort_options.multithreaded,
                    )?
                } else {
                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
                    first
                        .as_materialized_series()
                        .arg_sort_multiple(&other, &sort_options)?
                }
            },
        };

        // Slicing the indices first means only the requested rows are gathered.
        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
        set_sorted(&mut df);
        Ok(df)
    }
2331
2332    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2333    ///
2334    /// This dataframe does not necessarily have a specified schema and may be changed at any
2335    /// point. It is primarily used for debugging.
2336    pub fn _to_metadata(&self) -> DataFrame {
2337        let num_columns = self.columns.len();
2338
2339        let mut column_names =
2340            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2341        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2342        let mut sorted_asc_ca =
2343            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2344        let mut sorted_dsc_ca =
2345            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2346        let mut fast_explode_list_ca =
2347            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2348        let mut materialized_at_ca =
2349            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2350
2351        for col in &self.columns {
2352            let flags = col.get_flags();
2353
2354            let (repr, materialized_at) = match col {
2355                Column::Series(s) => ("series", s.materialized_at()),
2356                Column::Scalar(_) => ("scalar", None),
2357            };
2358            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2359            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2360            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2361
2362            column_names.append_value(col.name().clone());
2363            repr_ca.append_value(repr);
2364            sorted_asc_ca.append_value(sorted_asc);
2365            sorted_dsc_ca.append_value(sorted_dsc);
2366            fast_explode_list_ca.append_value(fast_explode_list);
2367            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2368        }
2369
2370        unsafe {
2371            DataFrame::new_no_checks(
2372                self.width(),
2373                vec![
2374                    column_names.finish().into_column(),
2375                    repr_ca.finish().into_column(),
2376                    sorted_asc_ca.finish().into_column(),
2377                    sorted_dsc_ca.finish().into_column(),
2378                    fast_explode_list_ca.finish().into_column(),
2379                    materialized_at_ca.finish().into_column(),
2380                ],
2381            )
2382        }
2383    }
2384
2385    /// Return a sorted clone of this [`DataFrame`].
2386    ///
2387    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2388    /// # Example
2389    ///
2390    /// Sort by a single column with default options:
2391    /// ```
2392    /// # use polars_core::prelude::*;
2393    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2394    ///     df.sort(["sepal_width"], Default::default())
2395    /// }
2396    /// ```
2397    /// Sort by a single column with specific order:
2398    /// ```
2399    /// # use polars_core::prelude::*;
2400    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2401    ///     df.sort(
2402    ///         ["sepal_width"],
2403    ///         SortMultipleOptions::new()
2404    ///             .with_order_descending(descending)
2405    ///     )
2406    /// }
2407    /// ```
2408    /// Sort by multiple columns with specifying order for each column:
2409    /// ```
2410    /// # use polars_core::prelude::*;
2411    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2412    ///     df.sort(
2413    ///         ["sepal_width", "sepal_length"],
2414    ///         SortMultipleOptions::new()
2415    ///             .with_order_descending_multi([false, true])
2416    ///     )
2417    /// }
2418    /// ```
2419    /// See [`SortMultipleOptions`] for more options.
2420    ///
2421    /// Also see [`DataFrame::sort_in_place`].
2422    pub fn sort(
2423        &self,
2424        by: impl IntoVec<PlSmallStr>,
2425        sort_options: SortMultipleOptions,
2426    ) -> PolarsResult<Self> {
2427        let mut df = self.clone();
2428        df.sort_in_place(by, sort_options)?;
2429        Ok(df)
2430    }
2431
2432    /// Replace a column with a [`Series`].
2433    ///
2434    /// # Example
2435    ///
2436    /// ```rust
2437    /// # use polars_core::prelude::*;
2438    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2439    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
2440    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2441    ///
2442    /// assert!(df.replace("Nation", s.clone()).is_err());
2443    /// assert!(df.replace("Country", s).is_ok());
2444    /// # Ok::<(), PolarsError>(())
2445    /// ```
2446    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2447        self.apply(column, |_| new_col.into_series())
2448    }
2449
2450    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2451    /// is that now the value of `column: &str` determines the name of the column and not the name
2452    /// of the `Series` passed to this method.
2453    pub fn replace_or_add<S: IntoSeries>(
2454        &mut self,
2455        column: PlSmallStr,
2456        new_col: S,
2457    ) -> PolarsResult<&mut Self> {
2458        let mut new_col = new_col.into_series();
2459        new_col.rename(column);
2460        self.with_column(new_col)
2461    }
2462
2463    /// Replace column at index `idx` with a [`Series`].
2464    ///
2465    /// # Example
2466    ///
2467    /// ```ignored
2468    /// # use polars_core::prelude::*;
2469    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2470    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2471    /// let mut df = DataFrame::new(vec![s0, s1])?;
2472    ///
2473    /// // Add 32 to get lowercase ascii values
2474    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2475    /// # Ok::<(), PolarsError>(())
2476    /// ```
2477    pub fn replace_column<C: IntoColumn>(
2478        &mut self,
2479        index: usize,
2480        new_column: C,
2481    ) -> PolarsResult<&mut Self> {
2482        polars_ensure!(
2483            index < self.width(),
2484            ShapeMismatch:
2485            "unable to replace at index {}, the DataFrame has only {} columns",
2486            index, self.width(),
2487        );
2488        let mut new_column = new_column.into_column();
2489        polars_ensure!(
2490            new_column.len() == self.height(),
2491            ShapeMismatch:
2492            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2493            new_column.len(), self.height(),
2494        );
2495        let old_col = &mut self.columns[index];
2496        mem::swap(old_col, &mut new_column);
2497        self.clear_schema();
2498        Ok(self)
2499    }
2500
2501    /// Apply a closure to a column. This is the recommended way to do in place modification.
2502    ///
2503    /// # Example
2504    ///
2505    /// ```rust
2506    /// # use polars_core::prelude::*;
2507    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2508    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2509    /// let mut df = DataFrame::new(vec![s0, s1])?;
2510    ///
2511    /// fn str_to_len(str_val: &Column) -> Column {
2512    ///     str_val.str()
2513    ///         .unwrap()
2514    ///         .into_iter()
2515    ///         .map(|opt_name: Option<&str>| {
2516    ///             opt_name.map(|name: &str| name.len() as u32)
2517    ///          })
2518    ///         .collect::<UInt32Chunked>()
2519    ///         .into_column()
2520    /// }
2521    ///
2522    /// // Replace the names column by the length of the names.
2523    /// df.apply("names", str_to_len);
2524    /// # Ok::<(), PolarsError>(())
2525    /// ```
2526    /// Results in:
2527    ///
2528    /// ```text
2529    /// +--------+-------+
    /// | foo    | names |
    /// | ---    | ---   |
    /// | str    | u32   |
2533    /// +========+=======+
2534    /// | "ham"  | 4     |
2535    /// +--------+-------+
2536    /// | "spam" | 6     |
2537    /// +--------+-------+
2538    /// | "egg"  | 3     |
2539    /// +--------+-------+
2540    /// ```
2541    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2542    where
2543        F: FnOnce(&Column) -> C,
2544        C: IntoColumn,
2545    {
2546        let idx = self.check_name_to_idx(name)?;
2547        self.apply_at_idx(idx, f)?;
2548        Ok(self)
2549    }
2550
2551    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2552    /// modification.
2553    ///
2554    /// # Example
2555    ///
2556    /// ```rust
2557    /// # use polars_core::prelude::*;
2558    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2559    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2560    /// let mut df = DataFrame::new(vec![s0, s1])?;
2561    ///
2562    /// // Add 32 to get lowercase ascii values
2563    /// df.apply_at_idx(1, |s| s + 32);
2564    /// # Ok::<(), PolarsError>(())
2565    /// ```
2566    /// Results in:
2567    ///
2568    /// ```text
2569    /// +--------+-------+
2570    /// | foo    | ascii |
2571    /// | ---    | ---   |
2572    /// | str    | i32   |
2573    /// +========+=======+
2574    /// | "ham"  | 102   |
2575    /// +--------+-------+
2576    /// | "spam" | 111   |
2577    /// +--------+-------+
2578    /// | "egg"  | 111   |
2579    /// +--------+-------+
2580    /// ```
2581    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2582    where
2583        F: FnOnce(&Column) -> C,
2584        C: IntoColumn,
2585    {
2586        let df_height = self.height();
2587        let width = self.width();
2588        let col = self.columns.get_mut(idx).ok_or_else(|| {
2589            polars_err!(
2590                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2591                idx, width
2592            )
2593        })?;
2594        let name = col.name().clone();
2595        let dtype_before = col.dtype().clone();
2596        let new_col = f(col).into_column();
2597        match new_col.len() {
2598            1 => {
2599                let new_col = new_col.new_from_index(0, df_height);
2600                let _ = mem::replace(col, new_col);
2601            },
2602            len if (len == df_height) => {
2603                let _ = mem::replace(col, new_col);
2604            },
2605            len => polars_bail!(
2606                ShapeMismatch:
2607                "resulting Series has length {} while the DataFrame has height {}",
2608                len, df_height
2609            ),
2610        }
2611
2612        // make sure the name remains the same after applying the closure
2613        unsafe {
2614            let col = self.columns.get_unchecked_mut(idx);
2615            col.rename(name);
2616
2617            if col.dtype() != &dtype_before {
2618                self.clear_schema();
2619            }
2620        }
2621        Ok(self)
2622    }
2623
2624    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2625    /// modification.
2626    ///
2627    /// # Example
2628    ///
2629    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
2630    ///
2631    /// ```rust
2632    /// # use polars_core::prelude::*;
2633    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2634    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2635    /// let mut df = DataFrame::new(vec![s0, s1])?;
2636    ///
2637    /// let idx = vec![0, 1, 4];
2638    ///
2639    /// df.try_apply("foo", |c| {
2640    ///     c.str()?
2641    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2642    /// });
2643    /// # Ok::<(), PolarsError>(())
2644    /// ```
2645    /// Results in:
2646    ///
2647    /// ```text
2648    /// +---------------------+--------+
2649    /// | foo                 | values |
2650    /// | ---                 | ---    |
2651    /// | str                 | i32    |
2652    /// +=====================+========+
2653    /// | "ham-is-modified"   | 1      |
2654    /// +---------------------+--------+
2655    /// | "spam-is-modified"  | 2      |
2656    /// +---------------------+--------+
2657    /// | "egg"               | 3      |
2658    /// +---------------------+--------+
2659    /// | "bacon"             | 4      |
2660    /// +---------------------+--------+
2661    /// | "quack-is-modified" | 5      |
2662    /// +---------------------+--------+
2663    /// ```
2664    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2665    where
2666        F: FnOnce(&Column) -> PolarsResult<C>,
2667        C: IntoColumn,
2668    {
2669        let width = self.width();
2670        let col = self.columns.get_mut(idx).ok_or_else(|| {
2671            polars_err!(
2672                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2673                idx, width
2674            )
2675        })?;
2676        let name = col.name().clone();
2677
2678        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2679
2680        // make sure the name remains the same after applying the closure
2681        unsafe {
2682            let col = self.columns.get_unchecked_mut(idx);
2683            col.rename(name);
2684        }
2685        Ok(self)
2686    }
2687
2688    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2689    /// modification.
2690    ///
2691    /// # Example
2692    ///
    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2694    ///
2695    /// ```rust
2696    /// # use polars_core::prelude::*;
2697    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2698    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2699    /// let mut df = DataFrame::new(vec![s0, s1])?;
2700    ///
2701    /// // create a mask
2702    /// let values = df.column("values")?.as_materialized_series();
2703    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2704    ///
2705    /// df.try_apply("foo", |c| {
2706    ///     c.str()?
2707    ///     .set(&mask, Some("not_within_bounds"))
2708    /// });
2709    /// # Ok::<(), PolarsError>(())
2710    /// ```
2711    /// Results in:
2712    ///
2713    /// ```text
2714    /// +---------------------+--------+
2715    /// | foo                 | values |
2716    /// | ---                 | ---    |
2717    /// | str                 | i32    |
2718    /// +=====================+========+
2719    /// | "not_within_bounds" | 1      |
2720    /// +---------------------+--------+
2721    /// | "spam"              | 2      |
2722    /// +---------------------+--------+
2723    /// | "egg"               | 3      |
2724    /// +---------------------+--------+
2725    /// | "bacon"             | 4      |
2726    /// +---------------------+--------+
2727    /// | "not_within_bounds" | 5      |
2728    /// +---------------------+--------+
2729    /// ```
2730    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2731    where
2732        F: FnOnce(&Series) -> PolarsResult<C>,
2733        C: IntoColumn,
2734    {
2735        let idx = self.try_get_column_index(column)?;
2736        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2737    }
2738
2739    /// Slice the [`DataFrame`] along the rows.
2740    ///
2741    /// # Example
2742    ///
2743    /// ```rust
2744    /// # use polars_core::prelude::*;
2745    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2746    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2747    /// let sl: DataFrame = df.slice(2, 3);
2748    ///
2749    /// assert_eq!(sl.shape(), (3, 2));
2750    /// println!("{}", sl);
2751    /// # Ok::<(), PolarsError>(())
2752    /// ```
2753    /// Output:
2754    /// ```text
2755    /// shape: (3, 2)
2756    /// +-------+-------+
2757    /// | Fruit | Color |
2758    /// | ---   | ---   |
2759    /// | str   | str   |
2760    /// +=======+=======+
2761    /// | Grape | White |
2762    /// +-------+-------+
2763    /// | Fig   | White |
2764    /// +-------+-------+
2765    /// | Fig   | Red   |
2766    /// +-------+-------+
2767    /// ```
2768    #[must_use]
2769    pub fn slice(&self, offset: i64, length: usize) -> Self {
2770        if offset == 0 && length == self.height() {
2771            return self.clone();
2772        }
2773        if length == 0 {
2774            return self.clear();
2775        }
2776        let col = self
2777            .columns
2778            .iter()
2779            .map(|s| s.slice(offset, length))
2780            .collect::<Vec<_>>();
2781
2782        let height = if let Some(fst) = col.first() {
2783            fst.len()
2784        } else {
2785            let (_, length) = slice_offsets(offset, length, self.height());
2786            length
2787        };
2788
2789        unsafe { DataFrame::new_no_checks(height, col) }
2790    }
2791
2792    /// Split [`DataFrame`] at the given `offset`.
2793    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2794        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2795
2796        let (idx, _) = slice_offsets(offset, 0, self.height());
2797
2798        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2799        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2800        (a, b)
2801    }
2802
2803    #[must_use]
2804    pub fn clear(&self) -> Self {
2805        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2806        unsafe { DataFrame::new_no_checks(0, col) }
2807    }
2808
2809    #[must_use]
2810    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2811        if offset == 0 && length == self.height() {
2812            return self.clone();
2813        }
2814        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2815        unsafe { DataFrame::new_no_checks(length, columns) }
2816    }
2817
2818    #[must_use]
2819    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2820        if offset == 0 && length == self.height() {
2821            return self.clone();
2822        }
2823        // @scalar-opt
2824        let columns = self._apply_columns(&|s| {
2825            let mut out = s.slice(offset, length);
2826            out.shrink_to_fit();
2827            out
2828        });
2829        unsafe { DataFrame::new_no_checks(length, columns) }
2830    }
2831
2832    /// Get the head of the [`DataFrame`].
2833    ///
2834    /// # Example
2835    ///
2836    /// ```rust
2837    /// # use polars_core::prelude::*;
2838    /// let countries: DataFrame =
2839    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2840    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2841    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2842    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2843    /// assert_eq!(countries.shape(), (5, 4));
2844    ///
2845    /// println!("{}", countries.head(Some(3)));
2846    /// # Ok::<(), PolarsError>(())
2847    /// ```
2848    ///
2849    /// Output:
2850    ///
2851    /// ```text
2852    /// shape: (3, 4)
2853    /// +--------------------+---------------+---------------+------------+
2854    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2855    /// | ---                | ---           | ---           | ---        |
2856    /// | i32                | str           | str           | str        |
2857    /// +====================+===============+===============+============+
2858    /// | 1                  | North America | United States | Washington |
2859    /// +--------------------+---------------+---------------+------------+
2860    /// | 2                  | Asia          | China         | Beijing    |
2861    /// +--------------------+---------------+---------------+------------+
2862    /// | 3                  | Asia          | Japan         | Tokyo      |
2863    /// +--------------------+---------------+---------------+------------+
2864    /// ```
2865    #[must_use]
2866    pub fn head(&self, length: Option<usize>) -> Self {
2867        let col = self
2868            .columns
2869            .iter()
2870            .map(|c| c.head(length))
2871            .collect::<Vec<_>>();
2872
2873        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2874        let height = usize::min(height, self.height());
2875        unsafe { DataFrame::new_no_checks(height, col) }
2876    }
2877
2878    /// Get the tail of the [`DataFrame`].
2879    ///
2880    /// # Example
2881    ///
2882    /// ```rust
2883    /// # use polars_core::prelude::*;
2884    /// let countries: DataFrame =
2885    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2886    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2887    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2888    /// assert_eq!(countries.shape(), (5, 3));
2889    ///
2890    /// println!("{}", countries.tail(Some(2)));
2891    /// # Ok::<(), PolarsError>(())
2892    /// ```
2893    ///
2894    /// Output:
2895    ///
2896    /// ```text
2897    /// shape: (2, 3)
2898    /// +-------------+--------------------+---------+
2899    /// | Rank (2021) | Apple Price (€/kg) | Country |
2900    /// | ---         | ---                | ---     |
2901    /// | i32         | f64                | str     |
2902    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
2906    /// +-------------+--------------------+---------+
2907    /// ```
2908    #[must_use]
2909    pub fn tail(&self, length: Option<usize>) -> Self {
2910        let col = self
2911            .columns
2912            .iter()
2913            .map(|c| c.tail(length))
2914            .collect::<Vec<_>>();
2915
2916        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2917        let height = usize::min(height, self.height());
2918        unsafe { DataFrame::new_no_checks(height, col) }
2919    }
2920
2921    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2922    ///
2923    /// # Panics
2924    ///
2925    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2926    ///
2927    /// This responsibility is left to the caller as we don't want to take mutable references here,
2928    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2929    /// as well.
2930    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2931        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2932        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2933        // as we must allocate arrow strings/binaries.
2934        let must_convert = compat_level.0 == 0;
2935        let parallel = parallel
2936            && must_convert
2937            && self.columns.len() > 1
2938            && self
2939                .columns
2940                .iter()
2941                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2942
2943        RecordBatchIter {
2944            columns: &self.columns,
2945            schema: Arc::new(
2946                self.columns
2947                    .iter()
2948                    .map(|c| c.field().to_arrow(compat_level))
2949                    .collect(),
2950            ),
2951            idx: 0,
2952            n_chunks: self.first_col_n_chunks(),
2953            compat_level,
2954            parallel,
2955        }
2956    }
2957
2958    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2959    ///
2960    /// # Panics
2961    ///
2962    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2963    ///
2964    /// This responsibility is left to the caller as we don't want to take mutable references here,
2965    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2966    /// as well.
2967    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2968        debug_assert!(!self.should_rechunk());
2969        PhysRecordBatchIter {
2970            schema: Arc::new(
2971                self.get_columns()
2972                    .iter()
2973                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2974                    .collect(),
2975            ),
2976            arr_iters: self
2977                .materialized_column_iter()
2978                .map(|s| s.chunks().iter())
2979                .collect(),
2980        }
2981    }
2982
2983    /// Get a [`DataFrame`] with all the columns in reversed order.
2984    #[must_use]
2985    pub fn reverse(&self) -> Self {
2986        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2987        unsafe { DataFrame::new_no_checks(self.height(), col) }
2988    }
2989
2990    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2991    /// with `Nones`.
2992    ///
2993    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2994    #[must_use]
2995    pub fn shift(&self, periods: i64) -> Self {
2996        let col = self._apply_columns_par(&|s| s.shift(periods));
2997        unsafe { DataFrame::new_no_checks(self.height(), col) }
2998    }
2999
3000    /// Replace None values with one of the following strategies:
3001    /// * Forward fill (replace None with the previous value)
3002    /// * Backward fill (replace None with the next value)
3003    /// * Mean fill (replace None with the mean of the whole array)
3004    /// * Min fill (replace None with the minimum of the whole array)
3005    /// * Max fill (replace None with the maximum of the whole array)
3006    ///
3007    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
3008    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3009        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3010
3011        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3012    }
3013
3014    /// Pipe different functions/ closure operations that work on a DataFrame together.
3015    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3016    where
3017        F: Fn(DataFrame) -> PolarsResult<B>,
3018    {
3019        f(self)
3020    }
3021
3022    /// Pipe different functions/ closure operations that work on a DataFrame together.
3023    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3024    where
3025        F: Fn(&mut DataFrame) -> PolarsResult<B>,
3026    {
3027        f(self)
3028    }
3029
3030    /// Pipe different functions/ closure operations that work on a DataFrame together.
3031    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3032    where
3033        F: Fn(DataFrame, Args) -> PolarsResult<B>,
3034    {
3035        f(self, args)
3036    }
3037
3038    /// Drop duplicate rows from a [`DataFrame`].
3039    /// *This fails when there is a column of type List in DataFrame*
3040    ///
3041    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3042    ///
3043    /// # Example
3044    ///
3045    /// ```no_run
3046    /// # use polars_core::prelude::*;
3047    /// let df = df! {
3048    ///               "flt" => [1., 1., 2., 2., 3., 3.],
3049    ///               "int" => [1, 1, 2, 2, 3, 3, ],
3050    ///               "str" => ["a", "a", "b", "b", "c", "c"]
3051    ///           }?;
3052    ///
3053    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3054    /// # Ok::<(), PolarsError>(())
3055    /// ```
3056    /// Returns
3057    ///
3058    /// ```text
3059    /// +-----+-----+-----+
3060    /// | flt | int | str |
3061    /// | --- | --- | --- |
3062    /// | f64 | i32 | str |
3063    /// +=====+=====+=====+
3064    /// | 1   | 1   | "a" |
3065    /// +-----+-----+-----+
3066    /// | 2   | 2   | "b" |
3067    /// +-----+-----+-----+
3068    /// | 3   | 3   | "c" |
3069    /// +-----+-----+-----+
3070    /// ```
3071    #[cfg(feature = "algorithm_group_by")]
3072    pub fn unique_stable(
3073        &self,
3074        subset: Option<&[String]>,
3075        keep: UniqueKeepStrategy,
3076        slice: Option<(i64, usize)>,
3077    ) -> PolarsResult<DataFrame> {
3078        self.unique_impl(
3079            true,
3080            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3081            keep,
3082            slice,
3083        )
3084    }
3085
3086    /// Unstable distinct. See [`DataFrame::unique_stable`].
3087    #[cfg(feature = "algorithm_group_by")]
3088    pub fn unique<I, S>(
3089        &self,
3090        subset: Option<&[String]>,
3091        keep: UniqueKeepStrategy,
3092        slice: Option<(i64, usize)>,
3093    ) -> PolarsResult<DataFrame> {
3094        self.unique_impl(
3095            false,
3096            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3097            keep,
3098            slice,
3099        )
3100    }
3101
3102    #[cfg(feature = "algorithm_group_by")]
3103    pub fn unique_impl(
3104        &self,
3105        maintain_order: bool,
3106        subset: Option<Vec<PlSmallStr>>,
3107        keep: UniqueKeepStrategy,
3108        slice: Option<(i64, usize)>,
3109    ) -> PolarsResult<Self> {
3110        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3111        let mut df = self.clone();
3112        // take on multiple chunks is terrible
3113        df.as_single_chunk_par();
3114
3115        let columns = match (keep, maintain_order) {
3116            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3117                let gb = df.group_by_stable(names)?;
3118                let groups = gb.get_groups();
3119                let (offset, len) = slice.unwrap_or((0, groups.len()));
3120                let groups = groups.slice(offset, len);
3121                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3122            },
3123            (UniqueKeepStrategy::Last, true) => {
3124                // maintain order by last values, so the sorted groups are not correct as they
3125                // are sorted by the first value
3126                let gb = df.group_by_stable(names)?;
3127                let groups = gb.get_groups();
3128
3129                let last_idx: NoNull<IdxCa> = groups
3130                    .iter()
3131                    .map(|g| match g {
3132                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3133                        GroupsIndicator::Slice([first, len]) => first + len - 1,
3134                    })
3135                    .collect();
3136
3137                let mut last_idx = last_idx.into_inner().sort(false);
3138
3139                if let Some((offset, len)) = slice {
3140                    last_idx = last_idx.slice(offset, len);
3141                }
3142
3143                let last_idx = NoNull::new(last_idx);
3144                let out = unsafe { df.take_unchecked(&last_idx) };
3145                return Ok(out);
3146            },
3147            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3148                let gb = df.group_by(names)?;
3149                let groups = gb.get_groups();
3150                let (offset, len) = slice.unwrap_or((0, groups.len()));
3151                let groups = groups.slice(offset, len);
3152                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3153            },
3154            (UniqueKeepStrategy::Last, false) => {
3155                let gb = df.group_by(names)?;
3156                let groups = gb.get_groups();
3157                let (offset, len) = slice.unwrap_or((0, groups.len()));
3158                let groups = groups.slice(offset, len);
3159                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3160            },
3161            (UniqueKeepStrategy::None, _) => {
3162                let df_part = df.select(names)?;
3163                let mask = df_part.is_unique()?;
3164                let mut filtered = df.filter(&mask)?;
3165
3166                if let Some((offset, len)) = slice {
3167                    filtered = filtered.slice(offset, len);
3168                }
3169                return Ok(filtered);
3170            },
3171        };
3172        let height = Self::infer_height(&columns);
3173        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3174    }
3175
3176    /// Get a mask of all the unique rows in the [`DataFrame`].
3177    ///
3178    /// # Example
3179    ///
3180    /// ```no_run
3181    /// # use polars_core::prelude::*;
3182    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3183    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3184    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3185    ///
3186    /// assert!(ca.all());
3187    /// # Ok::<(), PolarsError>(())
3188    /// ```
3189    #[cfg(feature = "algorithm_group_by")]
3190    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3191        let gb = self.group_by(self.get_column_names_owned())?;
3192        let groups = gb.get_groups();
3193        Ok(is_unique_helper(
3194            groups,
3195            self.height() as IdxSize,
3196            true,
3197            false,
3198        ))
3199    }
3200
3201    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3202    ///
3203    /// # Example
3204    ///
3205    /// ```no_run
3206    /// # use polars_core::prelude::*;
3207    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3208    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3209    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3210    ///
3211    /// assert!(!ca.all());
3212    /// # Ok::<(), PolarsError>(())
3213    /// ```
3214    #[cfg(feature = "algorithm_group_by")]
3215    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3216        let gb = self.group_by(self.get_column_names_owned())?;
3217        let groups = gb.get_groups();
3218        Ok(is_unique_helper(
3219            groups,
3220            self.height() as IdxSize,
3221            false,
3222            true,
3223        ))
3224    }
3225
3226    /// Create a new [`DataFrame`] that shows the null counts per column.
3227    #[must_use]
3228    pub fn null_count(&self) -> Self {
3229        let cols = self
3230            .columns
3231            .iter()
3232            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3233            .collect();
3234        unsafe { Self::new_no_checks(1, cols) }
3235    }
3236
3237    /// Hash and combine the row values
3238    #[cfg(feature = "row_hash")]
3239    pub fn hash_rows(
3240        &mut self,
3241        hasher_builder: Option<PlSeedableRandomStateQuality>,
3242    ) -> PolarsResult<UInt64Chunked> {
3243        let dfs = split_df(self, POOL.current_num_threads(), false);
3244        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3245
3246        let mut iter = cas.into_iter();
3247        let mut acc_ca = iter.next().unwrap();
3248        for ca in iter {
3249            acc_ca.append(&ca)?;
3250        }
3251        Ok(acc_ca.rechunk().into_owned())
3252    }
3253
3254    /// Get the supertype of the columns in this DataFrame
3255    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3256        self.columns
3257            .iter()
3258            .map(|s| Ok(s.dtype().clone()))
3259            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3260    }
3261
3262    /// Take by index values given by the slice `idx`.
3263    /// # Warning
3264    /// Be careful with allowing threads when calling this in a large hot loop
3265    /// every thread split may be on rayon stack and lead to SO
3266    #[doc(hidden)]
3267    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3268        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3269    }
3270
3271    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3272    /// if the index value in `idx` are sorted. This will maintain sorted flags.
3273    ///
3274    /// # Warning
3275    /// Be careful with allowing threads when calling this in a large hot loop
3276    /// every thread split may be on rayon stack and lead to SO
3277    #[doc(hidden)]
3278    pub unsafe fn _take_unchecked_slice_sorted(
3279        &self,
3280        idx: &[IdxSize],
3281        allow_threads: bool,
3282        sorted: IsSorted,
3283    ) -> Self {
3284        #[cfg(debug_assertions)]
3285        {
3286            if idx.len() > 2 {
3287                match sorted {
3288                    IsSorted::Ascending => {
3289                        assert!(idx[0] <= idx[idx.len() - 1]);
3290                    },
3291                    IsSorted::Descending => {
3292                        assert!(idx[0] >= idx[idx.len() - 1]);
3293                    },
3294                    _ => {},
3295                }
3296            }
3297        }
3298        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3299        ca.set_sorted_flag(sorted);
3300        self.take_unchecked_impl(&ca, allow_threads)
3301    }
3302
3303    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3304    #[doc(hidden)]
3305    pub fn _partition_by_impl(
3306        &self,
3307        cols: &[PlSmallStr],
3308        stable: bool,
3309        include_key: bool,
3310        parallel: bool,
3311    ) -> PolarsResult<Vec<DataFrame>> {
3312        let selected_keys = self.select_columns(cols.iter().cloned())?;
3313        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3314        let groups = groups.take_groups();
3315
3316        // drop key columns prior to calculation if requested
3317        let df = if include_key {
3318            self.clone()
3319        } else {
3320            self.drop_many(cols.iter().cloned())
3321        };
3322
3323        if parallel {
3324            // don't parallelize this
3325            // there is a lot of parallelization in take and this may easily SO
3326            POOL.install(|| {
3327                match groups.as_ref() {
3328                    GroupsType::Idx(idx) => {
3329                        // Rechunk as the gather may rechunk for every group #17562.
3330                        let mut df = df.clone();
3331                        df.as_single_chunk_par();
3332                        Ok(idx
3333                            .into_par_iter()
3334                            .map(|(_, group)| {
3335                                // groups are in bounds
3336                                unsafe {
3337                                    df._take_unchecked_slice_sorted(
3338                                        group,
3339                                        false,
3340                                        IsSorted::Ascending,
3341                                    )
3342                                }
3343                            })
3344                            .collect())
3345                    },
3346                    GroupsType::Slice { groups, .. } => Ok(groups
3347                        .into_par_iter()
3348                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
3349                        .collect()),
3350                }
3351            })
3352        } else {
3353            match groups.as_ref() {
3354                GroupsType::Idx(idx) => {
3355                    // Rechunk as the gather may rechunk for every group #17562.
3356                    let mut df = df;
3357                    df.as_single_chunk();
3358                    Ok(idx
3359                        .into_iter()
3360                        .map(|(_, group)| {
3361                            // groups are in bounds
3362                            unsafe {
3363                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3364                            }
3365                        })
3366                        .collect())
3367                },
3368                GroupsType::Slice { groups, .. } => Ok(groups
3369                    .iter()
3370                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3371                    .collect()),
3372            }
3373        }
3374    }
3375
3376    /// Split into multiple DataFrames partitioned by groups
3377    #[cfg(feature = "partition_by")]
3378    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3379    where
3380        I: IntoIterator<Item = S>,
3381        S: Into<PlSmallStr>,
3382    {
3383        let cols = cols
3384            .into_iter()
3385            .map(Into::into)
3386            .collect::<Vec<PlSmallStr>>();
3387        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3388    }
3389
3390    /// Split into multiple DataFrames partitioned by groups
3391    /// Order of the groups are maintained.
3392    #[cfg(feature = "partition_by")]
3393    pub fn partition_by_stable<I, S>(
3394        &self,
3395        cols: I,
3396        include_key: bool,
3397    ) -> PolarsResult<Vec<DataFrame>>
3398    where
3399        I: IntoIterator<Item = S>,
3400        S: Into<PlSmallStr>,
3401    {
3402        let cols = cols
3403            .into_iter()
3404            .map(Into::into)
3405            .collect::<Vec<PlSmallStr>>();
3406        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3407    }
3408
3409    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3410    /// inserted as columns.
3411    #[cfg(feature = "dtype-struct")]
3412    pub fn unnest<I: IntoVec<PlSmallStr>>(
3413        &self,
3414        cols: I,
3415        separator: Option<&str>,
3416    ) -> PolarsResult<DataFrame> {
3417        let cols = cols.into_vec();
3418        self.unnest_impl(cols.into_iter().collect(), separator)
3419    }
3420
3421    #[cfg(feature = "dtype-struct")]
3422    fn unnest_impl(
3423        &self,
3424        cols: PlHashSet<PlSmallStr>,
3425        separator: Option<&str>,
3426    ) -> PolarsResult<DataFrame> {
3427        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3428        let mut count = 0;
3429        for s in &self.columns {
3430            if cols.contains(s.name()) {
3431                let ca = s.struct_()?.clone();
3432                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3433                    if let Some(separator) = &separator {
3434                        f.rename(polars_utils::format_pl_smallstr!(
3435                            "{}{}{}",
3436                            s.name(),
3437                            separator,
3438                            f.name()
3439                        ));
3440                    }
3441                    Column::from(f)
3442                }));
3443                count += 1;
3444            } else {
3445                new_cols.push(s.clone())
3446            }
3447        }
3448        if count != cols.len() {
3449            // one or more columns not found
3450            // the code below will return an error with the missing name
3451            let schema = self.schema();
3452            for col in cols {
3453                let _ = schema
3454                    .get(col.as_str())
3455                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3456            }
3457        }
3458        DataFrame::new(new_cols)
3459    }
3460
3461    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3462        cols.first().map_or(0, Column::len)
3463    }
3464
3465    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3466        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3467        // append_chunk or something like this. It is just quite difficult to make that safe.
3468        let df = DataFrame::from(rb);
3469        polars_ensure!(
3470            self.schema() == df.schema(),
3471            SchemaMismatch: "cannot append record batch with different schema\n\n
3472        Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3473        );
3474        self.vstack_mut_owned_unchecked(df);
3475        Ok(())
3476    }
3477}
3478
/// Iterator that yields one [`RecordBatch`] per chunk index, built from the chunk with that
/// index of every column.
pub struct RecordBatchIter<'a> {
    /// Columns whose chunks are converted to arrow arrays.
    columns: &'a Vec<Column>,
    /// Schema attached to every produced batch.
    schema: ArrowSchemaRef,
    /// Index of the chunk that the next call to `next` converts.
    idx: usize,
    /// Total number of chunks to produce; iteration ends once `idx` reaches this value.
    n_chunks: usize,
    /// Compatibility level passed along when converting a chunk to arrow.
    compat_level: CompatLevel,
    /// Whether the columns are converted in parallel on `POOL` rather than sequentially.
    parallel: bool,
}
3487
3488impl Iterator for RecordBatchIter<'_> {
3489    type Item = RecordBatch;
3490
3491    fn next(&mut self) -> Option<Self::Item> {
3492        if self.idx >= self.n_chunks {
3493            return None;
3494        }
3495
3496        // Create a batch of the columns with the same chunk no.
3497        let batch_cols: Vec<ArrayRef> = if self.parallel {
3498            let iter = self
3499                .columns
3500                .par_iter()
3501                .map(Column::as_materialized_series)
3502                .map(|s| s.to_arrow(self.idx, self.compat_level));
3503            POOL.install(|| iter.collect())
3504        } else {
3505            self.columns
3506                .iter()
3507                .map(Column::as_materialized_series)
3508                .map(|s| s.to_arrow(self.idx, self.compat_level))
3509                .collect()
3510        };
3511        self.idx += 1;
3512
3513        let length = batch_cols.first().map_or(0, |arr| arr.len());
3514        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3515    }
3516
3517    fn size_hint(&self) -> (usize, Option<usize>) {
3518        let n = self.n_chunks - self.idx;
3519        (n, Some(n))
3520    }
3521}
3522
/// Iterator that yields [`RecordBatch`]es by advancing one array iterator per column in
/// lockstep and cloning the arrays they point at.
pub struct PhysRecordBatchIter<'a> {
    /// Schema attached to every produced batch.
    schema: ArrowSchemaRef,
    /// One iterator over borrowed arrays per column; each call to `next` takes one array
    /// from every iterator.
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}
3527
3528impl Iterator for PhysRecordBatchIter<'_> {
3529    type Item = RecordBatch;
3530
3531    fn next(&mut self) -> Option<Self::Item> {
3532        let arrs = self
3533            .arr_iters
3534            .iter_mut()
3535            .map(|phys_iter| phys_iter.next().cloned())
3536            .collect::<Option<Vec<_>>>()?;
3537
3538        let length = arrs.first().map_or(0, |arr| arr.len());
3539        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3540    }
3541
3542    fn size_hint(&self) -> (usize, Option<usize>) {
3543        if let Some(iter) = self.arr_iters.first() {
3544            iter.size_hint()
3545        } else {
3546            (0, None)
3547        }
3548    }
3549}
3550
3551impl Default for DataFrame {
3552    fn default() -> Self {
3553        DataFrame::empty()
3554    }
3555}
3556
3557impl From<DataFrame> for Vec<Column> {
3558    fn from(df: DataFrame) -> Self {
3559        df.columns
3560    }
3561}
3562
3563// utility to test if we can vstack/extend the columns
3564fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3565    polars_ensure!(
3566        left.name() == right.name(),
3567        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3568        left.name(), right.name(),
3569    );
3570    Ok(())
3571}
3572
#[cfg(test)]
mod test {
    use super::*;

    /// Three-row fixture with an integer `days` column and a float `temp` column.
    fn create_frame() -> DataFrame {
        let days = Column::new("days".into(), [0, 1, 2].as_ref());
        let temp = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![days, temp]).unwrap()
    }

    /// A freshly built frame yields exactly one record batch holding all rows.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!("foo" => [1, 2, 3, 4, 5]).unwrap();
        let mut batches = df.iter_chunks(CompatLevel::newest(), false);

        let first = batches.next().unwrap();
        assert_eq!(5, first.len());
        assert!(batches.next().is_none());
    }

    /// Selecting a column and comparing it against a scalar matches exactly one row.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        let days = df.column("days").unwrap().as_series().unwrap();
        let n_matches = days.equal(1).unwrap().sum();
        assert_eq!(n_matches, Some(1));
    }

    /// Filtering a string column with a mask that selects nothing keeps a single chunk.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let values = vec!["test".to_string()];
        let column = Column::new(PlSmallStr::from_str(col_name), values);
        let df = DataFrame::new(vec![column]).unwrap();

        let mask = df
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .equal("")
            .unwrap();
        let filtered = df.filter(&mask).unwrap();

        let n_chunks = filtered
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .n_chunks();
        assert_eq!(n_chunks, 1);
    }

    /// Filtering a list column with a length-one `false` mask yields one empty chunk.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let inner = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let lists: ListChunked = [&inner].into_iter().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let filtered = lists.filter(&mask).unwrap();

        assert_eq!(filtered.chunks.len(), 1);
        assert_eq!(filtered.len(), 0);
    }

    /// Slicing keeps all columns and only the requested rows.
    #[test]
    fn slice() {
        let sliced_df = create_frame().slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    /// A frame built from contiguous columns does not need rechunking.
    #[test]
    fn rechunk_false() {
        assert!(!create_frame().should_rechunk())
    }

    /// Adding a multi-chunk column makes the frame report that it should be rechunked.
    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut chunked = Series::new("foo".into(), 0..2);
        let tail = Series::new("bar".into(), 0..1);
        chunked.append(&tail)?;

        // Append series to frame; now we should rechunk
        assert!(base.with_column(chunked)?.should_rechunk());
        Ok(())
    }

    /// `with_column` accepts both an existing name (replacement) and a new name (addition).
    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();

        // "foo" checks that an existing column is replaced, "bar" that a new one is added.
        for name in ["foo", "bar"] {
            assert!(
                df.with_column(Series::new(name.into(), &[1, 2, 3]))
                    .is_ok()
            );
        }
        assert!(df.column("bar").is_ok())
    }

    /// Stable de-duplication keeping the first occurrence leaves one row per distinct value.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let input = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let deduped = input
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap();
        let sorted = deduped
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();

        let expected = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(sorted.equals(&expected));
    }

    /// Stacking must add a chunk instead of accidentally rechunking.
    #[test]
    fn test_vstack() {
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let head = df.slice(0, 3);
        df.vstack_mut(&head).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    /// Stacking onto an empty frame adopts the rows of the stacked frame.
    #[test]
    fn test_vstack_on_empty_dataframe() {
        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let mut df = DataFrame::empty();
        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

    /// `replace_or_add` names the new column after the given name, not the series name.
    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        let new_values = Series::new("bar".into(), [1, 2, 3]);
        df.replace_or_add("c".into(), new_values)?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }

    /// With `UniqueKeepStrategy::None` only values occurring exactly once survive.
    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let subset = ["x".to_string()];
        let out = df
            .unique_stable(Some(&subset[..]), UniqueKeepStrategy::None, Some((0, 2)))
            .unwrap();

        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    /// Applying a cast to a column must be reflected in the schema of the frame.
    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |col| col.cast(&DataType::Int8).unwrap())
            .unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}