polars_core/frame/
mod.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(feature = "proptest")]
33pub mod proptest;
34#[cfg(any(feature = "rows", feature = "object"))]
35pub mod row;
36mod top_k;
37mod upstream_traits;
38mod validation;
39
40use arrow::record_batch::{RecordBatch, RecordBatchT};
41use polars_utils::pl_str::PlSmallStr;
42#[cfg(feature = "serde")]
43use serde::{Deserialize, Serialize};
44use strum_macros::IntoStaticStr;
45
46use crate::POOL;
47#[cfg(feature = "row_hash")]
48use crate::hashing::_df_rows_to_hashes_threaded_vertical;
49use crate::prelude::sort::arg_sort;
50use crate::series::IsSorted;
51
/// Which row to keep out of a set of rows that are considered equal.
///
/// Converts to a snake_case `&'static str` through the derived `IntoStaticStr`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations.
    #[default]
    Any,
}
68
69fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
70where
71    F: for<'a> FnMut(&'a T) -> &'a str,
72{
73    // Always unique.
74    if items.len() <= 1 {
75        return Ok(());
76    }
77
78    if items.len() <= 4 {
79        // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
80        for i in 0..items.len() - 1 {
81            let name = get_name(&items[i]);
82            for other in items.iter().skip(i + 1) {
83                if name == get_name(other) {
84                    polars_bail!(duplicate = name);
85                }
86            }
87        }
88    } else {
89        let mut names = PlHashSet::with_capacity(items.len());
90        for item in items {
91            let name = get_name(item);
92            if !names.insert(name) {
93                polars_bail!(duplicate = name);
94            }
95        }
96    }
97    Ok(())
98}
99
100/// A contiguous growable collection of `Series` that have the same length.
101///
102/// ## Use declarations
103///
104/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
105///
106/// ```rust
107/// use polars_core::prelude::*; // if the crate polars-core is used directly
108/// // use polars::prelude::*;      if the crate polars is used
109/// ```
110///
111/// # Initialization
112/// ## Default
113///
114/// A `DataFrame` can be initialized empty:
115///
116/// ```rust
117/// # use polars_core::prelude::*;
118/// let df = DataFrame::default();
119/// assert!(df.is_empty());
120/// ```
121///
/// ## Wrapping a `Vec<Column>`
///
/// A `DataFrame` is built upon a `Vec<Column>` where all the `Column`s have the same length.
125///
126/// ```rust
127/// # use polars_core::prelude::*;
128/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
129/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
130///
131/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
132/// ```
133///
134/// ## Using a macro
135///
136/// The [`df!`] macro is a convenient method:
137///
138/// ```rust
139/// # use polars_core::prelude::*;
140/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
141///                                       "Color" => ["Red", "Yellow", "Green"]);
142/// ```
143///
144/// ## Using a CSV file
145///
146/// See the `polars_io::csv::CsvReader`.
147///
148/// # Indexing
149/// ## By a number
150///
151/// The `Index<usize>` is implemented for the `DataFrame`.
152///
153/// ```rust
154/// # use polars_core::prelude::*;
155/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
156///              "Color" => ["Red", "Yellow", "Green"])?;
157///
158/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
159/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
160/// # Ok::<(), PolarsError>(())
161/// ```
162///
163/// ## By a `Series` name
164///
165/// ```rust
166/// # use polars_core::prelude::*;
167/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
168///              "Color" => ["Red", "Yellow", "Green"])?;
169///
170/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
171/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
172/// # Ok::<(), PolarsError>(())
173/// ```
#[derive(Clone)]
pub struct DataFrame {
    // Number of rows; see the invariant on `columns`.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between schema and reading.
    cached_schema: OnceLock<SchemaRef>,
}
184
185impl DataFrame {
    /// Discard the cached schema so that the next call to [`DataFrame::schema`] rebuilds it
    /// from the current columns.
    pub fn clear_schema(&mut self) {
        self.cached_schema = OnceLock::new();
    }
189
    /// Iterate over the columns of this `DataFrame` by reference, in order.
    #[inline]
    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
        self.columns.iter()
    }
194
    /// Iterate over the columns in order, yielding each one as a [`Series`] reference obtained
    /// through `Column::as_materialized_series`.
    #[inline]
    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
        self.columns.iter().map(Column::as_materialized_series)
    }
199
    /// Parallel counterpart of [`DataFrame::materialized_column_iter`]: a rayon
    /// `ParallelIterator` over the columns as [`Series`] references.
    #[inline]
    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
        self.columns.par_iter().map(Column::as_materialized_series)
    }
204
    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
    ///
    /// The result is the sum of `Column::estimated_size` over all columns.
    ///
    /// # Implementation
    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
    ///
    /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
    /// However, this function will yield a smaller number. This is because this function returns
    /// the visible size of the buffer, not its total capacity.
    ///
    /// FFI buffers are included in this estimation.
    pub fn estimated_size(&self) -> usize {
        self.columns.iter().map(Column::estimated_size).sum()
    }
220
221    // Reduce monomorphization.
222    fn try_apply_columns(
223        &self,
224        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
225    ) -> PolarsResult<Vec<Column>> {
226        self.columns.iter().map(func).collect()
227    }
228    // Reduce monomorphization.
229    pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
230        self.columns.iter().map(func).collect()
231    }
232    // Reduce monomorphization.
233    fn try_apply_columns_par(
234        &self,
235        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
236    ) -> PolarsResult<Vec<Column>> {
237        POOL.install(|| self.columns.par_iter().map(func).collect())
238    }
239    // Reduce monomorphization.
240    pub fn _apply_columns_par(
241        &self,
242        func: &(dyn Fn(&Column) -> Column + Send + Sync),
243    ) -> Vec<Column> {
244        POOL.install(|| self.columns.par_iter().map(func).collect())
245    }
246
247    /// Get the index of the column.
248    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
249        self.get_column_index(name)
250            .ok_or_else(|| polars_err!(col_not_found = name))
251    }
252
253    fn check_already_present(&self, name: &str) -> PolarsResult<()> {
254        polars_ensure!(
255            self.columns.iter().all(|s| s.name().as_str() != name),
256            Duplicate: "column with name {:?} is already present in the DataFrame", name
257        );
258        Ok(())
259    }
260
261    /// Reserve additional slots into the chunks of the series.
262    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
263        for s in &mut self.columns {
264            if let Column::Series(s) = s {
265                // SAFETY:
266                // do not modify the data, simply resize.
267                unsafe { s.chunks_mut().reserve(additional) }
268            }
269        }
270    }
271
272    /// Create a DataFrame from a Vector of Series.
273    ///
274    /// Errors if a column names are not unique, or if heights are not all equal.
275    ///
276    /// # Example
277    ///
278    /// ```
279    /// # use polars_core::prelude::*;
280    /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
281    /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
282    ///
283    /// let df = DataFrame::new(vec![s0, s1])?;
284    /// # Ok::<(), PolarsError>(())
285    /// ```
286    pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
287        DataFrame::validate_columns_slice(&columns)
288            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
289        Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
290    }
291
292    pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
293        for col in &columns {
294            polars_ensure!(
295                col.len() == height,
296                ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
297                columns[0].name(), height, col.name(), col.len()
298            );
299        }
300
301        ensure_names_unique(&columns, |s| s.name().as_str())?;
302
303        Ok(DataFrame {
304            height,
305            columns,
306            cached_schema: OnceLock::new(),
307        })
308    }
309
310    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
311    /// columns to match the other columns.
312    pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
313        // The length of the longest non-unit length column determines the
314        // broadcast length. If all columns are unit-length the broadcast length
315        // is one.
316        let broadcast_len = columns
317            .iter()
318            .map(|s| s.len())
319            .filter(|l| *l != 1)
320            .max()
321            .unwrap_or(1);
322        Self::new_with_broadcast_len(columns, broadcast_len)
323    }
324
    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
    /// columns to broadcast_len.
    ///
    /// Errors if the column names are not unique, or if a column has a length that is
    /// neither `1` nor `broadcast_len`.
    pub fn new_with_broadcast_len(
        columns: Vec<Column>,
        broadcast_len: usize,
    ) -> PolarsResult<Self> {
        ensure_names_unique(&columns, |s| s.name().as_str())?;
        // SAFETY: name uniqueness was checked on the line above.
        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
    }
334
    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
    /// columns to `broadcast_len`.
    ///
    /// Errors with `ShapeMismatch` if a column has a length that is neither `1` nor
    /// `broadcast_len`. With no columns at all the resulting height is `0`.
    ///
    /// # Safety
    /// Does not check that the column names are unique (which they must be).
    pub unsafe fn new_with_broadcast_no_namecheck(
        mut columns: Vec<Column>,
        broadcast_len: usize,
    ) -> PolarsResult<Self> {
        for col in &mut columns {
            // Length not equal to the broadcast len, needs broadcast or is an error.
            let len = col.len();
            if len != broadcast_len {
                if len != 1 {
                    let name = col.name().to_owned();
                    // For a friendlier message, name a column that does have the target length
                    // (if any).
                    let extra_info =
                        if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
                            format!(" (matching column '{}')", c.name())
                        } else {
                            String::new()
                        };
                    polars_bail!(
                        ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
                    );
                }
                // Unit-length column: replace it by its single value expanded to `broadcast_len`.
                *col = col.new_from_index(0, broadcast_len);
            }
        }

        // A frame without columns gets height 0 regardless of `broadcast_len`.
        let length = if columns.is_empty() { 0 } else { broadcast_len };

        Ok(unsafe { DataFrame::new_no_checks(length, columns) })
    }
368
369    pub fn new_from_index(&self, index: usize, height: usize) -> Self {
370        let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
371        unsafe { Self::new_no_checks(height, cols.collect()) }
372    }
373
    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
    ///
    /// The result has no columns and a height of `0`.
    ///
    /// # Example
    ///
    /// ```rust
    /// use polars_core::prelude::DataFrame;
    /// static EMPTY: DataFrame = DataFrame::empty();
    /// ```
    pub const fn empty() -> Self {
        Self::empty_with_height(0)
    }
385
386    /// Creates an empty `DataFrame` with a specific `height`.
387    pub const fn empty_with_height(height: usize) -> Self {
388        DataFrame {
389            height,
390            columns: vec![],
391            cached_schema: OnceLock::new(),
392        }
393    }
394
    /// Create an empty `DataFrame` with empty columns as per the `schema`.
    ///
    /// Unlike [`DataFrame::empty_with_schema`], the passed `Arc` is stored as the cached
    /// schema of the result.
    pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
        let mut df = Self::empty_with_schema(&schema);
        // Seed the schema cache with the `Arc` we were handed.
        df.cached_schema = OnceLock::from(schema);
        df
    }
401
402    /// Create an empty `DataFrame` with empty columns as per the `schema`.
403    pub fn empty_with_schema(schema: &Schema) -> Self {
404        let cols = schema
405            .iter()
406            .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
407            .collect();
408        unsafe { DataFrame::new_no_checks(0, cols) }
409    }
410
411    /// Create an empty `DataFrame` with empty columns as per the `schema`.
412    pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
413        let cols = schema
414            .iter_values()
415            .map(|fld| {
416                Column::from(Series::new_empty(
417                    fld.name.clone(),
418                    &(DataType::from_arrow_field(fld)),
419                ))
420            })
421            .collect();
422        unsafe { DataFrame::new_no_checks(0, cols) }
423    }
424
425    /// Create a new `DataFrame` with the given schema, only containing nulls.
426    pub fn full_null(schema: &Schema, height: usize) -> Self {
427        let columns = schema
428            .iter_fields()
429            .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
430            .collect();
431        unsafe { DataFrame::new_no_checks(height, columns) }
432    }
433
    /// Removes the last column from the `DataFrame` and returns it, or [`None`] if it is empty.
    ///
    /// The cached schema is cleared; the stored height is left untouched.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
    ///
    /// assert_eq!(df.pop(), Some(s2));
    /// assert_eq!(df.pop(), Some(s1));
    /// assert_eq!(df.pop(), None);
    /// assert!(df.is_empty());
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn pop(&mut self) -> Option<Column> {
        // The set of columns changes, so a previously cached schema would be stale.
        self.clear_schema();

        self.columns.pop()
    }
455
456    /// Add a new column at index 0 that counts the rows.
457    ///
458    /// # Example
459    ///
460    /// ```
461    /// # use polars_core::prelude::*;
462    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
463    /// assert_eq!(df1.shape(), (4, 1));
464    ///
465    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
466    /// assert_eq!(df2.shape(), (4, 2));
467    /// println!("{}", df2);
468    ///
469    /// # Ok::<(), PolarsError>(())
470    /// ```
471    ///
472    /// Output:
473    ///
474    /// ```text
475    ///  shape: (4, 2)
476    ///  +-----+----------+
477    ///  | Id  | Name     |
478    ///  | --- | ---      |
479    ///  | u32 | str      |
480    ///  +=====+==========+
481    ///  | 0   | James    |
482    ///  +-----+----------+
483    ///  | 1   | Mary     |
484    ///  +-----+----------+
485    ///  | 2   | John     |
486    ///  +-----+----------+
487    ///  | 3   | Patricia |
488    ///  +-----+----------+
489    /// ```
490    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
491        let mut columns = Vec::with_capacity(self.columns.len() + 1);
492        let offset = offset.unwrap_or(0);
493
494        let col = Column::new_row_index(name, offset, self.height())?;
495        columns.push(col);
496        columns.extend_from_slice(&self.columns);
497        DataFrame::new(columns)
498    }
499
500    /// Add a row index column in place.
501    ///
502    /// # Safety
503    /// The caller should ensure the DataFrame does not already contain a column with the given name.
504    ///
505    /// # Panics
506    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
507    pub unsafe fn with_row_index_mut(
508        &mut self,
509        name: PlSmallStr,
510        offset: Option<IdxSize>,
511    ) -> &mut Self {
512        // TODO: Make this function unsafe
513        debug_assert!(
514            self.columns.iter().all(|c| c.name() != &name),
515            "with_row_index_mut(): column with name {} already exists",
516            &name
517        );
518
519        let offset = offset.unwrap_or(0);
520        let col = Column::new_row_index(name, offset, self.height()).unwrap();
521
522        self.clear_schema();
523        self.columns.insert(0, col);
524        self
525    }
526
527    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
528    /// `Series`.
529    ///
530    /// Calculates the height from the first column or `0` if no columns are given.
531    ///
532    /// # Safety
533    ///
534    /// It is the callers responsibility to uphold the contract of all `Series`
535    /// having an equal length and a unique name, if not this may panic down the line.
536    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
537        let height = columns.first().map_or(0, Column::len);
538        unsafe { Self::new_no_checks(height, columns) }
539    }
540
    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
    /// `Series`.
    ///
    /// It is advised to use [DataFrame::new] in favor of this method.
    ///
    /// In builds with debug assertions enabled the columns are still validated, and this
    /// function panics if that validation fails.
    ///
    /// # Safety
    ///
    /// It is the callers responsibility to uphold the contract of all `Series`
    /// having an equal length and a unique name, if not this may panic down the line.
    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
        // Debug-only sanity check of the caller's contract.
        if cfg!(debug_assertions) {
            DataFrame::validate_columns_slice(&columns).unwrap();
        }

        unsafe { Self::_new_no_checks_impl(height, columns) }
    }
557
    /// Assemble a `DataFrame` from `height` and `columns` without any validation, starting
    /// with an empty schema cache.
    ///
    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
    /// constructed with this method is generally highly unsafe and should not be long-lived.
    #[allow(clippy::missing_safety_doc)]
    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
        DataFrame {
            height,
            columns,
            cached_schema: OnceLock::new(),
        }
    }
569
570    /// Shrink the capacity of this DataFrame to fit its length.
571    pub fn shrink_to_fit(&mut self) {
572        // Don't parallelize this. Memory overhead
573        for s in &mut self.columns {
574            s.shrink_to_fit();
575        }
576    }
577
578    /// Aggregate all the chunks in the DataFrame to a single chunk.
579    pub fn as_single_chunk(&mut self) -> &mut Self {
580        // Don't parallelize this. Memory overhead
581        for s in &mut self.columns {
582            *s = s.rechunk();
583        }
584        self
585    }
586
587    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
588    /// This may lead to more peak memory consumption.
589    pub fn as_single_chunk_par(&mut self) -> &mut Self {
590        if self.columns.iter().any(|c| c.n_chunks() > 1) {
591            self.columns = self._apply_columns_par(&|s| s.rechunk());
592        }
593        self
594    }
595
596    /// Rechunks all columns to only have a single chunk.
597    pub fn rechunk_mut(&mut self) {
598        // SAFETY: We never adjust the length or names of the columns.
599        let columns = unsafe { self.get_columns_mut() };
600
601        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
602            *col = col.rechunk();
603        }
604    }
605
    /// Replace, in place, every binary or string `Column::Series` column by the result of
    /// calling `deshare()` on each of its chunks. All other columns are left untouched.
    pub fn _deshare_views_mut(&mut self) {
        // SAFETY: We never adjust the length or names of the columns.
        unsafe {
            let columns = self.get_columns_mut();
            for col in columns {
                // Only `Column::Series` columns are processed.
                let Column::Series(s) = col else { continue };

                if let Ok(ca) = s.binary() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                } else if let Ok(ca) = s.str() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                }
            }
        }
    }
623
624    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
625    pub fn rechunk_to_record_batch(
626        self,
627        compat_level: CompatLevel,
628    ) -> RecordBatchT<Box<dyn Array>> {
629        let height = self.height();
630
631        let (schema, arrays) = self
632            .columns
633            .into_iter()
634            .map(|col| {
635                let mut series = col.take_materialized_series();
636                // Rechunk to one chunk if necessary
637                if series.n_chunks() > 1 {
638                    series = series.rechunk();
639                }
640                (
641                    series.field().to_arrow(compat_level),
642                    series.to_arrow(0, compat_level),
643                )
644            })
645            .collect();
646
647        RecordBatchT::new(height, Arc::new(schema), arrays)
648    }
649
    /// Returns true if the chunks of the columns do not align and re-chunking should be done
    ///
    /// Returns `false` for a `DataFrame` without columns.
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        // Only columns for which `as_series()` returns `Some` take part in this check.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns: nothing to align.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                // (the lower bound of `size_hint` is used as the chunk count).
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare every other column's chunk lengths, position by position, with
                // those of the first column.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
690
691    /// Ensure all the chunks in the [`DataFrame`] are aligned.
692    pub fn align_chunks_par(&mut self) -> &mut Self {
693        if self.should_rechunk() {
694            self.as_single_chunk_par()
695        } else {
696            self
697        }
698    }
699
700    pub fn align_chunks(&mut self) -> &mut Self {
701        if self.should_rechunk() {
702            self.as_single_chunk()
703        } else {
704            self
705        }
706    }
707
708    /// Get the [`DataFrame`] schema.
709    ///
710    /// # Example
711    ///
712    /// ```rust
713    /// # use polars_core::prelude::*;
714    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
715    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
716    ///
717    /// let f1: Field = Field::new("Thing".into(), DataType::String);
718    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
719    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
720    ///
721    /// assert_eq!(&**df.schema(), &sc);
722    /// # Ok::<(), PolarsError>(())
723    /// ```
724    pub fn schema(&self) -> &SchemaRef {
725        let out = self.cached_schema.get_or_init(|| {
726            Arc::new(
727                self.columns
728                    .iter()
729                    .map(|x| (x.name().clone(), x.dtype().clone()))
730                    .collect(),
731            )
732        });
733
734        debug_assert_eq!(out.len(), self.width());
735
736        out
737    }
738
739    /// Get a reference to the [`DataFrame`] columns.
740    ///
741    /// # Example
742    ///
743    /// ```rust
744    /// # use polars_core::prelude::*;
745    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
746    ///                         "Symbol" => ["A", "C", "G", "T"])?;
747    /// let columns: &[Column] = df.get_columns();
748    ///
749    /// assert_eq!(columns[0].name(), "Name");
750    /// assert_eq!(columns[1].name(), "Symbol");
751    /// # Ok::<(), PolarsError>(())
752    /// ```
753    #[inline]
754    pub fn get_columns(&self) -> &[Column] {
755        &self.columns
756    }
757
    #[inline]
    /// Get mutable access to the underlying columns.
    ///
    /// # Safety
    ///
    /// The caller must ensure the length of all columns remains equal to `height` or
    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
    /// calling [`DataFrame::clear_schema`].
    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
        &mut self.columns
    }
770
    #[inline]
    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
    ///
    /// The cached schema is cleared as well.
    pub fn clear_columns(&mut self) {
        // SAFETY: with no columns left there is no column length that could disagree with
        // `height`, and the cached schema is cleared right below.
        unsafe { self.get_columns_mut() }.clear();
        self.clear_schema();
    }
777
    #[inline]
    /// Extend the columns without checking for name collisions or height.
    ///
    /// The cached schema is cleared afterwards.
    ///
    /// # Safety
    ///
    /// The caller needs to ensure that:
    /// - Column names are unique within the resulting [`DataFrame`].
    /// - The length of each appended column matches the height of the [`DataFrame`]. For
    ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
    ///   with [`DataFrame::set_height`].
    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
        unsafe { self.get_columns_mut() }.extend(iter);
        self.clear_schema();
    }
792
    /// Take ownership of the underlying columns vec, consuming the `DataFrame`.
    pub fn take_columns(self) -> Vec<Column> {
        self.columns
    }
797
    /// Iterator over the columns as [`Series`].
    ///
    /// This is the same iterator as [`DataFrame::materialized_column_iter`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
    ///
    /// let mut iterator = df.iter();
    ///
    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
    /// assert_eq!(iterator.next(), None);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
        self.materialized_column_iter()
    }
818
    /// Get references to the names of the columns, in column order.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
    ///
    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
        self.columns.iter().map(|s| s.name()).collect()
    }
832
    /// Get the [`Vec<PlSmallStr>`] representing the column names.
    ///
    /// Each name is cloned, so the result does not borrow from the `DataFrame`.
    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
        self.columns.iter().map(|s| s.name().clone()).collect()
    }
837
    /// Get the column names as string slices, in column order.
    pub fn get_column_names_str(&self) -> Vec<&str> {
        self.columns.iter().map(|s| s.name().as_str()).collect()
    }
841
842    /// Set the column names.
843    /// # Example
844    ///
845    /// ```rust
846    /// # use polars_core::prelude::*;
847    /// let mut df: DataFrame = df!("Mathematical set" => ["ā„•", "ℤ", "š”»", "ā„š", "ā„", "ā„‚"])?;
848    /// df.set_column_names(["Set"])?;
849    ///
850    /// assert_eq!(df.get_column_names(), &["Set"]);
851    /// # Ok::<(), PolarsError>(())
852    /// ```
853    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
854    where
855        I: IntoIterator<Item = S>,
856        S: Into<PlSmallStr>,
857    {
858        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
859        self._set_column_names_impl(names.as_slice())
860    }
861
862    fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
863        polars_ensure!(
864            names.len() == self.width(),
865            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
866            names.len(), self.width()
867        );
868        ensure_names_unique(names, |s| s.as_str())?;
869
870        let columns = mem::take(&mut self.columns);
871        self.columns = columns
872            .into_iter()
873            .zip(names)
874            .map(|(s, name)| {
875                let mut s = s;
876                s.rename(name.clone());
877                s
878            })
879            .collect();
880        self.clear_schema();
881        Ok(())
882    }
883
884    /// Get the data types of the columns in the [`DataFrame`].
885    ///
886    /// # Example
887    ///
888    /// ```rust
889    /// # use polars_core::prelude::*;
890    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
891    ///                                "Fraction" => [0.965, 0.035])?;
892    ///
893    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
894    /// # Ok::<(), PolarsError>(())
895    /// ```
896    pub fn dtypes(&self) -> Vec<DataType> {
897        self.columns.iter().map(|s| s.dtype().clone()).collect()
898    }
899
900    pub(crate) fn first_series_column(&self) -> Option<&Series> {
901        self.columns.iter().find_map(|col| col.as_series())
902    }
903
904    /// The number of chunks for the first column.
905    pub fn first_col_n_chunks(&self) -> usize {
906        match self.first_series_column() {
907            None if self.columns.is_empty() => 0,
908            None => 1,
909            Some(s) => s.n_chunks(),
910        }
911    }
912
913    /// The highest number of chunks for any column.
914    pub fn max_n_chunks(&self) -> usize {
915        self.columns
916            .iter()
917            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
918            .max()
919            .unwrap_or(0)
920    }
921
922    /// Get a reference to the schema fields of the [`DataFrame`].
923    ///
924    /// # Example
925    ///
926    /// ```rust
927    /// # use polars_core::prelude::*;
928    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
929    ///                            "Fraction" => [0.708, 0.292])?;
930    ///
931    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
932    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
933    ///
934    /// assert_eq!(earth.fields(), &[f1, f2]);
935    /// # Ok::<(), PolarsError>(())
936    /// ```
937    pub fn fields(&self) -> Vec<Field> {
938        self.columns
939            .iter()
940            .map(|s| s.field().into_owned())
941            .collect()
942    }
943
944    /// Get (height, width) of the [`DataFrame`].
945    ///
946    /// # Example
947    ///
948    /// ```rust
949    /// # use polars_core::prelude::*;
950    /// let df0: DataFrame = DataFrame::default();
951    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
952    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
953    ///                          "2" => [1, 2, 3, 4, 5])?;
954    ///
955    /// assert_eq!(df0.shape(), (0 ,0));
956    /// assert_eq!(df1.shape(), (5, 1));
957    /// assert_eq!(df2.shape(), (5, 2));
958    /// # Ok::<(), PolarsError>(())
959    /// ```
960    pub fn shape(&self) -> (usize, usize) {
961        (self.height, self.columns.len())
962    }
963
964    /// Get the width of the [`DataFrame`] which is the number of columns.
965    ///
966    /// # Example
967    ///
968    /// ```rust
969    /// # use polars_core::prelude::*;
970    /// let df0: DataFrame = DataFrame::default();
971    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
972    /// let df2: DataFrame = df!("Series 1" => [0; 0],
973    ///                          "Series 2" => [0; 0])?;
974    ///
975    /// assert_eq!(df0.width(), 0);
976    /// assert_eq!(df1.width(), 1);
977    /// assert_eq!(df2.width(), 2);
978    /// # Ok::<(), PolarsError>(())
979    /// ```
980    pub fn width(&self) -> usize {
981        self.columns.len()
982    }
983
984    /// Get the height of the [`DataFrame`] which is the number of rows.
985    ///
986    /// # Example
987    ///
988    /// ```rust
989    /// # use polars_core::prelude::*;
990    /// let df0: DataFrame = DataFrame::default();
991    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
992    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
993    ///
994    /// assert_eq!(df0.height(), 0);
995    /// assert_eq!(df1.height(), 2);
996    /// assert_eq!(df2.height(), 5);
997    /// # Ok::<(), PolarsError>(())
998    /// ```
999    pub fn height(&self) -> usize {
1000        self.height
1001    }
1002
1003    /// Returns the size as number of rows * number of columns
1004    pub fn size(&self) -> usize {
1005        let s = self.shape();
1006        s.0 * s.1
1007    }
1008
1009    /// Returns `true` if the [`DataFrame`] contains no rows.
1010    ///
1011    /// # Example
1012    ///
1013    /// ```rust
1014    /// # use polars_core::prelude::*;
1015    /// let df1: DataFrame = DataFrame::default();
1016    /// assert!(df1.is_empty());
1017    ///
1018    /// let df2: DataFrame = df!("First name" => ["Forever"],
1019    ///                          "Last name" => ["Alone"])?;
1020    /// assert!(!df2.is_empty());
1021    /// # Ok::<(), PolarsError>(())
1022    /// ```
1023    pub fn is_empty(&self) -> bool {
1024        matches!(self.shape(), (0, _) | (_, 0))
1025    }
1026
1027    /// Set the height (i.e. number of rows) of this [`DataFrame`].
1028    ///
1029    /// # Safety
1030    ///
1031    /// This needs to be equal to the length of all the columns.
1032    pub unsafe fn set_height(&mut self, height: usize) {
1033        self.height = height;
1034    }
1035
1036    /// Add multiple [`Series`] to a [`DataFrame`].
1037    /// The added `Series` are required to have the same length.
1038    ///
1039    /// # Example
1040    ///
1041    /// ```rust
1042    /// # use polars_core::prelude::*;
1043    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1044    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1045    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1046    ///
1047    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1048    /// assert_eq!(df2.shape(), (3, 3));
1049    /// println!("{}", df2);
1050    /// # Ok::<(), PolarsError>(())
1051    /// ```
1052    ///
1053    /// Output:
1054    ///
1055    /// ```text
1056    /// shape: (3, 3)
1057    /// +---------+--------+----------+
1058    /// | Element | Proton | Electron |
1059    /// | ---     | ---    | ---      |
1060    /// | str     | i32    | i32      |
1061    /// +=========+========+==========+
1062    /// | Copper  | 29     | 29       |
1063    /// +---------+--------+----------+
1064    /// | Silver  | 47     | 47       |
1065    /// +---------+--------+----------+
1066    /// | Gold    | 79     | 79       |
1067    /// +---------+--------+----------+
1068    /// ```
1069    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1070        let mut new_cols = self.columns.clone();
1071        new_cols.extend_from_slice(columns);
1072        DataFrame::new(new_cols)
1073    }
1074
1075    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1076    ///
1077    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1078    ///
1079    /// # Example
1080    ///
1081    /// ```rust
1082    /// # use polars_core::prelude::*;
1083    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1084    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1085    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1086    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1087    ///
1088    /// let df3: DataFrame = df1.vstack(&df2)?;
1089    ///
1090    /// assert_eq!(df3.shape(), (5, 2));
1091    /// println!("{}", df3);
1092    /// # Ok::<(), PolarsError>(())
1093    /// ```
1094    ///
1095    /// Output:
1096    ///
1097    /// ```text
1098    /// shape: (5, 2)
1099    /// +-----------+-------------------+
1100    /// | Element   | Melting Point (K) |
1101    /// | ---       | ---               |
1102    /// | str       | f64               |
1103    /// +===========+===================+
1104    /// | Copper    | 1357.77           |
1105    /// +-----------+-------------------+
1106    /// | Silver    | 1234.93           |
1107    /// +-----------+-------------------+
1108    /// | Gold      | 1337.33           |
1109    /// +-----------+-------------------+
1110    /// | Platinum  | 2041.4            |
1111    /// +-----------+-------------------+
1112    /// | Palladium | 1828.05           |
1113    /// +-----------+-------------------+
1114    /// ```
1115    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1116        let mut df = self.clone();
1117        df.vstack_mut(other)?;
1118        Ok(df)
1119    }
1120
1121    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1122    ///
1123    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1124    ///
1125    /// # Example
1126    ///
1127    /// ```rust
1128    /// # use polars_core::prelude::*;
1129    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1130    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1131    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1132    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
1133    ///
1134    /// df1.vstack_mut(&df2)?;
1135    ///
1136    /// assert_eq!(df1.shape(), (5, 2));
1137    /// println!("{}", df1);
1138    /// # Ok::<(), PolarsError>(())
1139    /// ```
1140    ///
1141    /// Output:
1142    ///
1143    /// ```text
1144    /// shape: (5, 2)
1145    /// +-----------+-------------------+
1146    /// | Element   | Melting Point (K) |
1147    /// | ---       | ---               |
1148    /// | str       | f64               |
1149    /// +===========+===================+
1150    /// | Copper    | 1357.77           |
1151    /// +-----------+-------------------+
1152    /// | Silver    | 1234.93           |
1153    /// +-----------+-------------------+
1154    /// | Gold      | 1337.33           |
1155    /// +-----------+-------------------+
1156    /// | Platinum  | 2041.4            |
1157    /// +-----------+-------------------+
1158    /// | Palladium | 1828.05           |
1159    /// +-----------+-------------------+
1160    /// ```
1161    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1162        if self.width() != other.width() {
1163            polars_ensure!(
1164                self.width() == 0,
1165                ShapeMismatch:
1166                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1167                self.width(), other.width(),
1168            );
1169            self.columns.clone_from(&other.columns);
1170            self.height = other.height;
1171            return Ok(self);
1172        }
1173
1174        self.columns
1175            .iter_mut()
1176            .zip(other.columns.iter())
1177            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1178                ensure_can_extend(&*left, right)?;
1179                left.append(right).map_err(|e| {
1180                    e.context(format!("failed to vstack column '{}'", right.name()).into())
1181                })?;
1182                Ok(())
1183            })?;
1184        self.height += other.height;
1185        Ok(self)
1186    }
1187
1188    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1189        if self.width() != other.width() {
1190            polars_ensure!(
1191                self.width() == 0,
1192                ShapeMismatch:
1193                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1194                self.width(), other.width(),
1195            );
1196            self.columns = other.columns;
1197            self.height = other.height;
1198            return Ok(self);
1199        }
1200
1201        self.columns
1202            .iter_mut()
1203            .zip(other.columns.into_iter())
1204            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1205                ensure_can_extend(&*left, &right)?;
1206                let right_name = right.name().clone();
1207                left.append_owned(right).map_err(|e| {
1208                    e.context(format!("failed to vstack column '{right_name}'").into())
1209                })?;
1210                Ok(())
1211            })?;
1212        self.height += other.height;
1213        Ok(self)
1214    }
1215
1216    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1217    ///
1218    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1219    ///
1220    /// # Panics
1221    /// Panics if the schemas don't match.
1222    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1223        self.columns
1224            .iter_mut()
1225            .zip(other.columns.iter())
1226            .for_each(|(left, right)| {
1227                left.append(right)
1228                    .map_err(|e| {
1229                        e.context(format!("failed to vstack column '{}'", right.name()).into())
1230                    })
1231                    .expect("should not fail");
1232            });
1233        self.height += other.height;
1234    }
1235
1236    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1237    ///
1238    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1239    ///
1240    /// # Panics
1241    /// Panics if the schemas don't match.
1242    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1243        self.columns
1244            .iter_mut()
1245            .zip(other.columns)
1246            .for_each(|(left, right)| {
1247                left.append_owned(right).expect("should not fail");
1248            });
1249        self.height += other.height;
1250    }
1251
1252    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1253    ///
1254    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1255    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1256    ///
1257    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1258    /// and thus will yield faster queries.
1259    ///
1260    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1261    /// online operations where you add `n` rows and rerun a query.
1262    ///
1263    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1264    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1265    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1266    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1267        polars_ensure!(
1268            self.width() == other.width(),
1269            ShapeMismatch:
1270            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1271            self.width(), other.width(),
1272        );
1273
1274        self.columns
1275            .iter_mut()
1276            .zip(other.columns.iter())
1277            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1278                ensure_can_extend(&*left, right)?;
1279                left.extend(right).map_err(|e| {
1280                    e.context(format!("failed to extend column '{}'", right.name()).into())
1281                })?;
1282                Ok(())
1283            })?;
1284        self.height += other.height;
1285        self.clear_schema();
1286        Ok(())
1287    }
1288
1289    /// Remove a column by name and return the column removed.
1290    ///
1291    /// # Example
1292    ///
1293    /// ```rust
1294    /// # use polars_core::prelude::*;
1295    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1296    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1297    ///
1298    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1299    /// assert!(s1.is_err());
1300    ///
1301    /// let s2: Column = df.drop_in_place("Animal")?;
1302    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1303    /// # Ok::<(), PolarsError>(())
1304    /// ```
1305    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1306        let idx = self.check_name_to_idx(name)?;
1307        self.clear_schema();
1308        Ok(self.columns.remove(idx))
1309    }
1310
1311    /// Return a new [`DataFrame`] where all null values are dropped.
1312    ///
1313    /// # Example
1314    ///
1315    /// ```no_run
1316    /// # use polars_core::prelude::*;
1317    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1318    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1319    /// assert_eq!(df1.shape(), (3, 2));
1320    ///
1321    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1322    /// assert_eq!(df2.shape(), (1, 2));
1323    /// println!("{}", df2);
1324    /// # Ok::<(), PolarsError>(())
1325    /// ```
1326    ///
1327    /// Output:
1328    ///
1329    /// ```text
1330    /// shape: (1, 2)
1331    /// +---------+---------------------+
1332    /// | Country | Tax revenue (% GDP) |
1333    /// | ---     | ---                 |
1334    /// | str     | f64                 |
1335    /// +=========+=====================+
1336    /// | Malta   | 32.7                |
1337    /// +---------+---------------------+
1338    /// ```
1339    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1340    where
1341        for<'a> &'a S: Into<PlSmallStr>,
1342    {
1343        if let Some(v) = subset {
1344            let v = self.select_columns(v)?;
1345            self._drop_nulls_impl(v.as_slice())
1346        } else {
1347            self._drop_nulls_impl(self.columns.as_slice())
1348        }
1349    }
1350
1351    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1352        // fast path for no nulls in df
1353        if subset.iter().all(|s| !s.has_nulls()) {
1354            return Ok(self.clone());
1355        }
1356
1357        let mut iter = subset.iter();
1358
1359        let mask = iter
1360            .next()
1361            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1362        let mut mask = mask.is_not_null();
1363
1364        for c in iter {
1365            mask = mask & c.is_not_null();
1366        }
1367        self.filter(&mask)
1368    }
1369
1370    /// Drop a column by name.
1371    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1372    /// the current one in place.
1373    ///
1374    /// # Example
1375    ///
1376    /// ```rust
1377    /// # use polars_core::prelude::*;
1378    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1379    /// let df2: DataFrame = df1.drop("Ray type")?;
1380    ///
1381    /// assert!(df2.is_empty());
1382    /// # Ok::<(), PolarsError>(())
1383    /// ```
1384    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1385        let idx = self.check_name_to_idx(name)?;
1386        let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1387
1388        self.columns.iter().enumerate().for_each(|(i, s)| {
1389            if i != idx {
1390                new_cols.push(s.clone())
1391            }
1392        });
1393
1394        Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1395    }
1396
1397    /// Drop columns that are in `names`.
1398    pub fn drop_many<I, S>(&self, names: I) -> Self
1399    where
1400        I: IntoIterator<Item = S>,
1401        S: Into<PlSmallStr>,
1402    {
1403        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1404        self.drop_many_amortized(&names)
1405    }
1406
1407    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1408    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1409        if names.is_empty() {
1410            return self.clone();
1411        }
1412        let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1413        self.columns.iter().for_each(|s| {
1414            if !names.contains(s.name()) {
1415                new_cols.push(s.clone())
1416            }
1417        });
1418
1419        unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1420    }
1421
1422    /// Insert a new column at a given index without checking for duplicates.
1423    /// This can leave the [`DataFrame`] at an invalid state
1424    fn insert_column_no_name_check(
1425        &mut self,
1426        index: usize,
1427        column: Column,
1428    ) -> PolarsResult<&mut Self> {
1429        polars_ensure!(
1430            self.width() == 0 || column.len() == self.height(),
1431            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1432            column.len(), self.height(),
1433        );
1434
1435        if self.width() == 0 {
1436            self.height = column.len();
1437        }
1438
1439        self.columns.insert(index, column);
1440        self.clear_schema();
1441        Ok(self)
1442    }
1443
1444    /// Insert a new column at a given index.
1445    pub fn insert_column<S: IntoColumn>(
1446        &mut self,
1447        index: usize,
1448        column: S,
1449    ) -> PolarsResult<&mut Self> {
1450        let column = column.into_column();
1451        self.check_already_present(column.name().as_str())?;
1452        self.insert_column_no_name_check(index, column)
1453    }
1454
1455    fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1456        if let Some(idx) = self.get_column_index(column.name().as_str()) {
1457            self.replace_column(idx, column)?;
1458        } else {
1459            if self.width() == 0 {
1460                self.height = column.len();
1461            }
1462
1463            self.columns.push(column);
1464            self.clear_schema();
1465        }
1466        Ok(())
1467    }
1468
1469    /// Add a new column to this [`DataFrame`] or replace an existing one.
1470    pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1471        fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1472            let height = df.height();
1473            if column.len() == 1 && height > 1 {
1474                column = column.new_from_index(0, height);
1475            }
1476
1477            if column.len() == height || df.get_columns().is_empty() {
1478                df.add_column_by_search(column)?;
1479                Ok(df)
1480            }
1481            // special case for literals
1482            else if height == 0 && column.len() == 1 {
1483                let s = column.clear();
1484                df.add_column_by_search(s)?;
1485                Ok(df)
1486            } else {
1487                polars_bail!(
1488                    ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1489                    column.len(), height,
1490                );
1491            }
1492        }
1493        let column = column.into_column();
1494        inner(self, column)
1495    }
1496
1497    /// Adds a column to the [`DataFrame`] without doing any checks
1498    /// on length or duplicates.
1499    ///
1500    /// # Safety
1501    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1502    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1503        debug_assert!(self.width() == 0 || self.height() == column.len());
1504        debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1505
1506        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1507        // properly for `width` == 0.
1508        if self.width() == 0 {
1509            unsafe { self.set_height(column.len()) };
1510        }
1511        unsafe { self.get_columns_mut() }.push(column);
1512        self.clear_schema();
1513
1514        self
1515    }
1516
1517    // Note: Schema can be both input or output_schema
1518    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1519        let name = c.name();
1520        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1521            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1522                // Given schema is output_schema and we can push.
1523                if idx == self.columns.len() {
1524                    if self.width() == 0 {
1525                        self.height = c.len();
1526                    }
1527
1528                    self.columns.push(c);
1529                    self.clear_schema();
1530                }
1531                // Schema is incorrect fallback to search
1532                else {
1533                    debug_assert!(false);
1534                    self.add_column_by_search(c)?;
1535                }
1536            } else {
1537                self.replace_column(idx, c)?;
1538            }
1539        } else {
1540            if self.width() == 0 {
1541                self.height = c.len();
1542            }
1543
1544            self.columns.push(c);
1545            self.clear_schema();
1546        }
1547
1548        Ok(())
1549    }
1550
1551    // Note: Schema can be both input or output_schema
1552    pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1553        for (i, s) in series.into_iter().enumerate() {
1554            // we need to branch here
1555            // because users can add multiple columns with the same name
1556            if i == 0 || schema.get(s.name().as_str()).is_some() {
1557                self.with_column_and_schema(s.into_column(), schema)?;
1558            } else {
1559                self.with_column(s.clone().into_column())?;
1560            }
1561        }
1562        Ok(())
1563    }
1564
1565    pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1566        for (i, s) in columns.into_iter().enumerate() {
1567            // we need to branch here
1568            // because users can add multiple columns with the same name
1569            if i == 0 || schema.get(s.name().as_str()).is_some() {
1570                self.with_column_and_schema(s, schema)?;
1571            } else {
1572                self.with_column(s.clone())?;
1573            }
1574        }
1575
1576        Ok(())
1577    }
1578
1579    /// Add a new column to this [`DataFrame`] or replace an existing one.
1580    /// Uses an existing schema to amortize lookups.
1581    /// If the schema is incorrect, we will fallback to linear search.
1582    ///
1583    /// Note: Schema can be both input or output_schema
1584    pub fn with_column_and_schema<C: IntoColumn>(
1585        &mut self,
1586        column: C,
1587        schema: &Schema,
1588    ) -> PolarsResult<&mut Self> {
1589        let mut column = column.into_column();
1590
1591        let height = self.height();
1592        if column.len() == 1 && height > 1 {
1593            column = column.new_from_index(0, height);
1594        }
1595
1596        if column.len() == height || self.columns.is_empty() {
1597            self.add_column_by_schema(column, schema)?;
1598            Ok(self)
1599        }
1600        // special case for literals
1601        else if height == 0 && column.len() == 1 {
1602            let s = column.clear();
1603            self.add_column_by_schema(s, schema)?;
1604            Ok(self)
1605        } else {
1606            polars_bail!(
1607                ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1608                column.len(), height,
1609            );
1610        }
1611    }
1612
1613    /// Get a row in the [`DataFrame`]. Beware this is slow.
1614    ///
1615    /// # Example
1616    ///
1617    /// ```
1618    /// # use polars_core::prelude::*;
1619    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1620    ///     df.get(idx)
1621    /// }
1622    /// ```
1623    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1624        match self.columns.first() {
1625            Some(s) => {
1626                if s.len() <= idx {
1627                    return None;
1628                }
1629            },
1630            None => return None,
1631        }
1632        // SAFETY: we just checked bounds
1633        unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1634    }
1635
1636    /// Select a [`Series`] by index.
1637    ///
1638    /// # Example
1639    ///
1640    /// ```rust
1641    /// # use polars_core::prelude::*;
1642    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1643    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1644    ///
1645    /// let s1: Option<&Column> = df.select_at_idx(0);
1646    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1647    ///
1648    /// assert_eq!(s1, Some(&s2));
1649    /// # Ok::<(), PolarsError>(())
1650    /// ```
1651    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1652        self.columns.get(idx)
1653    }
1654
1655    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1656    ///
1657    /// # Examples
1658    ///
1659    /// ```rust
1660    /// # use polars_core::prelude::*;
1661    /// let df = df! {
1662    ///     "0" => [0, 0, 0],
1663    ///     "1" => [1, 1, 1],
1664    ///     "2" => [2, 2, 2]
1665    /// }?;
1666    ///
1667    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1668    /// assert!(df.equals(&df.select_by_range(..)?));
1669    /// # Ok::<(), PolarsError>(())
1670    /// ```
1671    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1672    where
1673        R: ops::RangeBounds<usize>,
1674    {
1675        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1676        // because it is the nightly feature. We should change here if this function were stable.
1677        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1678        where
1679            R: ops::RangeBounds<usize>,
1680        {
1681            let len = bounds.end;
1682
1683            let start: ops::Bound<&usize> = range.start_bound();
1684            let start = match start {
1685                ops::Bound::Included(&start) => start,
1686                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1687                    panic!("attempted to index slice from after maximum usize");
1688                }),
1689                ops::Bound::Unbounded => 0,
1690            };
1691
1692            let end: ops::Bound<&usize> = range.end_bound();
1693            let end = match end {
1694                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1695                    panic!("attempted to index slice up to maximum usize");
1696                }),
1697                ops::Bound::Excluded(&end) => end,
1698                ops::Bound::Unbounded => len,
1699            };
1700
1701            if start > end {
1702                panic!("slice index starts at {start} but ends at {end}");
1703            }
1704            if end > len {
1705                panic!("range end index {end} out of range for slice of length {len}",);
1706            }
1707
1708            ops::Range { start, end }
1709        }
1710
1711        let colnames = self.get_column_names_owned();
1712        let range = get_range(range, ..colnames.len());
1713
1714        self._select_impl(&colnames[range])
1715    }
1716
1717    /// Get column index of a [`Series`] by name.
1718    /// # Example
1719    ///
1720    /// ```rust
1721    /// # use polars_core::prelude::*;
1722    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1723    ///                         "Health" => [100, 200, 500],
1724    ///                         "Mana" => [250, 100, 0],
1725    ///                         "Strength" => [30, 150, 300])?;
1726    ///
1727    /// assert_eq!(df.get_column_index("Name"), Some(0));
1728    /// assert_eq!(df.get_column_index("Health"), Some(1));
1729    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1730    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1731    /// assert_eq!(df.get_column_index("Haste"), None);
1732    /// # Ok::<(), PolarsError>(())
1733    /// ```
1734    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1735        let schema = self.schema();
1736        if let Some(idx) = schema.index_of(name) {
1737            if self
1738                .get_columns()
1739                .get(idx)
1740                .is_some_and(|c| c.name() == name)
1741            {
1742                return Some(idx);
1743            }
1744        }
1745
1746        self.columns.iter().position(|s| s.name().as_str() == name)
1747    }
1748
1749    /// Get column index of a [`Series`] by name.
1750    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1751        self.get_column_index(name)
1752            .ok_or_else(|| polars_err!(col_not_found = name))
1753    }
1754
1755    /// Select a single column by name.
1756    ///
1757    /// # Example
1758    ///
1759    /// ```rust
1760    /// # use polars_core::prelude::*;
1761    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1762    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1763    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1764    ///
1765    /// assert_eq!(df.column("Password")?, &s1);
1766    /// # Ok::<(), PolarsError>(())
1767    /// ```
1768    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1769        let idx = self.try_get_column_index(name)?;
1770        Ok(self.select_at_idx(idx).unwrap())
1771    }
1772
1773    /// Selected multiple columns by name.
1774    ///
1775    /// # Example
1776    ///
1777    /// ```rust
1778    /// # use polars_core::prelude::*;
1779    /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1780    ///                         "Max weight (kg)" => [16.0, 35.89])?;
1781    /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1782    ///
1783    /// assert_eq!(&df[0], sv[0]);
1784    /// assert_eq!(&df[1], sv[1]);
1785    /// # Ok::<(), PolarsError>(())
1786    /// ```
1787    pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1788    where
1789        I: IntoIterator<Item = S>,
1790        S: AsRef<str>,
1791    {
1792        names
1793            .into_iter()
1794            .map(|name| self.column(name.as_ref()))
1795            .collect()
1796    }
1797
1798    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1799    ///
1800    /// # Examples
1801    ///
1802    /// ```
1803    /// # use polars_core::prelude::*;
1804    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1805    ///     df.select(["foo", "bar"])
1806    /// }
1807    /// ```
1808    pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1809    where
1810        I: IntoIterator<Item = S>,
1811        S: Into<PlSmallStr>,
1812    {
1813        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1814        self._select_impl(cols.as_slice())
1815    }
1816
1817    pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1818        ensure_names_unique(cols, |s| s.as_str())?;
1819        self._select_impl_unchecked(cols)
1820    }
1821
1822    pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1823        let selected = self.select_columns_impl(cols)?;
1824        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1825    }
1826
1827    /// Select with a known schema. The schema names must match the column names of this DataFrame.
1828    pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1829    where
1830        I: IntoIterator<Item = S>,
1831        S: Into<PlSmallStr>,
1832    {
1833        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1834        self._select_with_schema_impl(&cols, schema, true)
1835    }
1836
1837    /// Select with a known schema without checking for duplicates in `selection`.
1838    /// The schema names must match the column names of this DataFrame.
1839    pub fn select_with_schema_unchecked<I, S>(
1840        &self,
1841        selection: I,
1842        schema: &Schema,
1843    ) -> PolarsResult<Self>
1844    where
1845        I: IntoIterator<Item = S>,
1846        S: Into<PlSmallStr>,
1847    {
1848        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1849        self._select_with_schema_impl(&cols, schema, false)
1850    }
1851
1852    /// * The schema names must match the column names of this DataFrame.
1853    pub fn _select_with_schema_impl(
1854        &self,
1855        cols: &[PlSmallStr],
1856        schema: &Schema,
1857        check_duplicates: bool,
1858    ) -> PolarsResult<Self> {
1859        if check_duplicates {
1860            ensure_names_unique(cols, |s| s.as_str())?;
1861        }
1862
1863        let selected = self.select_columns_impl_with_schema(cols, schema)?;
1864        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1865    }
1866
1867    /// A non generic implementation to reduce compiler bloat.
1868    fn select_columns_impl_with_schema(
1869        &self,
1870        cols: &[PlSmallStr],
1871        schema: &Schema,
1872    ) -> PolarsResult<Vec<Column>> {
1873        if cfg!(debug_assertions) {
1874            ensure_matching_schema_names(schema, self.schema())?;
1875        }
1876
1877        cols.iter()
1878            .map(|name| {
1879                let index = schema.try_get_full(name.as_str())?.0;
1880                Ok(self.columns[index].clone())
1881            })
1882            .collect()
1883    }
1884
1885    pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1886    where
1887        I: IntoIterator<Item = S>,
1888        S: Into<PlSmallStr>,
1889    {
1890        let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1891        self.select_physical_impl(&cols)
1892    }
1893
1894    fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1895        ensure_names_unique(cols, |s| s.as_str())?;
1896        let selected = self.select_columns_physical_impl(cols)?;
1897        Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1898    }
1899
1900    pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1901        let from = self.schema();
1902        let columns = to
1903            .iter_names()
1904            .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1905            .collect::<PolarsResult<Vec<_>>>()?;
1906        let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1907        df.cached_schema = to.into();
1908        Ok(df)
1909    }
1910
1911    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1912    ///
1913    /// # Example
1914    ///
1915    /// ```rust
1916    /// # use polars_core::prelude::*;
1917    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1918    ///                         "Carbon" => [1, 2, 3],
1919    ///                         "Hydrogen" => [4, 6, 8])?;
1920    /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1921    ///
1922    /// assert_eq!(df["Carbon"], sv[0]);
1923    /// assert_eq!(df["Hydrogen"], sv[1]);
1924    /// # Ok::<(), PolarsError>(())
1925    /// ```
1926    pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1927        let cols = selection.into_vec();
1928        self.select_columns_impl(&cols)
1929    }
1930
1931    fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1932        self.columns
1933            .iter()
1934            .enumerate()
1935            .map(|(i, s)| (s.name().as_str(), i))
1936            .collect()
1937    }
1938
1939    /// A non generic implementation to reduce compiler bloat.
1940    fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1941        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1942            let name_to_idx = self._names_to_idx_map();
1943            cols.iter()
1944                .map(|name| {
1945                    let idx = *name_to_idx
1946                        .get(name.as_str())
1947                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1948                    Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1949                })
1950                .collect::<PolarsResult<Vec<_>>>()?
1951        } else {
1952            cols.iter()
1953                .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1954                .collect::<PolarsResult<Vec<_>>>()?
1955        };
1956
1957        Ok(selected)
1958    }
1959
1960    /// A non generic implementation to reduce compiler bloat.
1961    fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1962        let selected = if cols.len() > 1 && self.columns.len() > 10 {
1963            // we hash, because there are user that having millions of columns.
1964            // # https://github.com/pola-rs/polars/issues/1023
1965            let name_to_idx = self._names_to_idx_map();
1966
1967            cols.iter()
1968                .map(|name| {
1969                    let idx = *name_to_idx
1970                        .get(name.as_str())
1971                        .ok_or_else(|| polars_err!(col_not_found = name))?;
1972                    Ok(self.select_at_idx(idx).unwrap().clone())
1973                })
1974                .collect::<PolarsResult<Vec<_>>>()?
1975        } else {
1976            cols.iter()
1977                .map(|c| self.column(c.as_str()).cloned())
1978                .collect::<PolarsResult<Vec<_>>>()?
1979        };
1980
1981        Ok(selected)
1982    }
1983
1984    fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1985        // If there is a filtered column just see how many columns there are left.
1986        if let Some(fst) = filtered.first() {
1987            return fst.len();
1988        }
1989
1990        // Otherwise, count the number of values that would be filtered and return that height.
1991        let num_trues = mask.num_trues();
1992        if mask.len() == self.height() {
1993            num_trues
1994        } else {
1995            // This is for broadcasting masks
1996            debug_assert!(num_trues == 0 || num_trues == 1);
1997            self.height() * num_trues
1998        }
1999    }
2000
2001    /// Take the [`DataFrame`] rows by a boolean mask.
2002    ///
2003    /// # Example
2004    ///
2005    /// ```
2006    /// # use polars_core::prelude::*;
2007    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2008    ///     let mask = df.column("sepal_width")?.is_not_null();
2009    ///     df.filter(&mask)
2010    /// }
2011    /// ```
2012    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2013        let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2014        let height = self.filter_height(&new_col, mask);
2015
2016        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2017    }
2018
2019    /// Same as `filter` but does not parallelize.
2020    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2021        let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2022        let height = self.filter_height(&new_col, mask);
2023
2024        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2025    }
2026
2027    /// Take [`DataFrame`] rows by index values.
2028    ///
2029    /// # Example
2030    ///
2031    /// ```
2032    /// # use polars_core::prelude::*;
2033    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2034    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2035    ///     df.take(&idx)
2036    /// }
2037    /// ```
2038    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2039        let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2040
2041        Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2042    }
2043
2044    /// # Safety
2045    /// The indices must be in-bounds.
2046    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2047        self.take_unchecked_impl(idx, true)
2048    }
2049
    /// Gather rows by index, optionally using the thread pool.
    ///
    /// When `allow_threads` is set and the pool has more than one thread, the
    /// work runs inside `POOL`. If the pool additionally has more threads than
    /// the frame has columns, the index array may be cut into `stride`-sized
    /// pieces that are gathered in parallel per column and appended back
    /// together. Otherwise each column is gathered with the full `idx`.
    ///
    /// The returned frame has `idx.len()` rows.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // More threads than columns: per-column parallelism alone would
                // leave threads idle, so consider splitting the indices as well.
                if POOL.current_num_threads() > self.width() {
                    // Piece size: an even share of the indices per thread, but at least 256.
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    // NOTE(review): the split decision is based on `self.len()` while the
                    // stride is derived from `idx.len()` — confirm this is intended.
                    if self.len() / stride >= 2 {
                        self._apply_columns_par(&|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            // Gather each `stride`-sized slice of `idx` separately and
                            // append the partial results, starting from an empty column
                            // with the same name and dtype.
                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        // Too little work to split: one gather per column.
                        self._apply_columns_par(&|c| c.take_unchecked(idx))
                    }
                } else {
                    // At least as many columns as threads: one gather per column.
                    self._apply_columns_par(&|c| c.take_unchecked(idx))
                }
            })
        } else {
            // Threads disallowed or unavailable: use the non-`_par` column application.
            self._apply_columns(&|s| s.take_unchecked(idx))
        };
        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
    }
2090
2091    /// # Safety
2092    /// The indices must be in-bounds.
2093    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2094        self.take_slice_unchecked_impl(idx, true)
2095    }
2096
    /// Gather rows by a slice of indices, optionally using the thread pool.
    ///
    /// When `allow_threads` is set and the pool has more than one thread, the
    /// work runs inside `POOL`. If the pool additionally has more threads than
    /// the frame has columns, the index slice may be cut into `stride`-sized
    /// pieces that are gathered in parallel per column and appended back
    /// together. Otherwise each column is gathered with the full `idx`.
    ///
    /// The returned frame has `idx.len()` rows.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // More threads than columns: per-column parallelism alone would
                // leave threads idle, so consider splitting the indices as well.
                if POOL.current_num_threads() > self.width() {
                    // Piece size: an even share of the indices per thread, but at least 256.
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    // NOTE(review): the split decision is based on `self.len()` while the
                    // stride is derived from `idx.len()` — confirm this is intended.
                    if self.len() / stride >= 2 {
                        self._apply_columns_par(&|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| {
                                    // Piece `i`: skip the first `i * stride` indices, then
                                    // keep at most `stride` of what remains (the last
                                    // piece may be shorter).
                                    let idx = &idx[i * stride..];
                                    let idx = &idx[..idx.len().min(stride)];
                                    c.take_slice_unchecked(idx)
                                })
                                // Append the partial results, starting from an empty
                                // column with the same name and dtype.
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        // Too little work to split: one gather per column.
                        self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
                    }
                } else {
                    // At least as many columns as threads: one gather per column.
                    self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
                }
            })
        } else {
            // Threads disallowed or unavailable: use the non-`_par` column application.
            self._apply_columns(&|s| s.take_slice_unchecked(idx))
        };
        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
    }
2141
2142    /// Rename a column in the [`DataFrame`].
2143    ///
2144    /// Should not be called in a loop as that can lead to quadratic behavior.
2145    ///
2146    /// # Example
2147    ///
2148    /// ```
2149    /// # use polars_core::prelude::*;
2150    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2151    ///     let original_name = "foo";
2152    ///     let new_name = "bar";
2153    ///     df.rename(original_name, new_name.into())
2154    /// }
2155    /// ```
2156    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2157        if column == name.as_str() {
2158            return Ok(self);
2159        }
2160        polars_ensure!(
2161            !self.schema().contains(&name),
2162            Duplicate: "column rename attempted with already existing name \"{name}\""
2163        );
2164
2165        self.get_column_index(column)
2166            .and_then(|idx| self.columns.get_mut(idx))
2167            .ok_or_else(|| polars_err!(col_not_found = column))
2168            .map(|c| c.rename(name))?;
2169        self.clear_schema();
2170
2171        Ok(self)
2172    }
2173
    /// Rename several columns in one call.
    ///
    /// `renames` yields `(from, to)` pairs, applied in iteration order. A pair
    /// whose two names are equal is skipped.
    ///
    /// # Errors
    ///
    /// Fails with a duplicate error if `to` is already present in the working
    /// schema, and with a column-not-found error if `from` is not.
    ///
    /// NOTE(review): an error returns early, so renames applied by earlier
    /// pairs stay in place and the `clear_schema` call has already run.
    pub fn rename_many<'a>(
        &mut self,
        renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
    ) -> PolarsResult<&mut Self> {
        // Work on a private copy of the schema, and clear the frame's own
        // schema up front.
        let mut schema = self.schema().as_ref().clone();
        self.clear_schema();

        for (from, to) in renames {
            // Renaming a column to itself changes nothing.
            if from == to.as_str() {
                continue;
            }

            polars_ensure!(
                !schema.contains(&to),
                Duplicate: "column rename attempted with already existing name \"{to}\""
            );

            match schema.get_full(from) {
                None => polars_bail!(col_not_found = from),
                Some((idx, _, _)) => {
                    // Update the name in the schema copy and on the column at
                    // the same position.
                    let (n, _) = schema.get_at_index_mut(idx).unwrap();
                    *n = to.clone();
                    self.columns.get_mut(idx).unwrap().rename(to);
                },
            }
        }

        // Install the updated copy as the cached schema.
        self.cached_schema = OnceLock::from(Arc::new(schema));
        Ok(self)
    }
2204
2205    /// Sort [`DataFrame`] in place.
2206    ///
2207    /// See [`DataFrame::sort`] for more instruction.
2208    pub fn sort_in_place(
2209        &mut self,
2210        by: impl IntoVec<PlSmallStr>,
2211        sort_options: SortMultipleOptions,
2212    ) -> PolarsResult<&mut Self> {
2213        let by_column = self.select_columns(by)?;
2214        self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2215        Ok(self)
2216    }
2217
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// * `by_column` - the already-evaluated sort keys; the first entry is the primary key.
    /// * `sort_options` - per-key `descending` / `nulls_last` flags plus global options.
    /// * `slice` - optional `(offset, len)` applied to the sorted result.
    ///
    /// Several shortcuts are tried before a full sort: no sort keys, an empty
    /// frame, a slice from offset 0 (dispatched to `bottom_k_impl`), a single
    /// key whose sorted flag already matches, and a frame that consists only
    /// of the sort key.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // Note that the by_column argument also contains evaluated expressions from
        // polars-lazy that may not even be present in this dataframe. Therefore,
        // when we try to set the first column as sorted, we ignore the error, as
        // such expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };
        // An empty frame only needs the sorted flag set on a clone.
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A slice that starts at offset 0 and is shorter than the frame is
        // dispatched to `bottom_k_impl` instead of the full sort below.
        if let Some((0, k)) = slice {
            if k < self.len() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early
        // We can do so when there is only one column to sort by, for multiple columns
        // it will be complicated to do so
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If null count is 0 then nulls_last doesn't matter
            // Safe to get value at last position since the dataframe is not empty (taken care above)
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
        let allow_threads = sort_options.multithreaded;

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.as_single_chunk_par();
        // `take` holds the row indices in sorted order.
        let mut take = match (by_column.len(), has_nested) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // fast path for a frame with a single series
                // no need to compute the sort indices and then take by these indices
                // simply sort and return as frame
                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            // Multiple keys, or a nested key: use the multi-column `arg_sort`.
            _ => arg_sort(&by_column, sort_options)?,
        };

        // Apply the requested slice to the indices before gathering rows.
        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
        set_sorted(&mut df);
        Ok(df)
    }
2341
2342    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2343    ///
2344    /// This dataframe does not necessarily have a specified schema and may be changed at any
2345    /// point. It is primarily used for debugging.
2346    pub fn _to_metadata(&self) -> DataFrame {
2347        let num_columns = self.columns.len();
2348
2349        let mut column_names =
2350            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2351        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2352        let mut sorted_asc_ca =
2353            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2354        let mut sorted_dsc_ca =
2355            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2356        let mut fast_explode_list_ca =
2357            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2358        let mut materialized_at_ca =
2359            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2360
2361        for col in &self.columns {
2362            let flags = col.get_flags();
2363
2364            let (repr, materialized_at) = match col {
2365                Column::Series(s) => ("series", s.materialized_at()),
2366                Column::Scalar(_) => ("scalar", None),
2367            };
2368            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2369            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2370            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2371
2372            column_names.append_value(col.name().clone());
2373            repr_ca.append_value(repr);
2374            sorted_asc_ca.append_value(sorted_asc);
2375            sorted_dsc_ca.append_value(sorted_dsc);
2376            fast_explode_list_ca.append_value(fast_explode_list);
2377            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2378        }
2379
2380        unsafe {
2381            DataFrame::new_no_checks(
2382                self.width(),
2383                vec![
2384                    column_names.finish().into_column(),
2385                    repr_ca.finish().into_column(),
2386                    sorted_asc_ca.finish().into_column(),
2387                    sorted_dsc_ca.finish().into_column(),
2388                    fast_explode_list_ca.finish().into_column(),
2389                    materialized_at_ca.finish().into_column(),
2390                ],
2391            )
2392        }
2393    }
2394
2395    /// Return a sorted clone of this [`DataFrame`].
2396    ///
2397    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2398    /// # Example
2399    ///
2400    /// Sort by a single column with default options:
2401    /// ```
2402    /// # use polars_core::prelude::*;
2403    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2404    ///     df.sort(["sepal_width"], Default::default())
2405    /// }
2406    /// ```
2407    /// Sort by a single column with specific order:
2408    /// ```
2409    /// # use polars_core::prelude::*;
2410    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2411    ///     df.sort(
2412    ///         ["sepal_width"],
2413    ///         SortMultipleOptions::new()
2414    ///             .with_order_descending(descending)
2415    ///     )
2416    /// }
2417    /// ```
2418    /// Sort by multiple columns with specifying order for each column:
2419    /// ```
2420    /// # use polars_core::prelude::*;
2421    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2422    ///     df.sort(
2423    ///         ["sepal_width", "sepal_length"],
2424    ///         SortMultipleOptions::new()
2425    ///             .with_order_descending_multi([false, true])
2426    ///     )
2427    /// }
2428    /// ```
2429    /// See [`SortMultipleOptions`] for more options.
2430    ///
2431    /// Also see [`DataFrame::sort_in_place`].
2432    pub fn sort(
2433        &self,
2434        by: impl IntoVec<PlSmallStr>,
2435        sort_options: SortMultipleOptions,
2436    ) -> PolarsResult<Self> {
2437        let mut df = self.clone();
2438        df.sort_in_place(by, sort_options)?;
2439        Ok(df)
2440    }
2441
2442    /// Replace a column with a [`Series`].
2443    ///
2444    /// # Example
2445    ///
2446    /// ```rust
2447    /// # use polars_core::prelude::*;
2448    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2449    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
2450    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2451    ///
2452    /// assert!(df.replace("Nation", s.clone()).is_err());
2453    /// assert!(df.replace("Country", s).is_ok());
2454    /// # Ok::<(), PolarsError>(())
2455    /// ```
2456    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2457        self.apply(column, |_| new_col.into_series())
2458    }
2459
2460    /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2461    /// is that now the value of `column: &str` determines the name of the column and not the name
2462    /// of the `Series` passed to this method.
2463    pub fn replace_or_add<S: IntoSeries>(
2464        &mut self,
2465        column: PlSmallStr,
2466        new_col: S,
2467    ) -> PolarsResult<&mut Self> {
2468        let mut new_col = new_col.into_series();
2469        new_col.rename(column);
2470        self.with_column(new_col)
2471    }
2472
2473    /// Replace column at index `idx` with a [`Series`].
2474    ///
2475    /// # Example
2476    ///
2477    /// ```ignored
2478    /// # use polars_core::prelude::*;
2479    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2480    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2481    /// let mut df = DataFrame::new(vec![s0, s1])?;
2482    ///
2483    /// // Add 32 to get lowercase ascii values
2484    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2485    /// # Ok::<(), PolarsError>(())
2486    /// ```
2487    pub fn replace_column<C: IntoColumn>(
2488        &mut self,
2489        index: usize,
2490        new_column: C,
2491    ) -> PolarsResult<&mut Self> {
2492        polars_ensure!(
2493            index < self.width(),
2494            ShapeMismatch:
2495            "unable to replace at index {}, the DataFrame has only {} columns",
2496            index, self.width(),
2497        );
2498        let mut new_column = new_column.into_column();
2499        polars_ensure!(
2500            new_column.len() == self.height(),
2501            ShapeMismatch:
2502            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2503            new_column.len(), self.height(),
2504        );
2505        let old_col = &mut self.columns[index];
2506        mem::swap(old_col, &mut new_column);
2507        self.clear_schema();
2508        Ok(self)
2509    }
2510
2511    /// Apply a closure to a column. This is the recommended way to do in place modification.
2512    ///
2513    /// # Example
2514    ///
2515    /// ```rust
2516    /// # use polars_core::prelude::*;
2517    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2518    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2519    /// let mut df = DataFrame::new(vec![s0, s1])?;
2520    ///
2521    /// fn str_to_len(str_val: &Column) -> Column {
2522    ///     str_val.str()
2523    ///         .unwrap()
2524    ///         .into_iter()
2525    ///         .map(|opt_name: Option<&str>| {
2526    ///             opt_name.map(|name: &str| name.len() as u32)
2527    ///          })
2528    ///         .collect::<UInt32Chunked>()
2529    ///         .into_column()
2530    /// }
2531    ///
2532    /// // Replace the names column by the length of the names.
2533    /// df.apply("names", str_to_len);
2534    /// # Ok::<(), PolarsError>(())
2535    /// ```
2536    /// Results in:
2537    ///
2538    /// ```text
2539    /// +--------+-------+
2540    /// | foo    |       |
2541    /// | ---    | names |
2542    /// | str    | u32   |
2543    /// +========+=======+
2544    /// | "ham"  | 4     |
2545    /// +--------+-------+
2546    /// | "spam" | 6     |
2547    /// +--------+-------+
2548    /// | "egg"  | 3     |
2549    /// +--------+-------+
2550    /// ```
2551    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2552    where
2553        F: FnOnce(&Column) -> C,
2554        C: IntoColumn,
2555    {
2556        let idx = self.check_name_to_idx(name)?;
2557        self.apply_at_idx(idx, f)?;
2558        Ok(self)
2559    }
2560
2561    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2562    /// modification.
2563    ///
2564    /// # Example
2565    ///
2566    /// ```rust
2567    /// # use polars_core::prelude::*;
2568    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2569    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2570    /// let mut df = DataFrame::new(vec![s0, s1])?;
2571    ///
2572    /// // Add 32 to get lowercase ascii values
2573    /// df.apply_at_idx(1, |s| s + 32);
2574    /// # Ok::<(), PolarsError>(())
2575    /// ```
2576    /// Results in:
2577    ///
2578    /// ```text
2579    /// +--------+-------+
2580    /// | foo    | ascii |
2581    /// | ---    | ---   |
2582    /// | str    | i32   |
2583    /// +========+=======+
2584    /// | "ham"  | 102   |
2585    /// +--------+-------+
2586    /// | "spam" | 111   |
2587    /// +--------+-------+
2588    /// | "egg"  | 111   |
2589    /// +--------+-------+
2590    /// ```
2591    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2592    where
2593        F: FnOnce(&Column) -> C,
2594        C: IntoColumn,
2595    {
2596        let df_height = self.height();
2597        let width = self.width();
2598        let col = self.columns.get_mut(idx).ok_or_else(|| {
2599            polars_err!(
2600                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2601                idx, width
2602            )
2603        })?;
2604        let name = col.name().clone();
2605        let dtype_before = col.dtype().clone();
2606        let new_col = f(col).into_column();
2607        match new_col.len() {
2608            1 => {
2609                let new_col = new_col.new_from_index(0, df_height);
2610                let _ = mem::replace(col, new_col);
2611            },
2612            len if (len == df_height) => {
2613                let _ = mem::replace(col, new_col);
2614            },
2615            len => polars_bail!(
2616                ShapeMismatch:
2617                "resulting Series has length {} while the DataFrame has height {}",
2618                len, df_height
2619            ),
2620        }
2621
2622        // make sure the name remains the same after applying the closure
2623        unsafe {
2624            let col = self.columns.get_unchecked_mut(idx);
2625            col.rename(name);
2626
2627            if col.dtype() != &dtype_before {
2628                self.clear_schema();
2629            }
2630        }
2631        Ok(self)
2632    }
2633
2634    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2635    /// modification.
2636    ///
2637    /// # Example
2638    ///
2639    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
2640    ///
2641    /// ```rust
2642    /// # use polars_core::prelude::*;
2643    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2644    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2645    /// let mut df = DataFrame::new(vec![s0, s1])?;
2646    ///
2647    /// let idx = vec![0, 1, 4];
2648    ///
2649    /// df.try_apply("foo", |c| {
2650    ///     c.str()?
2651    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2652    /// });
2653    /// # Ok::<(), PolarsError>(())
2654    /// ```
2655    /// Results in:
2656    ///
2657    /// ```text
2658    /// +---------------------+--------+
2659    /// | foo                 | values |
2660    /// | ---                 | ---    |
2661    /// | str                 | i32    |
2662    /// +=====================+========+
2663    /// | "ham-is-modified"   | 1      |
2664    /// +---------------------+--------+
2665    /// | "spam-is-modified"  | 2      |
2666    /// +---------------------+--------+
2667    /// | "egg"               | 3      |
2668    /// +---------------------+--------+
2669    /// | "bacon"             | 4      |
2670    /// +---------------------+--------+
2671    /// | "quack-is-modified" | 5      |
2672    /// +---------------------+--------+
2673    /// ```
2674    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2675    where
2676        F: FnOnce(&Column) -> PolarsResult<C>,
2677        C: IntoColumn,
2678    {
2679        let width = self.width();
2680        let col = self.columns.get_mut(idx).ok_or_else(|| {
2681            polars_err!(
2682                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2683                idx, width
2684            )
2685        })?;
2686        let name = col.name().clone();
2687
2688        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2689
2690        // make sure the name remains the same after applying the closure
2691        unsafe {
2692            let col = self.columns.get_unchecked_mut(idx);
2693            col.rename(name);
2694        }
2695        Ok(self)
2696    }
2697
2698    /// Apply a closure that may fail to a column. This is the recommended way to do in place
2699    /// modification.
2700    ///
2701    /// # Example
2702    ///
2703    /// This is the idiomatic way to replace some values a column of a `DataFrame` given a boolean mask.
2704    ///
2705    /// ```rust
2706    /// # use polars_core::prelude::*;
2707    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2708    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2709    /// let mut df = DataFrame::new(vec![s0, s1])?;
2710    ///
2711    /// // create a mask
2712    /// let values = df.column("values")?.as_materialized_series();
2713    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2714    ///
2715    /// df.try_apply("foo", |c| {
2716    ///     c.str()?
2717    ///     .set(&mask, Some("not_within_bounds"))
2718    /// });
2719    /// # Ok::<(), PolarsError>(())
2720    /// ```
2721    /// Results in:
2722    ///
2723    /// ```text
2724    /// +---------------------+--------+
2725    /// | foo                 | values |
2726    /// | ---                 | ---    |
2727    /// | str                 | i32    |
2728    /// +=====================+========+
2729    /// | "not_within_bounds" | 1      |
2730    /// +---------------------+--------+
2731    /// | "spam"              | 2      |
2732    /// +---------------------+--------+
2733    /// | "egg"               | 3      |
2734    /// +---------------------+--------+
2735    /// | "bacon"             | 4      |
2736    /// +---------------------+--------+
2737    /// | "not_within_bounds" | 5      |
2738    /// +---------------------+--------+
2739    /// ```
2740    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2741    where
2742        F: FnOnce(&Series) -> PolarsResult<C>,
2743        C: IntoColumn,
2744    {
2745        let idx = self.try_get_column_index(column)?;
2746        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2747    }
2748
2749    /// Slice the [`DataFrame`] along the rows.
2750    ///
2751    /// # Example
2752    ///
2753    /// ```rust
2754    /// # use polars_core::prelude::*;
2755    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2756    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
2757    /// let sl: DataFrame = df.slice(2, 3);
2758    ///
2759    /// assert_eq!(sl.shape(), (3, 2));
2760    /// println!("{}", sl);
2761    /// # Ok::<(), PolarsError>(())
2762    /// ```
2763    /// Output:
2764    /// ```text
2765    /// shape: (3, 2)
2766    /// +-------+-------+
2767    /// | Fruit | Color |
2768    /// | ---   | ---   |
2769    /// | str   | str   |
2770    /// +=======+=======+
2771    /// | Grape | White |
2772    /// +-------+-------+
2773    /// | Fig   | White |
2774    /// +-------+-------+
2775    /// | Fig   | Red   |
2776    /// +-------+-------+
2777    /// ```
2778    #[must_use]
2779    pub fn slice(&self, offset: i64, length: usize) -> Self {
2780        if offset == 0 && length == self.height() {
2781            return self.clone();
2782        }
2783        if length == 0 {
2784            return self.clear();
2785        }
2786        let col = self
2787            .columns
2788            .iter()
2789            .map(|s| s.slice(offset, length))
2790            .collect::<Vec<_>>();
2791
2792        let height = if let Some(fst) = col.first() {
2793            fst.len()
2794        } else {
2795            let (_, length) = slice_offsets(offset, length, self.height());
2796            length
2797        };
2798
2799        unsafe { DataFrame::new_no_checks(height, col) }
2800    }
2801
2802    /// Split [`DataFrame`] at the given `offset`.
2803    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2804        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2805
2806        let (idx, _) = slice_offsets(offset, 0, self.height());
2807
2808        let a = unsafe { DataFrame::new_no_checks(idx, a) };
2809        let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2810        (a, b)
2811    }
2812
2813    #[must_use]
2814    pub fn clear(&self) -> Self {
2815        let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2816        unsafe { DataFrame::new_no_checks(0, col) }
2817    }
2818
2819    #[must_use]
2820    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2821        if offset == 0 && length == self.height() {
2822            return self.clone();
2823        }
2824        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2825        unsafe { DataFrame::new_no_checks(length, columns) }
2826    }
2827
2828    #[must_use]
2829    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2830        if offset == 0 && length == self.height() {
2831            return self.clone();
2832        }
2833        // @scalar-opt
2834        let columns = self._apply_columns(&|s| {
2835            let mut out = s.slice(offset, length);
2836            out.shrink_to_fit();
2837            out
2838        });
2839        unsafe { DataFrame::new_no_checks(length, columns) }
2840    }
2841
2842    /// Get the head of the [`DataFrame`].
2843    ///
2844    /// # Example
2845    ///
2846    /// ```rust
2847    /// # use polars_core::prelude::*;
2848    /// let countries: DataFrame =
2849    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2850    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2851    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2852    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2853    /// assert_eq!(countries.shape(), (5, 4));
2854    ///
2855    /// println!("{}", countries.head(Some(3)));
2856    /// # Ok::<(), PolarsError>(())
2857    /// ```
2858    ///
2859    /// Output:
2860    ///
2861    /// ```text
2862    /// shape: (3, 4)
2863    /// +--------------------+---------------+---------------+------------+
2864    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2865    /// | ---                | ---           | ---           | ---        |
2866    /// | i32                | str           | str           | str        |
2867    /// +====================+===============+===============+============+
2868    /// | 1                  | North America | United States | Washington |
2869    /// +--------------------+---------------+---------------+------------+
2870    /// | 2                  | Asia          | China         | Beijing    |
2871    /// +--------------------+---------------+---------------+------------+
2872    /// | 3                  | Asia          | Japan         | Tokyo      |
2873    /// +--------------------+---------------+---------------+------------+
2874    /// ```
2875    #[must_use]
2876    pub fn head(&self, length: Option<usize>) -> Self {
2877        let col = self
2878            .columns
2879            .iter()
2880            .map(|c| c.head(length))
2881            .collect::<Vec<_>>();
2882
2883        let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2884        let height = usize::min(height, self.height());
2885        unsafe { DataFrame::new_no_checks(height, col) }
2886    }
2887
2888    /// Get the tail of the [`DataFrame`].
2889    ///
2890    /// # Example
2891    ///
2892    /// ```rust
2893    /// # use polars_core::prelude::*;
2894    /// let countries: DataFrame =
2895    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2896    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2897    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2898    /// assert_eq!(countries.shape(), (5, 3));
2899    ///
2900    /// println!("{}", countries.tail(Some(2)));
2901    /// # Ok::<(), PolarsError>(())
2902    /// ```
2903    ///
2904    /// Output:
2905    ///
2906    /// ```text
2907    /// shape: (2, 3)
2908    /// +-------------+--------------------+---------+
2909    /// | Rank (2021) | Apple Price (€/kg) | Country |
2910    /// | ---         | ---                | ---     |
2911    /// | i32         | f64                | str     |
2912    /// +=============+====================+=========+
2913    /// | 108         | 0.63               | Syria   |
2914    /// +-------------+--------------------+---------+
2915    /// | 109         | 0.63               | Turkey  |
2916    /// +-------------+--------------------+---------+
2917    /// ```
2918    #[must_use]
2919    pub fn tail(&self, length: Option<usize>) -> Self {
2920        let col = self
2921            .columns
2922            .iter()
2923            .map(|c| c.tail(length))
2924            .collect::<Vec<_>>();
2925
2926        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2927        let height = usize::min(height, self.height());
2928        unsafe { DataFrame::new_no_checks(height, col) }
2929    }
2930
2931    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2932    ///
2933    /// # Panics
2934    ///
2935    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2936    ///
2937    /// This responsibility is left to the caller as we don't want to take mutable references here,
2938    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2939    /// as well.
2940    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2941        debug_assert!(!self.should_rechunk(), "expected equal chunks");
2942        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2943        // as we must allocate arrow strings/binaries.
2944        let must_convert = compat_level.0 == 0;
2945        let parallel = parallel
2946            && must_convert
2947            && self.columns.len() > 1
2948            && self
2949                .columns
2950                .iter()
2951                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2952
2953        RecordBatchIter {
2954            columns: &self.columns,
2955            schema: Arc::new(
2956                self.columns
2957                    .iter()
2958                    .map(|c| c.field().to_arrow(compat_level))
2959                    .collect(),
2960            ),
2961            idx: 0,
2962            n_chunks: self.first_col_n_chunks(),
2963            compat_level,
2964            parallel,
2965        }
2966    }
2967
2968    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2969    ///
2970    /// # Panics
2971    ///
2972    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2973    ///
2974    /// This responsibility is left to the caller as we don't want to take mutable references here,
2975    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2976    /// as well.
2977    pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2978        debug_assert!(!self.should_rechunk());
2979        PhysRecordBatchIter {
2980            schema: Arc::new(
2981                self.get_columns()
2982                    .iter()
2983                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2984                    .collect(),
2985            ),
2986            arr_iters: self
2987                .materialized_column_iter()
2988                .map(|s| s.chunks().iter())
2989                .collect(),
2990        }
2991    }
2992
2993    /// Get a [`DataFrame`] with all the columns in reversed order.
2994    #[must_use]
2995    pub fn reverse(&self) -> Self {
2996        let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2997        unsafe { DataFrame::new_no_checks(self.height(), col) }
2998    }
2999
3000    /// Shift the values by a given period and fill the parts that will be empty due to this operation
3001    /// with `Nones`.
3002    ///
3003    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
3004    #[must_use]
3005    pub fn shift(&self, periods: i64) -> Self {
3006        let col = self._apply_columns_par(&|s| s.shift(periods));
3007        unsafe { DataFrame::new_no_checks(self.height(), col) }
3008    }
3009
3010    /// Replace None values with one of the following strategies:
3011    /// * Forward fill (replace None with the previous value)
3012    /// * Backward fill (replace None with the next value)
3013    /// * Mean fill (replace None with the mean of the whole array)
3014    /// * Min fill (replace None with the minimum of the whole array)
3015    /// * Max fill (replace None with the maximum of the whole array)
3016    ///
3017    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
3018    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3019        let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3020
3021        Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3022    }
3023
3024    /// Pipe different functions/ closure operations that work on a DataFrame together.
3025    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3026    where
3027        F: Fn(DataFrame) -> PolarsResult<B>,
3028    {
3029        f(self)
3030    }
3031
3032    /// Pipe different functions/ closure operations that work on a DataFrame together.
3033    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3034    where
3035        F: Fn(&mut DataFrame) -> PolarsResult<B>,
3036    {
3037        f(self)
3038    }
3039
3040    /// Pipe different functions/ closure operations that work on a DataFrame together.
3041    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3042    where
3043        F: Fn(DataFrame, Args) -> PolarsResult<B>,
3044    {
3045        f(self, args)
3046    }
3047
3048    /// Drop duplicate rows from a [`DataFrame`].
3049    /// *This fails when there is a column of type List in DataFrame*
3050    ///
3051    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3052    ///
3053    /// # Example
3054    ///
3055    /// ```no_run
3056    /// # use polars_core::prelude::*;
3057    /// let df = df! {
3058    ///               "flt" => [1., 1., 2., 2., 3., 3.],
3059    ///               "int" => [1, 1, 2, 2, 3, 3, ],
3060    ///               "str" => ["a", "a", "b", "b", "c", "c"]
3061    ///           }?;
3062    ///
3063    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3064    /// # Ok::<(), PolarsError>(())
3065    /// ```
3066    /// Returns
3067    ///
3068    /// ```text
3069    /// +-----+-----+-----+
3070    /// | flt | int | str |
3071    /// | --- | --- | --- |
3072    /// | f64 | i32 | str |
3073    /// +=====+=====+=====+
3074    /// | 1   | 1   | "a" |
3075    /// +-----+-----+-----+
3076    /// | 2   | 2   | "b" |
3077    /// +-----+-----+-----+
3078    /// | 3   | 3   | "c" |
3079    /// +-----+-----+-----+
3080    /// ```
3081    #[cfg(feature = "algorithm_group_by")]
3082    pub fn unique_stable(
3083        &self,
3084        subset: Option<&[String]>,
3085        keep: UniqueKeepStrategy,
3086        slice: Option<(i64, usize)>,
3087    ) -> PolarsResult<DataFrame> {
3088        self.unique_impl(
3089            true,
3090            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3091            keep,
3092            slice,
3093        )
3094    }
3095
3096    /// Unstable distinct. See [`DataFrame::unique_stable`].
3097    #[cfg(feature = "algorithm_group_by")]
3098    pub fn unique<I, S>(
3099        &self,
3100        subset: Option<&[String]>,
3101        keep: UniqueKeepStrategy,
3102        slice: Option<(i64, usize)>,
3103    ) -> PolarsResult<DataFrame> {
3104        self.unique_impl(
3105            false,
3106            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3107            keep,
3108            slice,
3109        )
3110    }
3111
3112    #[cfg(feature = "algorithm_group_by")]
3113    pub fn unique_impl(
3114        &self,
3115        maintain_order: bool,
3116        subset: Option<Vec<PlSmallStr>>,
3117        keep: UniqueKeepStrategy,
3118        slice: Option<(i64, usize)>,
3119    ) -> PolarsResult<Self> {
3120        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3121        let mut df = self.clone();
3122        // take on multiple chunks is terrible
3123        df.as_single_chunk_par();
3124
3125        let columns = match (keep, maintain_order) {
3126            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3127                let gb = df.group_by_stable(names)?;
3128                let groups = gb.get_groups();
3129                let (offset, len) = slice.unwrap_or((0, groups.len()));
3130                let groups = groups.slice(offset, len);
3131                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3132            },
3133            (UniqueKeepStrategy::Last, true) => {
3134                // maintain order by last values, so the sorted groups are not correct as they
3135                // are sorted by the first value
3136                let gb = df.group_by_stable(names)?;
3137                let groups = gb.get_groups();
3138
3139                let last_idx: NoNull<IdxCa> = groups
3140                    .iter()
3141                    .map(|g| match g {
3142                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3143                        GroupsIndicator::Slice([first, len]) => first + len - 1,
3144                    })
3145                    .collect();
3146
3147                let mut last_idx = last_idx.into_inner().sort(false);
3148
3149                if let Some((offset, len)) = slice {
3150                    last_idx = last_idx.slice(offset, len);
3151                }
3152
3153                let last_idx = NoNull::new(last_idx);
3154                let out = unsafe { df.take_unchecked(&last_idx) };
3155                return Ok(out);
3156            },
3157            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3158                let gb = df.group_by(names)?;
3159                let groups = gb.get_groups();
3160                let (offset, len) = slice.unwrap_or((0, groups.len()));
3161                let groups = groups.slice(offset, len);
3162                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3163            },
3164            (UniqueKeepStrategy::Last, false) => {
3165                let gb = df.group_by(names)?;
3166                let groups = gb.get_groups();
3167                let (offset, len) = slice.unwrap_or((0, groups.len()));
3168                let groups = groups.slice(offset, len);
3169                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3170            },
3171            (UniqueKeepStrategy::None, _) => {
3172                let df_part = df.select(names)?;
3173                let mask = df_part.is_unique()?;
3174                let mut filtered = df.filter(&mask)?;
3175
3176                if let Some((offset, len)) = slice {
3177                    filtered = filtered.slice(offset, len);
3178                }
3179                return Ok(filtered);
3180            },
3181        };
3182        let height = Self::infer_height(&columns);
3183        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3184    }
3185
3186    /// Get a mask of all the unique rows in the [`DataFrame`].
3187    ///
3188    /// # Example
3189    ///
3190    /// ```no_run
3191    /// # use polars_core::prelude::*;
3192    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3193    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
3194    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3195    ///
3196    /// assert!(ca.all());
3197    /// # Ok::<(), PolarsError>(())
3198    /// ```
3199    #[cfg(feature = "algorithm_group_by")]
3200    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3201        let gb = self.group_by(self.get_column_names_owned())?;
3202        let groups = gb.get_groups();
3203        Ok(is_unique_helper(
3204            groups,
3205            self.height() as IdxSize,
3206            true,
3207            false,
3208        ))
3209    }
3210
3211    /// Get a mask of all the duplicated rows in the [`DataFrame`].
3212    ///
3213    /// # Example
3214    ///
3215    /// ```no_run
3216    /// # use polars_core::prelude::*;
3217    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3218    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
3219    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3220    ///
3221    /// assert!(!ca.all());
3222    /// # Ok::<(), PolarsError>(())
3223    /// ```
3224    #[cfg(feature = "algorithm_group_by")]
3225    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3226        let gb = self.group_by(self.get_column_names_owned())?;
3227        let groups = gb.get_groups();
3228        Ok(is_unique_helper(
3229            groups,
3230            self.height() as IdxSize,
3231            false,
3232            true,
3233        ))
3234    }
3235
3236    /// Create a new [`DataFrame`] that shows the null counts per column.
3237    #[must_use]
3238    pub fn null_count(&self) -> Self {
3239        let cols = self
3240            .columns
3241            .iter()
3242            .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3243            .collect();
3244        unsafe { Self::new_no_checks(1, cols) }
3245    }
3246
3247    /// Hash and combine the row values
3248    #[cfg(feature = "row_hash")]
3249    pub fn hash_rows(
3250        &mut self,
3251        hasher_builder: Option<PlSeedableRandomStateQuality>,
3252    ) -> PolarsResult<UInt64Chunked> {
3253        let dfs = split_df(self, POOL.current_num_threads(), false);
3254        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3255
3256        let mut iter = cas.into_iter();
3257        let mut acc_ca = iter.next().unwrap();
3258        for ca in iter {
3259            acc_ca.append(&ca)?;
3260        }
3261        Ok(acc_ca.rechunk().into_owned())
3262    }
3263
3264    /// Get the supertype of the columns in this DataFrame
3265    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3266        self.columns
3267            .iter()
3268            .map(|s| Ok(s.dtype().clone()))
3269            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3270    }
3271
3272    /// Take by index values given by the slice `idx`.
3273    /// # Warning
3274    /// Be careful with allowing threads when calling this in a large hot loop
3275    /// every thread split may be on rayon stack and lead to SO
3276    #[doc(hidden)]
3277    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3278        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3279    }
3280
3281    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3282    /// if the index value in `idx` are sorted. This will maintain sorted flags.
3283    ///
3284    /// # Warning
3285    /// Be careful with allowing threads when calling this in a large hot loop
3286    /// every thread split may be on rayon stack and lead to SO
3287    #[doc(hidden)]
3288    pub unsafe fn _take_unchecked_slice_sorted(
3289        &self,
3290        idx: &[IdxSize],
3291        allow_threads: bool,
3292        sorted: IsSorted,
3293    ) -> Self {
3294        #[cfg(debug_assertions)]
3295        {
3296            if idx.len() > 2 {
3297                match sorted {
3298                    IsSorted::Ascending => {
3299                        assert!(idx[0] <= idx[idx.len() - 1]);
3300                    },
3301                    IsSorted::Descending => {
3302                        assert!(idx[0] >= idx[idx.len() - 1]);
3303                    },
3304                    _ => {},
3305                }
3306            }
3307        }
3308        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3309        ca.set_sorted_flag(sorted);
3310        self.take_unchecked_impl(&ca, allow_threads)
3311    }
3312
3313    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3314    #[doc(hidden)]
3315    pub fn _partition_by_impl(
3316        &self,
3317        cols: &[PlSmallStr],
3318        stable: bool,
3319        include_key: bool,
3320        parallel: bool,
3321    ) -> PolarsResult<Vec<DataFrame>> {
3322        let selected_keys = self.select_columns(cols.iter().cloned())?;
3323        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3324        let groups = groups.take_groups();
3325
3326        // drop key columns prior to calculation if requested
3327        let df = if include_key {
3328            self.clone()
3329        } else {
3330            self.drop_many(cols.iter().cloned())
3331        };
3332
3333        if parallel {
3334            // don't parallelize this
3335            // there is a lot of parallelization in take and this may easily SO
3336            POOL.install(|| {
3337                match groups.as_ref() {
3338                    GroupsType::Idx(idx) => {
3339                        // Rechunk as the gather may rechunk for every group #17562.
3340                        let mut df = df.clone();
3341                        df.as_single_chunk_par();
3342                        Ok(idx
3343                            .into_par_iter()
3344                            .map(|(_, group)| {
3345                                // groups are in bounds
3346                                unsafe {
3347                                    df._take_unchecked_slice_sorted(
3348                                        group,
3349                                        false,
3350                                        IsSorted::Ascending,
3351                                    )
3352                                }
3353                            })
3354                            .collect())
3355                    },
3356                    GroupsType::Slice { groups, .. } => Ok(groups
3357                        .into_par_iter()
3358                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
3359                        .collect()),
3360                }
3361            })
3362        } else {
3363            match groups.as_ref() {
3364                GroupsType::Idx(idx) => {
3365                    // Rechunk as the gather may rechunk for every group #17562.
3366                    let mut df = df;
3367                    df.as_single_chunk();
3368                    Ok(idx
3369                        .into_iter()
3370                        .map(|(_, group)| {
3371                            // groups are in bounds
3372                            unsafe {
3373                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3374                            }
3375                        })
3376                        .collect())
3377                },
3378                GroupsType::Slice { groups, .. } => Ok(groups
3379                    .iter()
3380                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
3381                    .collect()),
3382            }
3383        }
3384    }
3385
3386    /// Split into multiple DataFrames partitioned by groups
3387    #[cfg(feature = "partition_by")]
3388    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3389    where
3390        I: IntoIterator<Item = S>,
3391        S: Into<PlSmallStr>,
3392    {
3393        let cols = cols
3394            .into_iter()
3395            .map(Into::into)
3396            .collect::<Vec<PlSmallStr>>();
3397        self._partition_by_impl(cols.as_slice(), false, include_key, true)
3398    }
3399
3400    /// Split into multiple DataFrames partitioned by groups
3401    /// Order of the groups are maintained.
3402    #[cfg(feature = "partition_by")]
3403    pub fn partition_by_stable<I, S>(
3404        &self,
3405        cols: I,
3406        include_key: bool,
3407    ) -> PolarsResult<Vec<DataFrame>>
3408    where
3409        I: IntoIterator<Item = S>,
3410        S: Into<PlSmallStr>,
3411    {
3412        let cols = cols
3413            .into_iter()
3414            .map(Into::into)
3415            .collect::<Vec<PlSmallStr>>();
3416        self._partition_by_impl(cols.as_slice(), true, include_key, true)
3417    }
3418
3419    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3420    /// inserted as columns.
3421    #[cfg(feature = "dtype-struct")]
3422    pub fn unnest<I: IntoVec<PlSmallStr>>(
3423        &self,
3424        cols: I,
3425        separator: Option<&str>,
3426    ) -> PolarsResult<DataFrame> {
3427        let cols = cols.into_vec();
3428        self.unnest_impl(cols.into_iter().collect(), separator)
3429    }
3430
3431    #[cfg(feature = "dtype-struct")]
3432    fn unnest_impl(
3433        &self,
3434        cols: PlHashSet<PlSmallStr>,
3435        separator: Option<&str>,
3436    ) -> PolarsResult<DataFrame> {
3437        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3438        let mut count = 0;
3439        for s in &self.columns {
3440            if cols.contains(s.name()) {
3441                let ca = s.struct_()?.clone();
3442                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3443                    if let Some(separator) = &separator {
3444                        f.rename(polars_utils::format_pl_smallstr!(
3445                            "{}{}{}",
3446                            s.name(),
3447                            separator,
3448                            f.name()
3449                        ));
3450                    }
3451                    Column::from(f)
3452                }));
3453                count += 1;
3454            } else {
3455                new_cols.push(s.clone())
3456            }
3457        }
3458        if count != cols.len() {
3459            // one or more columns not found
3460            // the code below will return an error with the missing name
3461            let schema = self.schema();
3462            for col in cols {
3463                let _ = schema
3464                    .get(col.as_str())
3465                    .ok_or_else(|| polars_err!(col_not_found = col))?;
3466            }
3467        }
3468        DataFrame::new(new_cols)
3469    }
3470
3471    pub(crate) fn infer_height(cols: &[Column]) -> usize {
3472        cols.first().map_or(0, Column::len)
3473    }
3474
3475    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3476        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3477        // append_chunk or something like this. It is just quite difficult to make that safe.
3478        let df = DataFrame::from(rb);
3479        polars_ensure!(
3480            self.schema() == df.schema(),
3481            SchemaMismatch: "cannot append record batch with different schema\n\n
3482        Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3483        );
3484        self.vstack_mut_owned_unchecked(df);
3485        Ok(())
3486    }
3487
3488    pub fn into_columns(self) -> Vec<Column> {
3489        self.columns
3490    }
3491}
3492
3493pub struct RecordBatchIter<'a> {
3494    columns: &'a Vec<Column>,
3495    schema: ArrowSchemaRef,
3496    idx: usize,
3497    n_chunks: usize,
3498    compat_level: CompatLevel,
3499    parallel: bool,
3500}
3501
3502impl Iterator for RecordBatchIter<'_> {
3503    type Item = RecordBatch;
3504
3505    fn next(&mut self) -> Option<Self::Item> {
3506        if self.idx >= self.n_chunks {
3507            return None;
3508        }
3509
3510        // Create a batch of the columns with the same chunk no.
3511        let batch_cols: Vec<ArrayRef> = if self.parallel {
3512            let iter = self
3513                .columns
3514                .par_iter()
3515                .map(Column::as_materialized_series)
3516                .map(|s| s.to_arrow(self.idx, self.compat_level));
3517            POOL.install(|| iter.collect())
3518        } else {
3519            self.columns
3520                .iter()
3521                .map(Column::as_materialized_series)
3522                .map(|s| s.to_arrow(self.idx, self.compat_level))
3523                .collect()
3524        };
3525        self.idx += 1;
3526
3527        let length = batch_cols.first().map_or(0, |arr| arr.len());
3528        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3529    }
3530
3531    fn size_hint(&self) -> (usize, Option<usize>) {
3532        let n = self.n_chunks - self.idx;
3533        (n, Some(n))
3534    }
3535}
3536
3537pub struct PhysRecordBatchIter<'a> {
3538    schema: ArrowSchemaRef,
3539    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3540}
3541
3542impl Iterator for PhysRecordBatchIter<'_> {
3543    type Item = RecordBatch;
3544
3545    fn next(&mut self) -> Option<Self::Item> {
3546        let arrs = self
3547            .arr_iters
3548            .iter_mut()
3549            .map(|phys_iter| phys_iter.next().cloned())
3550            .collect::<Option<Vec<_>>>()?;
3551
3552        let length = arrs.first().map_or(0, |arr| arr.len());
3553        Some(RecordBatch::new(length, self.schema.clone(), arrs))
3554    }
3555
3556    fn size_hint(&self) -> (usize, Option<usize>) {
3557        if let Some(iter) = self.arr_iters.first() {
3558            iter.size_hint()
3559        } else {
3560            (0, None)
3561        }
3562    }
3563}
3564
3565impl Default for DataFrame {
3566    fn default() -> Self {
3567        DataFrame::empty()
3568    }
3569}
3570
3571impl From<DataFrame> for Vec<Column> {
3572    fn from(df: DataFrame) -> Self {
3573        df.columns
3574    }
3575}
3576
3577// utility to test if we can vstack/extend the columns
3578fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3579    polars_ensure!(
3580        left.name() == right.name(),
3581        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3582        left.name(), right.name(),
3583    );
3584    Ok(())
3585}
3586
#[cfg(test)]
mod test {
    use super::*;

    /// Small two-column fixture (`days`, `temp`) with three rows.
    fn create_frame() -> DataFrame {
        let days = Column::new("days".into(), [0, 1, 2].as_ref());
        let temp = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![days, temp]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut batches = df.iter_chunks(CompatLevel::newest(), false);

        // Exactly one batch holding all five rows is expected.
        let first_batch = batches.next().unwrap();
        assert_eq!(5, first_batch.len());
        assert!(batches.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        let days = df.column("days").unwrap().as_series().unwrap();

        // Exactly one of the `days` values equals 1.
        let n_matches = days.equal(1).unwrap().sum();
        assert_eq!(n_matches, Some(1));
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let values = vec!["test".to_string()];
        let column = Column::new(PlSmallStr::from_str(col_name), values);
        let df = DataFrame::new(vec![column]).unwrap();

        // Filter with a mask that matches nothing.
        let mask = df
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .equal("")
            .unwrap();
        let filtered = df.filter(&mask).unwrap();

        let n_chunks = filtered
            .column(col_name)
            .unwrap()
            .as_materialized_series()
            .n_chunks();
        assert_eq!(n_chunks, 1);
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let inner = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let list_ca: ListChunked = [&inner].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let filtered = list_ca.filter(&mask).unwrap();

        assert_eq!(filtered.chunks.len(), 1);
        assert_eq!(filtered.len(), 0);
    }

    #[test]
    fn slice() {
        let head = create_frame().slice(0, 2);
        assert_eq!(head.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let frame = create_frame();
        assert!(!frame.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Build a series made of two chunks by appending a second series.
        let mut chunked = Series::new("foo".into(), 0..2);
        let tail = Series::new("bar".into(), 0..1);
        chunked.append(&tail)?;

        // Add the multi-chunk series to the frame.
        let with_chunked = base.with_column(chunked)?;

        // Now we should rechunk
        assert!(with_chunked.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();

        // An existing column name replaces the column instead of failing.
        let replaced = df.with_column(Series::new("foo".into(), &[1, 2, 3]));
        assert!(replaced.is_ok());

        // A new column name is simply added.
        let added = df.with_column(Series::new("bar".into(), &[1, 2, 3]));
        assert!(added.is_ok());

        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let with_duplicates = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let deduplicated = with_duplicates
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let expected = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(deduplicated.equals(&expected));
    }

    #[test]
    fn test_vstack() {
        // Stacking must not rechunk by accident: the result keeps two chunks.
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        let head = df.slice(0, 3);
        df.vstack_mut(&head).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut target = DataFrame::empty();

        let data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        target.vstack_mut(&data).unwrap();
        assert_eq!(target.height, 6)
    }

    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // The added column must be called "c", not "bar" like the series passed in.
        let new_values = Series::new("bar".into(), [1, 2, 3]);
        df.replace_or_add("c".into(), new_values)?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let subset = ["x".to_string()];
        let unique_only = df
            .unique_stable(Some(&subset[..]), UniqueKeepStrategy::None, Some((0, 2)))
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(unique_only.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        // Casting a column through `apply` must be reflected in the schema.
        let original_schema = df.schema().clone();
        df.apply("x", |column| column.cast(&DataType::Int8).unwrap())
            .unwrap();
        assert_ne!(&original_schema, df.schema());
    }
}