// polars_core/frame/mod.rs — DataFrame frame module (extracted from the Polars source tree).
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use arrow::datatypes::ArrowSchemaRef;
4use polars_row::ArrayRef;
5use polars_utils::UnitVec;
6use polars_utils::itertools::Itertools;
7use rayon::prelude::*;
8
9use crate::chunked_array::flags::StatisticsFlags;
10#[cfg(feature = "algorithm_group_by")]
11use crate::chunked_array::ops::unique::is_unique_helper;
12use crate::prelude::gather::check_bounds_ca;
13use crate::prelude::*;
14#[cfg(feature = "row_hash")]
15use crate::utils::split_df;
16use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
17use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
18
19#[cfg(feature = "dataframe_arithmetic")]
20mod arithmetic;
21pub mod builder;
22mod chunks;
23pub use chunks::chunk_df_for_writing;
24mod broadcast;
25pub mod column;
26mod dataframe;
27mod filter;
28mod projection;
29pub use dataframe::DataFrame;
30use filter::filter_zero_width;
31use projection::{AmortizedColumnSelector, LINEAR_SEARCH_LIMIT};
32
33pub mod explode;
34mod from;
35#[cfg(feature = "algorithm_group_by")]
36pub mod group_by;
37pub(crate) mod horizontal;
38#[cfg(any(feature = "rows", feature = "object"))]
39pub mod row;
40mod top_k;
41mod upstream_traits;
42mod validation;
43
44use arrow::record_batch::{RecordBatch, RecordBatchT};
45use polars_utils::pl_str::PlSmallStr;
46#[cfg(feature = "serde")]
47use serde::{Deserialize, Serialize};
48use strum_macros::IntoStaticStr;
49
50use crate::POOL;
51#[cfg(feature = "row_hash")]
52use crate::hashing::_df_rows_to_hashes_threaded_vertical;
53use crate::prelude::sort::arg_sort;
54use crate::series::IsSorted;
55
/// Strategy that decides which of a set of duplicated rows is kept by `unique` operations.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the duplicated rows (rows that occur more than once are dropped entirely).
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations, since no particular occurrence has to be tracked.
    #[default]
    Any,
}
72
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
#[strum(serialize_all = "snake_case")]
/// Naming strategy for the results of a pivot.
pub enum PivotColumnNaming {
    /// Always combine the values and on-column names.
    Combine,
    /// Prefix the values column name only if there is more than one values
    /// column.
    #[default]
    Auto,
}
86
87impl DataFrame {
88    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
89        self.columns().iter().map(Column::as_materialized_series)
90    }
91
92    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
93    ///
94    /// # Implementation
95    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
96    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
97    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
98    ///
99    /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
100    /// However, this function will yield a smaller number. This is because this function returns
101    /// the visible size of the buffer, not its total capacity.
102    ///
103    /// FFI buffers are included in this estimation.
104    pub fn estimated_size(&self) -> usize {
105        self.columns().iter().map(Column::estimated_size).sum()
106    }
107
    /// Apply a fallible function to every column, collecting the transformed columns.
    ///
    /// Returns the first error produced by `func`, if any.
    pub fn try_apply_columns(
        &self,
        func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
    ) -> PolarsResult<Vec<Column>> {
        // Delegate through a `dyn` reference to a non-generic inner function so the loop body is
        // compiled once instead of being monomorphized per closure type.
        return inner(self, &func);

        fn inner(
            slf: &DataFrame,
            func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
        ) -> PolarsResult<Vec<Column>> {
            slf.columns().iter().map(func).collect()
        }
    }
121
    /// Apply an infallible function to every column, collecting the transformed columns.
    pub fn apply_columns(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
        // Delegate through a `dyn` reference to a non-generic inner function so the loop body is
        // compiled once instead of being monomorphized per closure type.
        return inner(self, &func);

        fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
            slf.columns().iter().map(func).collect()
        }
    }
129
    /// Apply a fallible function to every column in parallel (on the global rayon `POOL`),
    /// collecting the transformed columns.
    ///
    /// Returns an error if `func` fails for any column.
    pub fn try_apply_columns_par(
        &self,
        func: impl Fn(&Column) -> PolarsResult<Column> + Send + Sync,
    ) -> PolarsResult<Vec<Column>> {
        // Delegate through a `dyn` reference to a non-generic inner function so the parallel loop
        // is compiled once instead of being monomorphized per closure type.
        return inner(self, &func);

        fn inner(
            slf: &DataFrame,
            func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
        ) -> PolarsResult<Vec<Column>> {
            POOL.install(|| slf.columns().par_iter().map(func).collect())
        }
    }
143
    /// Apply an infallible function to every column in parallel (on the global rayon `POOL`),
    /// collecting the transformed columns.
    pub fn apply_columns_par(&self, func: impl Fn(&Column) -> Column + Send + Sync) -> Vec<Column> {
        // Delegate through a `dyn` reference to a non-generic inner function so the parallel loop
        // is compiled once instead of being monomorphized per closure type.
        return inner(self, &func);

        fn inner(slf: &DataFrame, func: &(dyn Fn(&Column) -> Column + Send + Sync)) -> Vec<Column> {
            POOL.install(|| slf.columns().par_iter().map(func).collect())
        }
    }
151
    /// Reserve additional slots into the chunks of the series.
    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
        // Only `Column::Series` variants carry a chunk vector to grow; all other variants are
        // skipped. Lengths and names are untouched, so the cached schema can be retained.
        for s in unsafe { self.columns_mut_retain_schema() } {
            if let Column::Series(s) = s {
                // SAFETY:
                // do not modify the data, simply resize.
                unsafe { s.chunks_mut().reserve(additional) }
            }
        }
    }
    /// Create a new `DataFrame` of `height` rows by repeating, in every column, the value at
    /// `index`.
    ///
    /// NOTE(review): the exact repetition/out-of-bounds semantics come from
    /// `Column::new_from_index` — confirm there.
    pub fn new_from_index(&self, index: usize, height: usize) -> Self {
        let new_cols = self.apply_columns(|c| c.new_from_index(index, height));

        // SAFETY: every column was produced with the same `height`; names/dtypes are unchanged,
        // so the schema can be copied from `self`.
        unsafe { Self::_new_unchecked_impl(height, new_cols).with_schema_from(self) }
    }
167
168    /// Create a new `DataFrame` with the given schema, only containing nulls.
169    pub fn full_null(schema: &Schema, height: usize) -> Self {
170        let columns = schema
171            .iter_fields()
172            .map(|f| Column::full_null(f.name().clone(), height, f.dtype()))
173            .collect();
174
175        unsafe { DataFrame::_new_unchecked_impl(height, columns) }
176    }
177
    /// Ensure this DataFrame matches the given schema. Casts null columns to
    /// the expected schema if necessary (but nothing else).
    ///
    /// # Errors
    /// Fails when a column name differs from the schema, or when `matches_schema_type` deems a
    /// dtype incompatible.
    ///
    /// NOTE(review): widths are not compared here — `zip` stops at the shorter of the two sides.
    /// Confirm callers guarantee equal widths.
    pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
        let mut did_cast = false;
        // Snapshot the cached schema so it can be restored if no column changed.
        let cached_schema = self.cached_schema().cloned();

        for (col, (name, dt)) in unsafe { self.columns_mut() }.iter_mut().zip(schema.iter()) {
            polars_ensure!(
                col.name() == name,
                SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
                name,
                col.name()
            );

            // `matches_schema_type` presumably errors on incompatible dtypes and returns `true`
            // when the dtypes are compatible but a cast is still required (e.g. a Null column)
            // — TODO confirm against `DataType::matches_schema_type`.
            let needs_cast = col.dtype().matches_schema_type(dt)?;

            if needs_cast {
                *col = col.cast(dt)?;
                did_cast = true;
            }
        }

        if !did_cast {
            // SAFETY: no column was replaced, so the previously cached schema is still valid.
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(())
    }
206
207    /// Add a new column at index 0 that counts the rows.
208    ///
209    /// # Example
210    ///
211    /// ```
212    /// # use polars_core::prelude::*;
213    /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
214    /// assert_eq!(df1.shape(), (4, 1));
215    ///
216    /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
217    /// assert_eq!(df2.shape(), (4, 2));
218    /// println!("{}", df2);
219    ///
220    /// # Ok::<(), PolarsError>(())
221    /// ```
222    ///
223    /// Output:
224    ///
225    /// ```text
226    ///  shape: (4, 2)
227    ///  +-----+----------+
228    ///  | Id  | Name     |
229    ///  | --- | ---      |
230    ///  | u32 | str      |
231    ///  +=====+==========+
232    ///  | 0   | James    |
233    ///  +-----+----------+
234    ///  | 1   | Mary     |
235    ///  +-----+----------+
236    ///  | 2   | John     |
237    ///  +-----+----------+
238    ///  | 3   | Patricia |
239    ///  +-----+----------+
240    /// ```
241    pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
242        let mut new_columns = Vec::with_capacity(self.width() + 1);
243        let offset = offset.unwrap_or(0);
244
245        if self.get_column_index(&name).is_some() {
246            polars_bail!(duplicate = name)
247        }
248
249        let col = Column::new_row_index(name, offset, self.height())?;
250        new_columns.push(col);
251        new_columns.extend_from_slice(self.columns());
252
253        Ok(unsafe { DataFrame::new_unchecked(self.height(), new_columns) })
254    }
255
256    /// Add a row index column in place.
257    ///
258    /// # Safety
259    /// The caller should ensure the DataFrame does not already contain a column with the given name.
260    ///
261    /// # Panics
262    /// Panics if the resulting column would reach or overflow IdxSize::MAX.
263    pub unsafe fn with_row_index_mut(
264        &mut self,
265        name: PlSmallStr,
266        offset: Option<IdxSize>,
267    ) -> &mut Self {
268        debug_assert!(
269            self.get_column_index(&name).is_none(),
270            "with_row_index_mut(): column with name {} already exists",
271            &name
272        );
273
274        let offset = offset.unwrap_or(0);
275        let col = Column::new_row_index(name, offset, self.height()).unwrap();
276
277        unsafe { self.columns_mut() }.insert(0, col);
278        self
279    }
280
    /// Shrink the capacity of this DataFrame to fit its length.
    pub fn shrink_to_fit(&mut self) {
        // Don't parallelize this. Memory overhead
        // Lengths and names are untouched, so the cached schema can be retained.
        for s in unsafe { self.columns_mut_retain_schema() } {
            s.shrink_to_fit();
        }
    }
288
    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
    /// This may lead to more peak memory consumption.
    pub fn rechunk_mut_par(&mut self) -> &mut Self {
        // Skip the rayon dispatch entirely when every column is already a single chunk.
        if self.columns().iter().any(|c| c.n_chunks() > 1) {
            POOL.install(|| {
                // Rechunking changes neither lengths nor names, so the cached schema stays valid.
                unsafe { self.columns_mut_retain_schema() }
                    .par_iter_mut()
                    .for_each(|c| *c = c.rechunk());
            })
        }

        self
    }
302
303    /// Rechunks all columns to only have a single chunk.
304    pub fn rechunk_mut(&mut self) -> &mut Self {
305        // SAFETY: We never adjust the length or names of the columns.
306        let columns = unsafe { self.columns_mut() };
307
308        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
309            *col = col.rechunk();
310        }
311
312        self
313    }
314
    /// Returns true if the chunks of the columns do not align and re-chunking should be done
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        // Note: only `Column::Series` columns participate; scalar-like columns yield `None` from
        // `as_series` and are filtered out.
        if !self
            .columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        // NOTE(review): `size_hint().0` is used as the exact chunk count below — this assumes
        // `chunk_lengths()` reports an exact lower bound; confirm its implementation.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns at all: nothing to align.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series
                // Compare every other column's chunk lengths element-wise against the first's.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
355
356    /// Ensure all the chunks in the [`DataFrame`] are aligned.
357    pub fn align_chunks_par(&mut self) -> &mut Self {
358        if self.should_rechunk() {
359            self.rechunk_mut_par()
360        } else {
361            self
362        }
363    }
364
365    /// Ensure all the chunks in the [`DataFrame`] are aligned.
366    pub fn align_chunks(&mut self) -> &mut Self {
367        if self.should_rechunk() {
368            self.rechunk_mut()
369        } else {
370            self
371        }
372    }
373
374    /// # Example
375    ///
376    /// ```rust
377    /// # use polars_core::prelude::*;
378    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
379    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
380    ///
381    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
382    /// # Ok::<(), PolarsError>(())
383    /// ```
384    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
385        self.columns().iter().map(|s| s.name()).collect()
386    }
387
388    /// Get the [`Vec<PlSmallStr>`] representing the column names.
389    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
390        self.columns().iter().map(|s| s.name().clone()).collect()
391    }
392
    /// Set the column names.
    ///
    /// # Errors
    /// Fails when the number of names differs from the width, or when the names are not unique.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
    /// df.set_column_names(&["Set"])?;
    ///
    /// assert_eq!(df.get_column_names(), &["Set"]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn set_column_names<T>(&mut self, new_names: &[T]) -> PolarsResult<()>
    where
        T: AsRef<str>,
    {
        polars_ensure!(
            new_names.len() == self.width(),
            ShapeMismatch: "{} column names provided for a DataFrame of width {}",
            new_names.len(), self.width()
        );

        validation::ensure_names_unique(new_names)?;

        // Move the column vector out, rename each column, and write the result back;
        // `with_name` consumes its column, which is why the vector is taken rather than the
        // columns being renamed in place.
        // NOTE(review): the cached schema is presumably invalidated by `columns_mut` — confirm.
        *unsafe { self.columns_mut() } = std::mem::take(unsafe { self.columns_mut() })
            .into_iter()
            .zip(new_names)
            .map(|(c, name)| c.with_name(PlSmallStr::from_str(name.as_ref())))
            .collect();

        Ok(())
    }
424
425    /// Get the data types of the columns in the [`DataFrame`].
426    ///
427    /// # Example
428    ///
429    /// ```rust
430    /// # use polars_core::prelude::*;
431    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
432    ///                                "Fraction" => [0.965, 0.035])?;
433    ///
434    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
435    /// # Ok::<(), PolarsError>(())
436    /// ```
437    pub fn dtypes(&self) -> Vec<DataType> {
438        self.columns().iter().map(|s| s.dtype().clone()).collect()
439    }
440
441    /// The number of chunks for the first column.
442    pub fn first_col_n_chunks(&self) -> usize {
443        match self.columns().iter().find_map(|col| col.as_series()) {
444            None if self.width() == 0 => 0,
445            None => 1,
446            Some(s) => s.n_chunks(),
447        }
448    }
449
450    /// The highest number of chunks for any column.
451    pub fn max_n_chunks(&self) -> usize {
452        self.columns()
453            .iter()
454            .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
455            .max()
456            .unwrap_or(0)
457    }
458
459    /// Generate the schema fields of the [`DataFrame`].
460    ///
461    /// # Example
462    ///
463    /// ```rust
464    /// # use polars_core::prelude::*;
465    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
466    ///                            "Fraction" => [0.708, 0.292])?;
467    ///
468    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
469    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
470    ///
471    /// assert_eq!(earth.fields(), &[f1, f2]);
472    /// # Ok::<(), PolarsError>(())
473    /// ```
474    pub fn fields(&self) -> Vec<Field> {
475        self.columns()
476            .iter()
477            .map(|s| s.field().into_owned())
478            .collect()
479    }
480
481    /// Add multiple [`Series`] to a [`DataFrame`].
482    /// The added `Series` are required to have the same length.
483    ///
484    /// # Example
485    ///
486    /// ```rust
487    /// # use polars_core::prelude::*;
488    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
489    /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
490    /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
491    ///
492    /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
493    /// assert_eq!(df2.shape(), (3, 3));
494    /// println!("{}", df2);
495    /// # Ok::<(), PolarsError>(())
496    /// ```
497    ///
498    /// Output:
499    ///
500    /// ```text
501    /// shape: (3, 3)
502    /// +---------+--------+----------+
503    /// | Element | Proton | Electron |
504    /// | ---     | ---    | ---      |
505    /// | str     | i32    | i32      |
506    /// +=========+========+==========+
507    /// | Copper  | 29     | 29       |
508    /// +---------+--------+----------+
509    /// | Silver  | 47     | 47       |
510    /// +---------+--------+----------+
511    /// | Gold    | 79     | 79       |
512    /// +---------+--------+----------+
513    /// ```
514    pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
515        let mut new_cols = Vec::with_capacity(self.width() + columns.len());
516
517        new_cols.extend(self.columns().iter().cloned());
518        new_cols.extend_from_slice(columns);
519
520        DataFrame::new(self.height(), new_cols)
521    }
522    /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
523    ///
524    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
525    ///
526    /// # Example
527    ///
528    /// ```rust
529    /// # use polars_core::prelude::*;
530    /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
531    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
532    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
533    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
534    ///
535    /// let df3: DataFrame = df1.vstack(&df2)?;
536    ///
537    /// assert_eq!(df3.shape(), (5, 2));
538    /// println!("{}", df3);
539    /// # Ok::<(), PolarsError>(())
540    /// ```
541    ///
542    /// Output:
543    ///
544    /// ```text
545    /// shape: (5, 2)
546    /// +-----------+-------------------+
547    /// | Element   | Melting Point (K) |
548    /// | ---       | ---               |
549    /// | str       | f64               |
550    /// +===========+===================+
551    /// | Copper    | 1357.77           |
552    /// +-----------+-------------------+
553    /// | Silver    | 1234.93           |
554    /// +-----------+-------------------+
555    /// | Gold      | 1337.33           |
556    /// +-----------+-------------------+
557    /// | Platinum  | 2041.4            |
558    /// +-----------+-------------------+
559    /// | Palladium | 1828.05           |
560    /// +-----------+-------------------+
561    /// ```
562    pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
563        let mut df = self.clone();
564        df.vstack_mut(other)?;
565        Ok(df)
566    }
567
    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
    ///
    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
    ///                          "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
    /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
    ///                          "Melting Point (K)" => [2041.4, 1828.05])?;
    ///
    /// df1.vstack_mut(&df2)?;
    ///
    /// assert_eq!(df1.shape(), (5, 2));
    /// println!("{}", df1);
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (5, 2)
    /// +-----------+-------------------+
    /// | Element   | Melting Point (K) |
    /// | ---       | ---               |
    /// | str       | f64               |
    /// +===========+===================+
    /// | Copper    | 1357.77           |
    /// +-----------+-------------------+
    /// | Silver    | 1234.93           |
    /// +-----------+-------------------+
    /// | Gold      | 1337.33           |
    /// +-----------+-------------------+
    /// | Platinum  | 2041.4            |
    /// +-----------+-------------------+
    /// | Palladium | 1828.05           |
    /// +-----------+-------------------+
    /// ```
    pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
        if self.width() != other.width() {
            // Widths differ: only legal when `self` is a fresh 0x0 frame, in which case this
            // frame simply becomes a copy of `other` (`clone_from` may reuse our allocations).
            polars_ensure!(
                self.shape() == (0, 0),
                ShapeMismatch:
                "unable to append to a DataFrame of shape {:?} with a DataFrame of width {}",
                self.shape(), other.width(),
            );

            self.clone_from(other);

            return Ok(self);
        }

        // Panic (rather than wrap) on height overflow.
        let new_height = usize::checked_add(self.height(), other.height()).unwrap();

        // Appending changes no names or dtypes, so the cached schema can be retained.
        // NOTE(review): if a column fails part-way through, earlier columns have already been
        // appended and `set_height` never runs — callers presumably discard the frame on error.
        unsafe { self.columns_mut_retain_schema() }
            .iter_mut()
            .zip(other.columns())
            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
                // `ensure_can_extend` presumably validates column compatibility — see its def.
                ensure_can_extend(&*left, right)?;
                left.append(right).map_err(|e| {
                    e.context(format!("failed to vstack column '{}'", right.name()).into())
                })?;
                Ok(())
            })?;

        // SAFETY: every column was appended, so all now have `new_height` rows.
        unsafe { self.set_height(new_height) };

        Ok(self)
    }
639
    /// Concatenate an owned [`DataFrame`] to this [`DataFrame`], consuming `other`'s columns.
    ///
    /// Like [`DataFrame::vstack_mut`], but takes `other` by value so its chunks can be moved
    /// instead of cloned.
    pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
        if self.width() != other.width() {
            // Widths differ: only legal when `self` is a fresh 0x0 frame, in which case this
            // frame is simply replaced by `other`.
            polars_ensure!(
                self.shape() == (0, 0),
                ShapeMismatch:
                "unable to append to a DataFrame of width {} with a DataFrame of width {}",
                self.width(), other.width(),
            );

            *self = other;

            return Ok(self);
        }

        // Panic (rather than wrap) on height overflow.
        let new_height = usize::checked_add(self.height(), other.height()).unwrap();

        // Appending changes no names or dtypes, so the cached schema can be retained.
        // NOTE(review): on error part-way through, earlier columns are already appended and
        // `set_height` never runs — callers presumably discard the frame on error.
        unsafe { self.columns_mut_retain_schema() }
            .iter_mut()
            .zip(other.into_columns())
            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
                ensure_can_extend(&*left, &right)?;
                // The name is cloned up front because `append_owned` consumes `right`.
                let right_name = right.name().clone();
                left.append_owned(right).map_err(|e| {
                    e.context(format!("failed to vstack column '{right_name}'").into())
                })?;
                Ok(())
            })?;

        // SAFETY: every column was appended, so all now have `new_height` rows.
        unsafe { self.set_height(new_height) };

        Ok(self)
    }
672
    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
    ///
    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
    ///
    /// # Panics
    /// Panics if the schema's don't match.
    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) -> &mut Self {
        // Panic (rather than wrap) on height overflow.
        let new_height = usize::checked_add(self.height(), other.height()).unwrap();

        // Appending changes no names or dtypes, so the cached schema can be retained.
        unsafe { self.columns_mut_retain_schema() }
            .iter_mut()
            .zip(other.columns())
            .for_each(|(left, right)| {
                // The context is attached before the expect so a schema-mismatch panic names
                // the offending column.
                left.append(right)
                    .map_err(|e| {
                        e.context(format!("failed to vstack column '{}'", right.name()).into())
                    })
                    .expect("should not fail");
            });

        // SAFETY: every column was appended, so all now have `new_height` rows.
        unsafe { self.set_height(new_height) };

        self
    }
697
    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
    ///
    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
    ///
    /// # Panics
    /// Panics if the schema's don't match.
    pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) -> &mut Self {
        // Panic (rather than wrap) on height overflow.
        let new_height = usize::checked_add(self.height(), other.height()).unwrap();

        // Appending changes no names or dtypes, so the cached schema can be retained.
        unsafe { self.columns_mut_retain_schema() }
            .iter_mut()
            .zip(other.into_columns())
            .for_each(|(left, right)| {
                left.append_owned(right).expect("should not fail");
            });

        // SAFETY: every column was appended, so all now have `new_height` rows.
        unsafe { self.set_height(new_height) };

        self
    }
718
    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
    ///
    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
    ///
    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
    /// and thus will yield faster queries.
    ///
    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
    /// online operations where you add `n` rows and rerun a query.
    ///
    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
    /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
    /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
    ///
    /// # Errors
    /// Fails when the widths differ or when a column cannot be extended.
    pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
        polars_ensure!(
            self.width() == other.width(),
            ShapeMismatch:
            "unable to extend a DataFrame of width {} with a DataFrame of width {}",
            self.width(), other.width(),
        );

        // Panic (rather than wrap) on height overflow.
        let new_height = usize::checked_add(self.height(), other.height()).unwrap();

        // Extending changes no names or dtypes, so the cached schema can be retained.
        // NOTE(review): on error part-way through, earlier columns are already extended and
        // `set_height` never runs — callers presumably discard the frame on error.
        unsafe { self.columns_mut_retain_schema() }
            .iter_mut()
            .zip(other.columns())
            .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
                ensure_can_extend(&*left, right)?;
                left.extend(right).map_err(|e| {
                    e.context(format!("failed to extend column '{}'", right.name()).into())
                })?;
                Ok(())
            })?;

        // SAFETY: every column was extended, so all now have `new_height` rows.
        unsafe { self.set_height(new_height) };

        Ok(())
    }
758
759    /// Remove a column by name and return the column removed.
760    ///
761    /// # Example
762    ///
763    /// ```rust
764    /// # use polars_core::prelude::*;
765    /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
766    ///                             "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
767    ///
768    /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
769    /// assert!(s1.is_err());
770    ///
771    /// let s2: Column = df.drop_in_place("Animal")?;
772    /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
773    /// # Ok::<(), PolarsError>(())
774    /// ```
775    pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
776        let idx = self.try_get_column_index(name)?;
777        Ok(unsafe { self.columns_mut() }.remove(idx))
778    }
779
780    /// Return a new [`DataFrame`] where all null values are dropped.
781    ///
782    /// # Example
783    ///
784    /// ```no_run
785    /// # use polars_core::prelude::*;
786    /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
787    ///                         "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
788    /// assert_eq!(df1.shape(), (3, 2));
789    ///
790    /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
791    /// assert_eq!(df2.shape(), (1, 2));
792    /// println!("{}", df2);
793    /// # Ok::<(), PolarsError>(())
794    /// ```
795    ///
796    /// Output:
797    ///
798    /// ```text
799    /// shape: (1, 2)
800    /// +---------+---------------------+
801    /// | Country | Tax revenue (% GDP) |
802    /// | ---     | ---                 |
803    /// | str     | f64                 |
804    /// +=========+=====================+
805    /// | Malta   | 32.7                |
806    /// +---------+---------------------+
807    /// ```
808    pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
809    where
810        for<'a> &'a S: AsRef<str>,
811    {
812        if let Some(v) = subset {
813            let v = self.select_to_vec(v)?;
814            self._drop_nulls_impl(v.as_slice())
815        } else {
816            self._drop_nulls_impl(self.columns())
817        }
818    }
819
820    fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
821        // fast path for no nulls in df
822        if subset.iter().all(|s| !s.has_nulls()) {
823            return Ok(self.clone());
824        }
825
826        let mut iter = subset.iter();
827
828        let mask = iter
829            .next()
830            .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
831        let mut mask = mask.is_not_null();
832
833        for c in iter {
834            mask = mask & c.is_not_null();
835        }
836        self.filter(&mask)
837    }
838
839    /// Drop a column by name.
840    /// This is a pure method and will return a new [`DataFrame`] instead of modifying
841    /// the current one in place.
842    ///
843    /// # Example
844    ///
845    /// ```rust
846    /// # use polars_core::prelude::*;
847    /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
848    /// let df2: DataFrame = df1.drop("Ray type")?;
849    ///
850    /// assert_eq!(df2.width(), 0);
851    /// # Ok::<(), PolarsError>(())
852    /// ```
853    pub fn drop(&self, name: &str) -> PolarsResult<Self> {
854        let idx = self.try_get_column_index(name)?;
855        let mut new_cols = Vec::with_capacity(self.width() - 1);
856
857        self.columns().iter().enumerate().for_each(|(i, s)| {
858            if i != idx {
859                new_cols.push(s.clone())
860            }
861        });
862
863        Ok(unsafe { DataFrame::_new_unchecked_impl(self.height(), new_cols) })
864    }
865
866    /// Drop columns that are in `names`.
867    pub fn drop_many<I, S>(&self, names: I) -> Self
868    where
869        I: IntoIterator<Item = S>,
870        S: Into<PlSmallStr>,
871    {
872        let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
873        self.drop_many_amortized(&names)
874    }
875
876    /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
877    pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
878        if names.is_empty() {
879            return self.clone();
880        }
881        let mut new_cols = Vec::with_capacity(self.width().saturating_sub(names.len()));
882        self.columns().iter().for_each(|s| {
883            if !names.contains(s.name()) {
884                new_cols.push(s.clone())
885            }
886        });
887
888        unsafe { DataFrame::new_unchecked(self.height(), new_cols) }
889    }
890
    /// Insert a new column at a given index without checking for duplicates.
    /// This can leave the [`DataFrame`] at an invalid state
    fn insert_column_no_namecheck(
        &mut self,
        index: usize,
        column: Column,
    ) -> PolarsResult<&mut Self> {
        // An entirely empty frame (no columns, no height) adopts the height of
        // the first column inserted.
        if self.shape() == (0, 0) {
            unsafe { self.set_height(column.len()) };
        }

        polars_ensure!(
            column.len() == self.height(),
            ShapeMismatch:
            "unable to add a column of length {} to a DataFrame of height {}",
            column.len(), self.height(),
        );

        // Length was validated above; only the duplicate-name invariant is left
        // unchecked, as the method name advertises.
        unsafe { self.columns_mut() }.insert(index, column);
        Ok(self)
    }
912
913    /// Insert a new column at a given index.
914    pub fn insert_column(&mut self, index: usize, column: Column) -> PolarsResult<&mut Self> {
915        let name = column.name();
916
917        polars_ensure!(
918            self.get_column_index(name).is_none(),
919            Duplicate:
920            "column with name {:?} is already present in the DataFrame", name
921        );
922
923        self.insert_column_no_namecheck(index, column)
924    }
925
    /// Add a new column to this [`DataFrame`] or replace an existing one. Broadcasts unit-length
    /// columns.
    pub fn with_column(&mut self, mut column: Column) -> PolarsResult<&mut Self> {
        // An entirely empty frame adopts the height of the first column added.
        if self.shape() == (0, 0) {
            unsafe { self.set_height(column.len()) };
        }

        // Broadcast a unit-length column up to the frame's height.
        if column.len() != self.height() && column.len() == 1 {
            column = column.new_from_index(0, self.height());
        }

        polars_ensure!(
            column.len() == self.height(),
            ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
            column.len(), self.height(),
        );

        // Replace in place when a column with this name already exists,
        // otherwise append at the end.
        if let Some(i) = self.get_column_index(column.name()) {
            *unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
        } else {
            unsafe { self.columns_mut() }.push(column)
        };

        Ok(self)
    }
951
    /// Adds a column to the [`DataFrame`] without doing any checks
    /// on length or duplicates.
    ///
    /// # Safety
    /// The caller must ensure `column.len() == self.height()` .
    pub unsafe fn push_column_unchecked(&mut self, column: Column) -> &mut Self {
        // No length or duplicate-name validation: the caller upholds both invariants.
        unsafe { self.columns_mut() }.push(column);
        self
    }
961
    /// Add or replace columns to this [`DataFrame`] or replace an existing one.
    /// Broadcasts unit-length columns, and uses an existing schema to amortize lookups.
    pub fn with_columns_mut(
        &mut self,
        columns: impl IntoIterator<Item = Column>,
        output_schema: &Schema,
    ) -> PolarsResult<()> {
        let columns = columns.into_iter();

        // Reserve the iterator's lower size-hint bound up front to avoid
        // repeated reallocation while pushing columns one by one.
        unsafe {
            self.columns_mut_retain_schema()
                .reserve(columns.size_hint().0)
        }

        // Each column is placed via the schema-aware single-column insert.
        for c in columns {
            self.with_column_and_schema_mut(c, output_schema)?;
        }

        Ok(())
    }
982
983    fn with_column_and_schema_mut(
984        &mut self,
985        mut column: Column,
986        output_schema: &Schema,
987    ) -> PolarsResult<&mut Self> {
988        if self.shape() == (0, 0) {
989            unsafe { self.set_height(column.len()) };
990        }
991
992        if column.len() != self.height() && column.len() == 1 {
993            column = column.new_from_index(0, self.height());
994        }
995
996        polars_ensure!(
997            column.len() == self.height(),
998            ShapeMismatch:
999            "unable to add a column of length {} to a DataFrame of height {}",
1000            column.len(), self.height(),
1001        );
1002
1003        let i = output_schema
1004            .index_of(column.name())
1005            .or_else(|| self.get_column_index(column.name()))
1006            .unwrap_or(self.width());
1007
1008        if i < self.width() {
1009            *unsafe { self.columns_mut() }.get_mut(i).unwrap() = column
1010        } else if i == self.width() {
1011            unsafe { self.columns_mut() }.push(column)
1012        } else {
1013            // Unordered column insertion is not handled.
1014            panic!()
1015        }
1016
1017        Ok(self)
1018    }
1019
1020    /// Get a row in the [`DataFrame`]. Beware this is slow.
1021    ///
1022    /// # Example
1023    ///
1024    /// ```
1025    /// # use polars_core::prelude::*;
1026    /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1027    ///     df.get(idx)
1028    /// }
1029    /// ```
1030    pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1031        (idx < self.height()).then(|| self.columns().iter().map(|c| c.get(idx).unwrap()).collect())
1032    }
1033
    /// Select a [`Column`] by index, returning `None` when out of bounds.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
    ///
    /// let s1: Option<&Column> = df.select_at_idx(0);
    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
    ///
    /// assert_eq!(s1, Some(&s2));
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
        self.columns().get(idx)
    }
1052
1053    /// Get column index of a [`Series`] by name.
1054    /// # Example
1055    ///
1056    /// ```rust
1057    /// # use polars_core::prelude::*;
1058    /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1059    ///                         "Health" => [100, 200, 500],
1060    ///                         "Mana" => [250, 100, 0],
1061    ///                         "Strength" => [30, 150, 300])?;
1062    ///
1063    /// assert_eq!(df.get_column_index("Name"), Some(0));
1064    /// assert_eq!(df.get_column_index("Health"), Some(1));
1065    /// assert_eq!(df.get_column_index("Mana"), Some(2));
1066    /// assert_eq!(df.get_column_index("Strength"), Some(3));
1067    /// assert_eq!(df.get_column_index("Haste"), None);
1068    /// # Ok::<(), PolarsError>(())
1069    /// ```
1070    pub fn get_column_index(&self, name: &str) -> Option<usize> {
1071        if let Some(schema) = self.cached_schema() {
1072            schema.index_of(name)
1073        } else if self.width() <= LINEAR_SEARCH_LIMIT {
1074            self.columns().iter().position(|s| s.name() == name)
1075        } else {
1076            self.schema().index_of(name)
1077        }
1078    }
1079
1080    /// Get column index of a [`Series`] by name.
1081    pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1082        self.get_column_index(name)
1083            .ok_or_else(|| polars_err!(col_not_found = name))
1084    }
1085
1086    /// Select a single column by name.
1087    ///
1088    /// # Example
1089    ///
1090    /// ```rust
1091    /// # use polars_core::prelude::*;
1092    /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1093    /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1094    /// let df: DataFrame = DataFrame::new_infer_height(vec![s1.clone(), s2])?;
1095    ///
1096    /// assert_eq!(df.column("Password")?, &s1);
1097    /// # Ok::<(), PolarsError>(())
1098    /// ```
1099    pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1100        let idx = self.try_get_column_index(name)?;
1101        Ok(self.select_at_idx(idx).unwrap())
1102    }
1103
1104    /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1105    ///
1106    /// # Examples
1107    ///
1108    /// ```
1109    /// # use polars_core::prelude::*;
1110    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1111    ///     df.select(["foo", "bar"])
1112    /// }
1113    /// ```
1114    pub fn select<I, S>(&self, names: I) -> PolarsResult<Self>
1115    where
1116        I: IntoIterator<Item = S>,
1117        S: AsRef<str>,
1118    {
1119        DataFrame::new(self.height(), self.select_to_vec(names)?)
1120    }
1121
1122    /// Does not check for duplicates.
1123    ///
1124    /// # Safety
1125    /// `names` must not contain duplicates.
1126    pub unsafe fn select_unchecked<I, S>(&self, names: I) -> PolarsResult<Self>
1127    where
1128        I: IntoIterator<Item = S>,
1129        S: AsRef<str>,
1130    {
1131        Ok(unsafe { DataFrame::new_unchecked(self.height(), self.select_to_vec(names)?) })
1132    }
1133
    /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
    ///
    /// This does not error on duplicate selections.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
    ///                         "Carbon" => [1, 2, 3],
    ///                         "Hydrogen" => [4, 6, 8])?;
    /// let sv: Vec<Column> = df.select_to_vec(["Carbon", "Hydrogen"])?;
    ///
    /// assert_eq!(df["Carbon"], sv[0]);
    /// assert_eq!(df["Hydrogen"], sv[1]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn select_to_vec(
        &self,
        selection: impl IntoIterator<Item = impl AsRef<str>>,
    ) -> PolarsResult<Vec<Column>> {
        // The amortized selector caches lookup state across the whole selection.
        AmortizedColumnSelector::new(self).select_multiple(selection)
    }
1157
    /// Take the [`DataFrame`] rows by a boolean mask.
    ///
    /// # Example
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     let mask = df.column("sepal_width")?.is_not_null();
    ///     df.filter(&mask)
    /// }
    /// ```
    pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
        if self.width() == 0 {
            // No columns: only the height is affected, handled by a dedicated helper.
            filter_zero_width(self.height(), mask)
        } else if mask.len() == 1 && self.len() >= 1 {
            // Unit-length mask broadcasts over all rows: keep everything for a
            // valid `true`, otherwise (false or null) keep nothing.
            if mask.all() && mask.null_count() == 0 {
                Ok(self.clone())
            } else {
                Ok(self.clear())
            }
        } else {
            // General path: filter every column with the same mask, in parallel.
            let new_columns: Vec<Column> = self.try_apply_columns_par(|s| s.filter(mask))?;
            // SAFETY: all columns were filtered by the same mask, so they share a height.
            let out = unsafe {
                DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
            };

            Ok(out)
        }
    }
1187
1188    /// Same as `filter` but does not parallelize.
1189    pub fn filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1190        if self.width() == 0 {
1191            filter_zero_width(self.height(), mask)
1192        } else if mask.len() == 1 && mask.null_count() == 0 && self.len() >= 1 {
1193            if mask.all() && mask.null_count() == 0 {
1194                Ok(self.clone())
1195            } else {
1196                Ok(self.clear())
1197            }
1198        } else {
1199            let new_columns: Vec<Column> = self.try_apply_columns(|s| s.filter(mask))?;
1200            let out = unsafe {
1201                DataFrame::new_unchecked(new_columns[0].len(), new_columns).with_schema_from(self)
1202            };
1203
1204            Ok(out)
1205        }
1206    }
1207
    /// Gather [`DataFrame`] rows by index values.
    ///
    /// # Example
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
    ///     df.take(&idx)
    /// }
    /// ```
    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
        // Validate all indices once up front so the per-column gather can be unchecked.
        check_bounds_ca(indices, self.height().try_into().unwrap_or(IdxSize::MAX))?;

        let new_cols = self.apply_columns_par(|c| {
            assert_eq!(c.len(), self.height());
            // SAFETY: indices were bounds-checked against the frame height above.
            unsafe { c.take_unchecked(indices) }
        });

        Ok(unsafe { DataFrame::new_unchecked(indices.len(), new_cols).with_schema_from(self) })
    }
1229
    /// Gather rows by index without bounds checking, multithreaded.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
        // `true` enables the parallel gather path.
        self.take_unchecked_impl(idx, true)
    }
1235
    /// Gather the rows belonging to a single group.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    #[cfg(feature = "algorithm_group_by")]
    pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
        match group {
            // Explicit index list: gather single-threaded (groups are usually small).
            GroupsIndicator::Idx((_, indices)) => unsafe {
                self.take_slice_unchecked_impl(indices.as_slice(), false)
            },
            // Contiguous group: a plain slice suffices, no gather needed.
            GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
        }
    }
1247
    /// Core of the unchecked gather: optionally splits the index array into
    /// strides so that even a frame with few columns can use all threads.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // More threads than columns: parallelize within each column by
                // splitting the indices into strides of at least 256.
                if POOL.current_num_threads() > self.width() {
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    if self.height() / stride >= 2 {
                        self.apply_columns_par(|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            // Gather each stride in parallel, then append the
                            // partial results back together in order.
                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        // Too few rows per stride to be worth splitting.
                        self.apply_columns_par(|c| c.take_unchecked(idx))
                    }
                } else {
                    // Enough columns to saturate the pool: one task per column.
                    self.apply_columns_par(|c| c.take_unchecked(idx))
                }
            })
        } else {
            // Sequential fallback.
            self.apply_columns(|s| s.take_unchecked(idx))
        };

        unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
    }
1289
    /// Gather rows by a plain index slice without bounds checking, multithreaded.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
        // `true` enables the parallel gather path.
        self.take_slice_unchecked_impl(idx, true)
    }
1295
    /// Slice-based twin of [`DataFrame::take_unchecked_impl`]: same stride-splitting
    /// strategy, operating on a raw `&[IdxSize]` instead of an `IdxCa`.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // More threads than columns: parallelize within each column by
                // splitting the indices into strides of at least 256.
                if POOL.current_num_threads() > self.width() {
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    if self.height() / stride >= 2 {
                        self.apply_columns_par(|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| {
                                    // Clamp the final stride to the remaining indices.
                                    let idx = &idx[i * stride..];
                                    let idx = &idx[..idx.len().min(stride)];
                                    c.take_slice_unchecked(idx)
                                })
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        // Too few rows per stride to be worth splitting.
                        self.apply_columns_par(|s| s.take_slice_unchecked(idx))
                    }
                } else {
                    // Enough columns to saturate the pool: one task per column.
                    self.apply_columns_par(|s| s.take_slice_unchecked(idx))
                }
            })
        } else {
            // Sequential fallback.
            self.apply_columns(|s| s.take_slice_unchecked(idx))
        };
        unsafe { DataFrame::new_unchecked(idx.len(), cols).with_schema_from(self) }
    }
1340
1341    /// Rename a column in the [`DataFrame`].
1342    ///
1343    /// Should not be called in a loop as that can lead to quadratic behavior.
1344    ///
1345    /// # Example
1346    ///
1347    /// ```
1348    /// # use polars_core::prelude::*;
1349    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
1350    ///     let original_name = "foo";
1351    ///     let new_name = "bar";
1352    ///     df.rename(original_name, new_name.into())
1353    /// }
1354    /// ```
1355    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
1356        if column == name.as_str() {
1357            return Ok(self);
1358        }
1359        polars_ensure!(
1360            !self.schema().contains(&name),
1361            Duplicate: "column rename attempted with already existing name \"{name}\""
1362        );
1363
1364        self.get_column_index(column)
1365            .and_then(|idx| unsafe { self.columns_mut() }.get_mut(idx))
1366            .ok_or_else(|| polars_err!(col_not_found = column))
1367            .map(|c| c.rename(name))?;
1368
1369        Ok(self)
1370    }
1371
    /// Rename several columns in one pass, keeping the cached schema in sync.
    ///
    /// Same-name renames are skipped; renaming onto an existing name or a
    /// missing source column is an error.
    pub fn rename_many<'a>(
        &mut self,
        renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
    ) -> PolarsResult<&mut Self> {
        // Mutate a private copy of the schema, then install it at the end so the
        // cached schema stays consistent with the renamed columns.
        let mut schema_arc = self.schema().clone();
        let schema = Arc::make_mut(&mut schema_arc);

        for (from, to) in renames {
            // Renaming a column to its current name is a no-op.
            if from == to.as_str() {
                continue;
            }

            polars_ensure!(
                !schema.contains(&to),
                Duplicate: "column rename attempted with already existing name \"{to}\""
            );

            match schema.get_full(from) {
                None => polars_bail!(col_not_found = from),
                Some((idx, _, _)) => {
                    // Rename both the schema entry and the column itself.
                    let (n, _) = schema.get_at_index_mut(idx).unwrap();
                    *n = to.clone();
                    unsafe { self.columns_mut() }
                        .get_mut(idx)
                        .unwrap()
                        .rename(to);
                },
            }
        }

        unsafe { self.set_schema(schema_arc) };

        Ok(self)
    }
1406
    /// Sort [`DataFrame`] in place.
    ///
    /// See [`DataFrame::sort`] for more instruction.
    pub fn sort_in_place(
        &mut self,
        by: impl IntoIterator<Item = impl AsRef<str>>,
        sort_options: SortMultipleOptions,
    ) -> PolarsResult<&mut Self> {
        let by_column = self.select_to_vec(by)?;

        // Sorting only reorders rows, so the schema is unchanged; reuse ours.
        let mut out = self.sort_impl(by_column, sort_options, None)?;
        unsafe { out.set_schema_from(self) };

        *self = out;

        Ok(self)
    }
1424
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// `slice` optionally restricts the output to `(offset, len)` of the sorted
    /// result, enabling top-k / bottom-k shortcuts.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // Object columns have no defined ordering.
        for column in &by_column {
            if column.dtype().is_object() {
                polars_bail!(
                    InvalidOperation: "column '{}' has a dtype of '{}', which does not support sorting", column.name(), column.dtype()
                )
            }
        }

        // note that the by_column argument also contains evaluated expression from
        // polars-lazy that may not even be present in this dataframe. therefore
        // when we try to set the first columns as sorted, we ignore the error as
        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };

        // Empty frame: nothing to reorder, but still set the sorted flag.
        if self.shape_has_zero() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A zero-offset slice shorter than the frame is a bottom-k query.
        if let Some((0, k)) = slice {
            if k < self.height() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early
        // We can do so when there is only one column to sort by, for multiple columns
        // it will be complicated to do so
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If null count is 0 then nulls_last doesnt matter
            // Safe to get value at last position since the dataframe is not empty (taken care above)
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
        let allow_threads = sort_options.multithreaded;

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.rechunk_mut_par();
        let mut take = match (by_column.len(), has_nested) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // fast path for a frame with a single series
                // no need to compute the sort indices and then take by these indices
                // simply sort and return as frame
                if df.width() == 1 && df.try_get_column_index(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            // Multi-column or nested sort keys go through the generic arg_sort.
            _ => arg_sort(&by_column, sort_options)?,
        };

        // Apply the slice to the indices rather than the gathered frame.
        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
        set_sorted(&mut df);
        Ok(df)
    }
1557
    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
    ///
    /// This dataframe does not necessarily have a specified schema and may be changed at any
    /// point. It is primarily used for debugging.
    pub fn _to_metadata(&self) -> DataFrame {
        let num_columns = self.width();

        // One builder per metadata field; each gets one entry per column.
        let mut column_names =
            StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
        let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
        let mut sorted_asc_ca =
            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
        let mut sorted_dsc_ca =
            BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
        let mut fast_explode_list_ca =
            BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
        let mut materialized_at_ca =
            StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);

        for col in self.columns() {
            let flags = col.get_flags();

            let (repr, materialized_at) = match col {
                Column::Series(s) => ("series", s.materialized_at()),
                Column::Scalar(_) => ("scalar", None),
            };
            let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
            let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
            let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);

            column_names.append_value(col.name().clone());
            repr_ca.append_value(repr);
            sorted_asc_ca.append_value(sorted_asc);
            sorted_dsc_ca.append_value(sorted_dsc);
            fast_explode_list_ca.append_value(fast_explode_list);
            materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
        }

        // The metadata frame has one row per column of `self`, hence its height
        // is `self.width()`.
        unsafe {
            DataFrame::new_unchecked(
                self.width(),
                vec![
                    column_names.finish().into_column(),
                    repr_ca.finish().into_column(),
                    sorted_asc_ca.finish().into_column(),
                    sorted_dsc_ca.finish().into_column(),
                    fast_explode_list_ca.finish().into_column(),
                    materialized_at_ca.finish().into_column(),
                ],
            )
        }
    }
1610    /// Return a sorted clone of this [`DataFrame`].
1611    ///
1612    /// In many cases the output chunks will be continuous in memory but this is not guaranteed
1613    /// # Example
1614    ///
1615    /// Sort by a single column with default options:
1616    /// ```
1617    /// # use polars_core::prelude::*;
1618    /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
1619    ///     df.sort(["sepal_width"], Default::default())
1620    /// }
1621    /// ```
1622    /// Sort by a single column with specific order:
1623    /// ```
1624    /// # use polars_core::prelude::*;
1625    /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
1626    ///     df.sort(
1627    ///         ["sepal_width"],
1628    ///         SortMultipleOptions::new()
1629    ///             .with_order_descending(descending)
1630    ///     )
1631    /// }
1632    /// ```
1633    /// Sort by multiple columns with specifying order for each column:
1634    /// ```
1635    /// # use polars_core::prelude::*;
1636    /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
1637    ///     df.sort(
1638    ///         ["sepal_width", "sepal_length"],
1639    ///         SortMultipleOptions::new()
1640    ///             .with_order_descending_multi([false, true])
1641    ///     )
1642    /// }
1643    /// ```
1644    /// See [`SortMultipleOptions`] for more options.
1645    ///
1646    /// Also see [`DataFrame::sort_in_place`].
1647    pub fn sort(
1648        &self,
1649        by: impl IntoIterator<Item = impl AsRef<str>>,
1650        sort_options: SortMultipleOptions,
1651    ) -> PolarsResult<Self> {
1652        let mut df = self.clone();
1653        df.sort_in_place(by, sort_options)?;
1654        Ok(df)
1655    }
1656
1657    /// Replace a column with a [`Column`].
1658    ///
1659    /// # Example
1660    ///
1661    /// ```rust
1662    /// # use polars_core::prelude::*;
1663    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
1664    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
1665    /// let s: Column = Column::new("Country".into(), ["USA", "PRC"]);
1666    ///
1667    /// assert!(df.replace("Nation", s.clone()).is_err());
1668    /// assert!(df.replace("Country", s).is_ok());
1669    /// # Ok::<(), PolarsError>(())
1670    /// ```
1671    pub fn replace(&mut self, column: &str, new_col: Column) -> PolarsResult<&mut Self> {
1672        self.apply(column, |_| new_col)
1673    }
1674
    /// Replace column at index `idx` with a [`Series`].
    ///
    /// Validates both the index (must be within `self.width()`) and the new
    /// column's length (must equal `self.height()`) before swapping.
    ///
    /// # Example
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
    /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // Add 32 to get lowercase ascii values
    /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn replace_column(&mut self, index: usize, new_column: Column) -> PolarsResult<&mut Self> {
        polars_ensure!(
            index < self.width(),
            ShapeMismatch:
            "unable to replace at index {}, the DataFrame has only {} columns",
            index, self.width(),
        );

        polars_ensure!(
            new_column.len() == self.height(),
            ShapeMismatch:
            "unable to replace a column, series length {} doesn't match the DataFrame height {}",
            new_column.len(), self.height(),
        );

        // The bounds and length checks above uphold the frame's invariants;
        // `columns_mut` is unsafe because it allows breaking them directly.
        // NOTE(review): unlike `apply_at_idx`, the new column's name is NOT
        // forced to match the replaced column's name here.
        unsafe { *self.columns_mut().get_mut(index).unwrap() = new_column };

        Ok(self)
    }
1708
1709    /// Apply a closure to a column. This is the recommended way to do in place modification.
1710    ///
1711    /// # Example
1712    ///
1713    /// ```rust
1714    /// # use polars_core::prelude::*;
1715    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
1716    /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
1717    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1718    ///
1719    /// fn str_to_len(str_val: &Column) -> Column {
1720    ///     str_val.str()
1721    ///         .unwrap()
1722    ///         .into_iter()
1723    ///         .map(|opt_name: Option<&str>| {
1724    ///             opt_name.map(|name: &str| name.len() as u32)
1725    ///          })
1726    ///         .collect::<UInt32Chunked>()
1727    ///         .into_column()
1728    /// }
1729    ///
1730    /// // Replace the names column by the length of the names.
1731    /// df.apply("names", str_to_len);
1732    /// # Ok::<(), PolarsError>(())
1733    /// ```
1734    /// Results in:
1735    ///
1736    /// ```text
1737    /// +--------+-------+
1738    /// | foo    |       |
1739    /// | ---    | names |
1740    /// | str    | u32   |
1741    /// +========+=======+
1742    /// | "ham"  | 4     |
1743    /// +--------+-------+
1744    /// | "spam" | 6     |
1745    /// +--------+-------+
1746    /// | "egg"  | 3     |
1747    /// +--------+-------+
1748    /// ```
1749    pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
1750    where
1751        F: FnOnce(&Column) -> C,
1752        C: IntoColumn,
1753    {
1754        let idx = self.try_get_column_index(name)?;
1755        self.apply_at_idx(idx, f)?;
1756        Ok(self)
1757    }
1758
    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
    /// modification.
    ///
    /// A length-1 result is broadcast to the frame's height. The new column
    /// keeps the name of the column it replaces, and the cached schema is
    /// reused when the dtype did not change.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// // Add 32 to get lowercase ascii values
    /// df.apply_at_idx(1, |s| s + 32);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +--------+-------+
    /// | foo    | ascii |
    /// | ---    | ---   |
    /// | str    | i32   |
    /// +========+=======+
    /// | "ham"  | 102   |
    /// +--------+-------+
    /// | "spam" | 111   |
    /// +--------+-------+
    /// | "egg"  | 111   |
    /// +--------+-------+
    /// ```
    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> C,
        C: IntoColumn,
    {
        // Capture these before `columns_mut` takes a mutable borrow of `self`.
        let df_height = self.height();
        let width = self.width();

        // Save the cached schema so it can be restored below if the dtype of
        // the replaced column turns out to be unchanged.
        let cached_schema = self.cached_schema().cloned();

        // NOTE(review): `columns_mut` is unsafe, presumably because it allows
        // violating the frame's length/schema invariants; the length is
        // re-validated below and the schema cache is only restored when the
        // dtype is unchanged — confirm against `columns_mut`'s safety contract.
        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        let mut new_col = f(col).into_column();

        // Broadcast a scalar (length-1) result to the full frame height.
        if new_col.len() != df_height && new_col.len() == 1 {
            new_col = new_col.new_from_index(0, df_height);
        }

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        // Make sure the name remains the same after applying the closure.
        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        // Same dtype => the previously cached schema is still valid.
        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }
1828
    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
    /// modification.
    ///
    /// NOTE(review): unlike [`DataFrame::apply_at_idx`], a length-1 result is
    /// NOT broadcast to the frame's height here — it fails the length check.
    /// Confirm whether that asymmetry is intentional.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
    ///
    /// let idx = vec![0, 1, 4];
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///     .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "ham-is-modified"   | 1      |
    /// +---------------------+--------+
    /// | "spam-is-modified"  | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "quack-is-modified" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> PolarsResult<C>,
        C: IntoColumn,
    {
        // Capture these before `columns_mut` takes a mutable borrow of `self`.
        let df_height = self.height();
        let width = self.width();

        // Save the cached schema so it can be restored below if the dtype of
        // the replaced column turns out to be unchanged.
        let cached_schema = self.cached_schema().cloned();

        // NOTE(review): `columns_mut` is unsafe, presumably because it allows
        // violating the frame's length/schema invariants; see the length check
        // and conditional schema restore below — confirm against its contract.
        let col = unsafe { self.columns_mut() }.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;

        // Propagate the closure's error before touching the frame.
        let mut new_col = f(col).map(|c| c.into_column())?;

        polars_ensure!(
            new_col.len() == df_height,
            ShapeMismatch:
            "try_apply_at_idx: resulting Series has length {} while the DataFrame has height {}",
            new_col.len(), df_height
        );

        // make sure the name remains the same after applying the closure
        new_col = new_col.with_name(col.name().clone());
        let col_before = std::mem::replace(col, new_col);

        // Same dtype => the previously cached schema is still valid.
        if col.dtype() == col_before.dtype() {
            unsafe { self.set_opt_schema(cached_schema) };
        }

        Ok(self)
    }
1905
1906    /// Apply a closure that may fail to a column. This is the recommended way to do in place
1907    /// modification.
1908    ///
1909    /// # Example
1910    ///
1911    /// This is the idiomatic way to replace some values a column of a `DataFrame` given a boolean mask.
1912    ///
1913    /// ```rust
1914    /// # use polars_core::prelude::*;
1915    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
1916    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
1917    /// let mut df = DataFrame::new_infer_height(vec![s0, s1])?;
1918    ///
1919    /// // create a mask
1920    /// let values = df.column("values")?.as_materialized_series();
1921    /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
1922    ///
1923    /// df.try_apply("foo", |c| {
1924    ///     c.str()?
1925    ///     .set(&mask, Some("not_within_bounds"))
1926    /// });
1927    /// # Ok::<(), PolarsError>(())
1928    /// ```
1929    /// Results in:
1930    ///
1931    /// ```text
1932    /// +---------------------+--------+
1933    /// | foo                 | values |
1934    /// | ---                 | ---    |
1935    /// | str                 | i32    |
1936    /// +=====================+========+
1937    /// | "not_within_bounds" | 1      |
1938    /// +---------------------+--------+
1939    /// | "spam"              | 2      |
1940    /// +---------------------+--------+
1941    /// | "egg"               | 3      |
1942    /// +---------------------+--------+
1943    /// | "bacon"             | 4      |
1944    /// +---------------------+--------+
1945    /// | "not_within_bounds" | 5      |
1946    /// +---------------------+--------+
1947    /// ```
1948    pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
1949    where
1950        F: FnOnce(&Series) -> PolarsResult<C>,
1951        C: IntoColumn,
1952    {
1953        let idx = self.try_get_column_index(column)?;
1954        self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
1955    }
1956
1957    /// Slice the [`DataFrame`] along the rows.
1958    ///
1959    /// # Example
1960    ///
1961    /// ```rust
1962    /// # use polars_core::prelude::*;
1963    /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
1964    ///                         "Color" => ["Green", "Red", "White", "White", "Red"])?;
1965    /// let sl: DataFrame = df.slice(2, 3);
1966    ///
1967    /// assert_eq!(sl.shape(), (3, 2));
1968    /// println!("{}", sl);
1969    /// # Ok::<(), PolarsError>(())
1970    /// ```
1971    /// Output:
1972    /// ```text
1973    /// shape: (3, 2)
1974    /// +-------+-------+
1975    /// | Fruit | Color |
1976    /// | ---   | ---   |
1977    /// | str   | str   |
1978    /// +=======+=======+
1979    /// | Grape | White |
1980    /// +-------+-------+
1981    /// | Fig   | White |
1982    /// +-------+-------+
1983    /// | Fig   | Red   |
1984    /// +-------+-------+
1985    /// ```
1986    #[must_use]
1987    pub fn slice(&self, offset: i64, length: usize) -> Self {
1988        if offset == 0 && length == self.height() {
1989            return self.clone();
1990        }
1991
1992        if length == 0 {
1993            return self.clear();
1994        }
1995
1996        let cols = self.apply_columns(|s| s.slice(offset, length));
1997
1998        let height = if let Some(fst) = cols.first() {
1999            fst.len()
2000        } else {
2001            let (_, length) = slice_offsets(offset, length, self.height());
2002            length
2003        };
2004
2005        unsafe { DataFrame::_new_unchecked_impl(height, cols).with_schema_from(self) }
2006    }
2007
2008    /// Split [`DataFrame`] at the given `offset`.
2009    pub fn split_at(&self, offset: i64) -> (Self, Self) {
2010        let (a, b) = self.columns().iter().map(|s| s.split_at(offset)).unzip();
2011
2012        let (idx, _) = slice_offsets(offset, 0, self.height());
2013
2014        let a = unsafe { DataFrame::new_unchecked(idx, a).with_schema_from(self) };
2015        let b = unsafe { DataFrame::new_unchecked(self.height() - idx, b).with_schema_from(self) };
2016        (a, b)
2017    }
2018
2019    #[must_use]
2020    pub fn clear(&self) -> Self {
2021        let cols = self.columns().iter().map(|s| s.clear()).collect::<Vec<_>>();
2022        unsafe { DataFrame::_new_unchecked_impl(0, cols).with_schema_from(self) }
2023    }
2024
2025    #[must_use]
2026    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2027        if offset == 0 && length == self.height() {
2028            return self.clone();
2029        }
2030        let columns = self.apply_columns_par(|s| s.slice(offset, length));
2031        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
2032    }
2033
2034    #[must_use]
2035    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2036        if offset == 0 && length == self.height() {
2037            return self.clone();
2038        }
2039        // @scalar-opt
2040        let columns = self.apply_columns(|s| {
2041            let mut out = s.slice(offset, length);
2042            out.shrink_to_fit();
2043            out
2044        });
2045        unsafe { DataFrame::new_unchecked(length, columns).with_schema_from(self) }
2046    }
2047
2048    /// Get the head of the [`DataFrame`].
2049    ///
2050    /// # Example
2051    ///
2052    /// ```rust
2053    /// # use polars_core::prelude::*;
2054    /// let countries: DataFrame =
2055    ///     df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2056    ///         "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2057    ///         "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2058    ///         "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2059    /// assert_eq!(countries.shape(), (5, 4));
2060    ///
2061    /// println!("{}", countries.head(Some(3)));
2062    /// # Ok::<(), PolarsError>(())
2063    /// ```
2064    ///
2065    /// Output:
2066    ///
2067    /// ```text
2068    /// shape: (3, 4)
2069    /// +--------------------+---------------+---------------+------------+
2070    /// | Rank by GDP (2021) | Continent     | Country       | Capital    |
2071    /// | ---                | ---           | ---           | ---        |
2072    /// | i32                | str           | str           | str        |
2073    /// +====================+===============+===============+============+
2074    /// | 1                  | North America | United States | Washington |
2075    /// +--------------------+---------------+---------------+------------+
2076    /// | 2                  | Asia          | China         | Beijing    |
2077    /// +--------------------+---------------+---------------+------------+
2078    /// | 3                  | Asia          | Japan         | Tokyo      |
2079    /// +--------------------+---------------+---------------+------------+
2080    /// ```
2081    #[must_use]
2082    pub fn head(&self, length: Option<usize>) -> Self {
2083        let new_height = usize::min(self.height(), length.unwrap_or(HEAD_DEFAULT_LENGTH));
2084        let new_cols = self.apply_columns(|c| c.head(Some(new_height)));
2085
2086        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
2087    }
2088
2089    /// Get the tail of the [`DataFrame`].
2090    ///
2091    /// # Example
2092    ///
2093    /// ```rust
2094    /// # use polars_core::prelude::*;
2095    /// let countries: DataFrame =
2096    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
2097    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2098    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2099    /// assert_eq!(countries.shape(), (5, 3));
2100    ///
2101    /// println!("{}", countries.tail(Some(2)));
2102    /// # Ok::<(), PolarsError>(())
2103    /// ```
2104    ///
2105    /// Output:
2106    ///
2107    /// ```text
2108    /// shape: (2, 3)
2109    /// +-------------+--------------------+---------+
2110    /// | Rank (2021) | Apple Price (€/kg) | Country |
2111    /// | ---         | ---                | ---     |
2112    /// | i32         | f64                | str     |
2113    /// +=============+====================+=========+
2114    /// | 108         | 0.65               | Syria   |
2115    /// +-------------+--------------------+---------+
2116    /// | 109         | 0.52               | Turkey  |
2117    /// +-------------+--------------------+---------+
2118    /// ```
2119    #[must_use]
2120    pub fn tail(&self, length: Option<usize>) -> Self {
2121        let new_height = usize::min(self.height(), length.unwrap_or(TAIL_DEFAULT_LENGTH));
2122        let new_cols = self.apply_columns(|c| c.tail(Some(new_height)));
2123
2124        unsafe { DataFrame::new_unchecked(new_height, new_cols).with_schema_from(self) }
2125    }
2126
    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
    ///
    /// `parallel` is only honored under the conditions checked below; a
    /// zero-width frame yields height-only batches.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
    pub fn iter_chunks(
        &self,
        compat_level: CompatLevel,
        parallel: bool,
    ) -> impl Iterator<Item = RecordBatch> + '_ {
        debug_assert!(!self.should_rechunk(), "expected equal chunks");

        // No columns: only the height needs to be conveyed.
        if self.width() == 0 {
            return RecordBatchIterWrap::new_zero_width(self.height());
        }

        // Only parallelize when converting to the oldest compat level
        // (`compat_level.0 == 0`) AND there are String/Binary columns: those
        // must then be re-allocated as arrow strings/binaries, which is the
        // expensive part worth parallelizing. A single column gains nothing.
        let must_convert = compat_level.0 == 0;
        let parallel = parallel
            && must_convert
            && self.width() > 1
            && self
                .columns()
                .iter()
                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));

        RecordBatchIterWrap::Batches(RecordBatchIter {
            df: self,
            schema: Arc::new(
                self.columns()
                    .iter()
                    .map(|c| c.field().to_arrow(compat_level))
                    .collect(),
            ),
            idx: 0,
            // At least one batch is emitted even for a chunkless frame.
            n_chunks: usize::max(1, self.first_col_n_chunks()),
            compat_level,
            parallel,
        })
    }
2172
2173    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2174    ///
2175    /// # Panics
2176    ///
2177    /// Panics if the [`DataFrame`] that is passed is not rechunked.
2178    ///
2179    /// This responsibility is left to the caller as we don't want to take mutable references here,
2180    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2181    /// as well.
2182    pub fn iter_chunks_physical(&self) -> impl Iterator<Item = RecordBatch> + '_ {
2183        debug_assert!(!self.should_rechunk());
2184
2185        if self.width() == 0 {
2186            return RecordBatchIterWrap::new_zero_width(self.height());
2187        }
2188
2189        RecordBatchIterWrap::PhysicalBatches(PhysRecordBatchIter {
2190            schema: Arc::new(
2191                self.columns()
2192                    .iter()
2193                    .map(|c| c.field().to_arrow(CompatLevel::newest()))
2194                    .collect(),
2195            ),
2196            arr_iters: self
2197                .materialized_column_iter()
2198                .map(|s| s.chunks().iter())
2199                .collect(),
2200        })
2201    }
2202
2203    /// Get a [`DataFrame`] with all the columns in reversed order.
2204    #[must_use]
2205    pub fn reverse(&self) -> Self {
2206        let new_cols = self.apply_columns(Column::reverse);
2207        unsafe { DataFrame::new_unchecked(self.height(), new_cols).with_schema_from(self) }
2208    }
2209
2210    /// Shift the values by a given period and fill the parts that will be empty due to this operation
2211    /// with `Nones`.
2212    ///
2213    /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2214    #[must_use]
2215    pub fn shift(&self, periods: i64) -> Self {
2216        let col = self.apply_columns_par(|s| s.shift(periods));
2217        unsafe { DataFrame::new_unchecked(self.height(), col).with_schema_from(self) }
2218    }
2219
2220    /// Replace None values with one of the following strategies:
2221    /// * Forward fill (replace None with the previous value)
2222    /// * Backward fill (replace None with the next value)
2223    /// * Mean fill (replace None with the mean of the whole array)
2224    /// * Min fill (replace None with the minimum of the whole array)
2225    /// * Max fill (replace None with the maximum of the whole array)
2226    ///
2227    /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2228    pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2229        let col = self.try_apply_columns_par(|s| s.fill_null(strategy))?;
2230
2231        Ok(unsafe { DataFrame::new_unchecked(self.height(), col) })
2232    }
2233
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Consumes `self` and passes it by value to `f`; use
    /// [`DataFrame::pipe_mut`] to pipe by mutable reference instead.
    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
    where
        F: Fn(DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }
2241
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Like [`DataFrame::pipe`], but passes `self` by mutable reference so the
    /// frame can be modified in place.
    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
    where
        F: Fn(&mut DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }
2249
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Like [`DataFrame::pipe`], but also forwards an extra `args` value to `f`.
    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
    where
        F: Fn(DataFrame, Args) -> PolarsResult<B>,
    {
        f(self, args)
    }
2257    /// Drop duplicate rows from a [`DataFrame`].
2258    /// *This fails when there is a column of type List in DataFrame*
2259    ///
2260    /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2261    ///
2262    /// # Example
2263    ///
2264    /// ```no_run
2265    /// # use polars_core::prelude::*;
2266    /// let df = df! {
2267    ///               "flt" => [1., 1., 2., 2., 3., 3.],
2268    ///               "int" => [1, 1, 2, 2, 3, 3, ],
2269    ///               "str" => ["a", "a", "b", "b", "c", "c"]
2270    ///           }?;
2271    ///
2272    /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2273    /// # Ok::<(), PolarsError>(())
2274    /// ```
2275    /// Returns
2276    ///
2277    /// ```text
2278    /// +-----+-----+-----+
2279    /// | flt | int | str |
2280    /// | --- | --- | --- |
2281    /// | f64 | i32 | str |
2282    /// +=====+=====+=====+
2283    /// | 1   | 1   | "a" |
2284    /// +-----+-----+-----+
2285    /// | 2   | 2   | "b" |
2286    /// +-----+-----+-----+
2287    /// | 3   | 3   | "c" |
2288    /// +-----+-----+-----+
2289    /// ```
2290    #[cfg(feature = "algorithm_group_by")]
2291    pub fn unique_stable(
2292        &self,
2293        subset: Option<&[String]>,
2294        keep: UniqueKeepStrategy,
2295        slice: Option<(i64, usize)>,
2296    ) -> PolarsResult<DataFrame> {
2297        self.unique_impl(
2298            true,
2299            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2300            keep,
2301            slice,
2302        )
2303    }
2304
2305    /// Unstable distinct. See [`DataFrame::unique_stable`].
2306    #[cfg(feature = "algorithm_group_by")]
2307    pub fn unique<I, S>(
2308        &self,
2309        subset: Option<&[String]>,
2310        keep: UniqueKeepStrategy,
2311        slice: Option<(i64, usize)>,
2312    ) -> PolarsResult<DataFrame> {
2313        self.unique_impl(
2314            false,
2315            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2316            keep,
2317            slice,
2318        )
2319    }
2320
    /// Shared implementation for [`DataFrame::unique`] / [`DataFrame::unique_stable`].
    ///
    /// `maintain_order` selects a stable group-by (original row order kept);
    /// `subset` restricts the duplicate check to those columns (all columns
    /// when `None`); `slice` takes `(offset, len)` of the resulting groups.
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_impl(
        &self,
        maintain_order: bool,
        subset: Option<Vec<PlSmallStr>>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        // Zero-width frame: at most one "row" can be distinct.
        if self.width() == 0 {
            let height = usize::min(self.height(), 1);
            return Ok(DataFrame::empty_with_height(height));
        }

        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
        let mut df = self.clone();
        // take on multiple chunks is terrible
        df.rechunk_mut_par();

        let columns = match (keep, maintain_order) {
            // Stable First/Any: stable group-by, then take each group's first row.
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, true) => {
                // maintain order by last values, so the sorted groups are not correct as they
                // are sorted by the first value
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();

                // Collect the row index of each group's last occurrence.
                let last_idx: NoNull<IdxCa> = groups
                    .iter()
                    .map(|g| match g {
                        GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
                        GroupsIndicator::Slice([first, len]) => first + len - 1,
                    })
                    .collect();

                // Sort the indices ascending so rows come out in frame order.
                let mut last_idx = last_idx.into_inner().sort(false);

                if let Some((offset, len)) = slice {
                    last_idx = last_idx.slice(offset, len);
                }

                let last_idx = NoNull::new(last_idx);
                // SAFETY-NOTE(review): indices originate from this frame's own
                // groups, so they are presumably in bounds for `take_unchecked`.
                let out = unsafe { df.take_unchecked(&last_idx) };
                return Ok(out);
            },
            // Unordered First/Any: plain group-by suffices.
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_first(&groups) })
            },
            // Unordered Last: take each group's last row.
            (UniqueKeepStrategy::Last, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df.apply_columns_par(|s| unsafe { s.agg_last(&groups) })
            },
            // None: keep only rows that are unique within the subset.
            (UniqueKeepStrategy::None, _) => {
                let df_part = df.select(names)?;
                let mask = df_part.is_unique()?;
                let mut filtered = df.filter(&mask)?;

                if let Some((offset, len)) = slice {
                    filtered = filtered.slice(offset, len);
                }
                return Ok(filtered);
            },
        };
        Ok(unsafe { DataFrame::new_unchecked_infer_height(columns).with_schema_from(self) })
    }
2398
2399    /// Get a mask of all the unique rows in the [`DataFrame`].
2400    ///
2401    /// # Example
2402    ///
2403    /// ```no_run
2404    /// # use polars_core::prelude::*;
2405    /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
2406    ///                         "ISIN" => ["US0378331005", "US5949181045"])?;
2407    /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
2408    ///
2409    /// assert!(ca.all());
2410    /// # Ok::<(), PolarsError>(())
2411    /// ```
2412    #[cfg(feature = "algorithm_group_by")]
2413    pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
2414        let gb = self.group_by(self.get_column_names_owned())?;
2415        let groups = gb.get_groups();
2416        Ok(is_unique_helper(
2417            groups,
2418            self.height() as IdxSize,
2419            true,
2420            false,
2421        ))
2422    }
2423
2424    /// Get a mask of all the duplicated rows in the [`DataFrame`].
2425    ///
2426    /// # Example
2427    ///
2428    /// ```no_run
2429    /// # use polars_core::prelude::*;
2430    /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
2431    ///                         "ISIN" => ["US02079K3059", "US02079K1079"])?;
2432    /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
2433    ///
2434    /// assert!(!ca.all());
2435    /// # Ok::<(), PolarsError>(())
2436    /// ```
2437    #[cfg(feature = "algorithm_group_by")]
2438    pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
2439        let gb = self.group_by(self.get_column_names_owned())?;
2440        let groups = gb.get_groups();
2441        Ok(is_unique_helper(
2442            groups,
2443            self.height() as IdxSize,
2444            false,
2445            true,
2446        ))
2447    }
2448
2449    /// Create a new [`DataFrame`] that shows the null counts per column.
2450    #[must_use]
2451    pub fn null_count(&self) -> Self {
2452        let cols =
2453            self.apply_columns(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]));
2454        unsafe { Self::new_unchecked(1, cols) }
2455    }
2456
2457    /// Hash and combine the row values
2458    #[cfg(feature = "row_hash")]
2459    pub fn hash_rows(
2460        &mut self,
2461        hasher_builder: Option<PlSeedableRandomStateQuality>,
2462    ) -> PolarsResult<UInt64Chunked> {
2463        let dfs = split_df(self, POOL.current_num_threads(), false);
2464        let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
2465
2466        let mut iter = cas.into_iter();
2467        let mut acc_ca = iter.next().unwrap();
2468        for ca in iter {
2469            acc_ca.append(&ca)?;
2470        }
2471        Ok(acc_ca.rechunk().into_owned())
2472    }
2473
2474    /// Get the supertype of the columns in this DataFrame
2475    pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
2476        self.columns()
2477            .iter()
2478            .map(|s| Ok(s.dtype().clone()))
2479            .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
2480    }
2481
    /// Take by index values given by the slice `idx`.
    ///
    /// # Safety
    /// Every index in `idx` must be in bounds for this [`DataFrame`]; they are
    /// not checked. Delegates to `_take_unchecked_slice_sorted` with
    /// `IsSorted::Not` (no sortedness assumed).
    ///
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop
    /// every thread split may be on rayon stack and lead to SO
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
    }
2490
2491    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
2492    /// if the index value in `idx` are sorted. This will maintain sorted flags.
2493    ///
2494    /// # Warning
2495    /// Be careful with allowing threads when calling this in a large hot loop
2496    /// every thread split may be on rayon stack and lead to SO
2497    #[doc(hidden)]
2498    pub unsafe fn _take_unchecked_slice_sorted(
2499        &self,
2500        idx: &[IdxSize],
2501        allow_threads: bool,
2502        sorted: IsSorted,
2503    ) -> Self {
2504        #[cfg(debug_assertions)]
2505        {
2506            if idx.len() > 2 {
2507                use crate::series::IsSorted;
2508
2509                match sorted {
2510                    IsSorted::Ascending => {
2511                        assert!(idx[0] <= idx[idx.len() - 1]);
2512                    },
2513                    IsSorted::Descending => {
2514                        assert!(idx[0] >= idx[idx.len() - 1]);
2515                    },
2516                    _ => {},
2517                }
2518            }
2519        }
2520        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
2521        ca.set_sorted_flag(sorted);
2522        self.take_unchecked_impl(&ca, allow_threads)
2523    }
    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
    #[doc(hidden)]
    /// Shared implementation behind `partition_by`/`partition_by_stable`:
    /// group by `cols` and materialize one [`DataFrame`] per group.
    /// `stable` preserves group order; `parallel` gathers groups on the rayon pool.
    pub fn _partition_by_impl(
        &self,
        cols: &[PlSmallStr],
        stable: bool,
        include_key: bool,
        parallel: bool,
    ) -> PolarsResult<Vec<DataFrame>> {
        let selected_keys = self.select_to_vec(cols.iter().cloned())?;
        let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
        let groups = groups.into_groups();

        // drop key columns prior to calculation if requested
        let df = if include_key {
            self.clone()
        } else {
            self.drop_many(cols.iter().cloned())
        };

        if parallel {
            // don't parallelize this
            // there is a lot of parallelization in take and this may easily SO
            // (hence `allow_threads = false` in the per-group take below)
            POOL.install(|| {
                match groups.as_ref() {
                    GroupsType::Idx(idx) => {
                        // Rechunk as the gather may rechunk for every group #17562.
                        let mut df = df.clone();
                        df.rechunk_mut_par();
                        Ok(idx
                            .into_par_iter()
                            .map(|(_, group)| {
                                // groups are in bounds
                                unsafe {
                                    df._take_unchecked_slice_sorted(
                                        group,
                                        false,
                                        IsSorted::Ascending,
                                    )
                                }
                            })
                            .collect())
                    },
                    // Slice groups are contiguous row ranges: a plain slice suffices.
                    GroupsType::Slice { groups, .. } => Ok(groups
                        .into_par_iter()
                        .map(|[first, len]| df.slice(*first as i64, *len as usize))
                        .collect()),
                }
            })
        } else {
            match groups.as_ref() {
                GroupsType::Idx(idx) => {
                    // Rechunk as the gather may rechunk for every group #17562.
                    let mut df = df;
                    df.rechunk_mut();
                    Ok(idx
                        .into_iter()
                        .map(|(_, group)| {
                            // groups are in bounds
                            unsafe {
                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
                            }
                        })
                        .collect())
                },
                // Slice groups are contiguous row ranges: a plain slice suffices.
                GroupsType::Slice { groups, .. } => Ok(groups
                    .iter()
                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
                    .collect()),
            }
        }
    }
2596
2597    /// Split into multiple DataFrames partitioned by groups
2598    #[cfg(feature = "partition_by")]
2599    pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
2600    where
2601        I: IntoIterator<Item = S>,
2602        S: Into<PlSmallStr>,
2603    {
2604        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
2605        self._partition_by_impl(cols.as_slice(), false, include_key, true)
2606    }
2607
2608    /// Split into multiple DataFrames partitioned by groups
2609    /// Order of the groups are maintained.
2610    #[cfg(feature = "partition_by")]
2611    pub fn partition_by_stable<I, S>(
2612        &self,
2613        cols: I,
2614        include_key: bool,
2615    ) -> PolarsResult<Vec<DataFrame>>
2616    where
2617        I: IntoIterator<Item = S>,
2618        S: Into<PlSmallStr>,
2619    {
2620        let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
2621        self._partition_by_impl(cols.as_slice(), true, include_key, true)
2622    }
2623
2624    /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
2625    /// inserted as columns.
2626    #[cfg(feature = "dtype-struct")]
2627    pub fn unnest(
2628        &self,
2629        cols: impl IntoIterator<Item = impl Into<PlSmallStr>>,
2630        separator: Option<&str>,
2631    ) -> PolarsResult<DataFrame> {
2632        self.unnest_impl(cols.into_iter().map(Into::into).collect(), separator)
2633    }
2634
2635    #[cfg(feature = "dtype-struct")]
2636    fn unnest_impl(
2637        &self,
2638        cols: PlHashSet<PlSmallStr>,
2639        separator: Option<&str>,
2640    ) -> PolarsResult<DataFrame> {
2641        let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
2642        let mut count = 0;
2643        for s in self.columns() {
2644            if cols.contains(s.name()) {
2645                let ca = s.struct_()?.clone();
2646                new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
2647                    if let Some(separator) = &separator {
2648                        f.rename(polars_utils::format_pl_smallstr!(
2649                            "{}{}{}",
2650                            s.name(),
2651                            separator,
2652                            f.name()
2653                        ));
2654                    }
2655                    Column::from(f)
2656                }));
2657                count += 1;
2658            } else {
2659                new_cols.push(s.clone())
2660            }
2661        }
2662        if count != cols.len() {
2663            // one or more columns not found
2664            // the code below will return an error with the missing name
2665            let schema = self.schema();
2666            for col in cols {
2667                let _ = schema
2668                    .get(col.as_str())
2669                    .ok_or_else(|| polars_err!(col_not_found = col))?;
2670            }
2671        }
2672
2673        DataFrame::new(self.height(), new_cols)
2674    }
2675
2676    pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
2677        // @Optimize: this does a lot of unnecessary allocations. We should probably have a
2678        // append_chunk or something like this. It is just quite difficult to make that safe.
2679        let df = DataFrame::from(rb);
2680        polars_ensure!(
2681            self.schema() == df.schema(),
2682            SchemaMismatch: "cannot append record batch with different schema\n\n
2683        Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
2684        );
2685        self.vstack_mut_owned_unchecked(df);
2686        Ok(())
2687    }
2688}
2689
/// Iterator that yields one [`RecordBatch`] per chunk of a [`DataFrame`].
pub struct RecordBatchIter<'a> {
    df: &'a DataFrame,
    // Arrow schema attached to every emitted batch.
    schema: ArrowSchemaRef,
    // Next chunk index to emit.
    idx: usize,
    // Total number of chunks to emit.
    n_chunks: usize,
    compat_level: CompatLevel,
    // When true, columns are converted to arrow in parallel on the thread pool.
    parallel: bool,
}
2698
2699impl Iterator for RecordBatchIter<'_> {
2700    type Item = RecordBatch;
2701
2702    fn next(&mut self) -> Option<Self::Item> {
2703        if self.idx >= self.n_chunks {
2704            return None;
2705        }
2706
2707        // Create a batch of the columns with the same chunk no.
2708        let batch_cols: Vec<ArrayRef> = if self.parallel {
2709            let iter = self
2710                .df
2711                .columns()
2712                .par_iter()
2713                .map(Column::as_materialized_series)
2714                .map(|s| s.to_arrow(self.idx, self.compat_level));
2715            POOL.install(|| iter.collect())
2716        } else {
2717            self.df
2718                .columns()
2719                .iter()
2720                .map(Column::as_materialized_series)
2721                .map(|s| s.to_arrow(self.idx, self.compat_level))
2722                .collect()
2723        };
2724
2725        let length = batch_cols.first().map_or(0, |arr| arr.len());
2726
2727        self.idx += 1;
2728
2729        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
2730    }
2731
2732    fn size_hint(&self) -> (usize, Option<usize>) {
2733        let n = self.n_chunks - self.idx;
2734        (n, Some(n))
2735    }
2736}
2737
/// Iterator over the already-materialized chunks of a set of columns,
/// yielding one [`RecordBatch`] per aligned chunk.
pub struct PhysRecordBatchIter<'a> {
    schema: ArrowSchemaRef,
    // One array iterator per column; advanced in lock-step.
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}
2742
2743impl Iterator for PhysRecordBatchIter<'_> {
2744    type Item = RecordBatch;
2745
2746    fn next(&mut self) -> Option<Self::Item> {
2747        let arrs = self
2748            .arr_iters
2749            .iter_mut()
2750            .map(|phys_iter| phys_iter.next().cloned())
2751            .collect::<Option<Vec<_>>>()?;
2752
2753        let length = arrs.first().map_or(0, |arr| arr.len());
2754        Some(RecordBatch::new(length, self.schema.clone(), arrs))
2755    }
2756
2757    fn size_hint(&self) -> (usize, Option<usize>) {
2758        if let Some(iter) = self.arr_iters.first() {
2759            iter.size_hint()
2760        } else {
2761            (0, None)
2762        }
2763    }
2764}
2765
/// Record-batch iterator that also handles frames without any columns.
pub enum RecordBatchIterWrap<'a> {
    /// Frame has no columns: emit empty batches that only carry a row count.
    ZeroWidth {
        remaining_height: usize,
        chunk_size: usize,
    },
    /// Convert columns chunk-by-chunk via [`RecordBatchIter`].
    Batches(RecordBatchIter<'a>),
    /// Stream pre-materialized chunks via [`PhysRecordBatchIter`].
    PhysicalBatches(PhysRecordBatchIter<'a>),
}
2774
impl<'a> RecordBatchIterWrap<'a> {
    /// Build an iterator for a column-less frame: emits empty batches whose
    /// combined height equals `height`, chunked by the configured ideal
    /// morsel size.
    fn new_zero_width(height: usize) -> Self {
        Self::ZeroWidth {
            remaining_height: height,
            chunk_size: polars_config::config().ideal_morsel_size() as usize,
        }
    }
}
2783
2784impl Iterator for RecordBatchIterWrap<'_> {
2785    type Item = RecordBatch;
2786
2787    fn next(&mut self) -> Option<Self::Item> {
2788        match self {
2789            Self::ZeroWidth {
2790                remaining_height,
2791                chunk_size,
2792            } => {
2793                let n = usize::min(*remaining_height, *chunk_size);
2794                *remaining_height -= n;
2795
2796                (n > 0).then(|| RecordBatch::new(n, ArrowSchemaRef::default(), vec![]))
2797            },
2798            Self::Batches(v) => v.next(),
2799            Self::PhysicalBatches(v) => v.next(),
2800        }
2801    }
2802
2803    fn size_hint(&self) -> (usize, Option<usize>) {
2804        match self {
2805            Self::ZeroWidth {
2806                remaining_height,
2807                chunk_size,
2808            } => {
2809                let n = remaining_height.div_ceil(*chunk_size);
2810                (n, Some(n))
2811            },
2812            Self::Batches(v) => v.size_hint(),
2813            Self::PhysicalBatches(v) => v.size_hint(),
2814        }
2815    }
2816}
2817
2818// utility to test if we can vstack/extend the columns
2819fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
2820    polars_ensure!(
2821        left.name() == right.name(),
2822        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
2823        left.name(), right.name(),
2824    );
2825    Ok(())
2826}
2827
#[cfg(test)]
mod test {
    use super::*;

    // Shared fixture: a 3-row frame with an integer and a float column.
    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new_infer_height(vec![s0, s1]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    // A single-chunk frame yields exactly one record batch of full height.
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    // Column lookup by name + elementwise equality: exactly one row matches 1.
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    // Filtering everything out of a string column must still leave one chunk.
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new_infer_height(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    // An all-false mask over a list column yields an empty, single-chunk result.
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    // Slicing the first two rows of a 3x2 frame gives a 2x2 frame.
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    // A freshly built frame has aligned chunks, so no rechunk is needed.
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    // Adding a multi-chunk column next to single-chunk ones requires a rechunk.
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s.into_column())?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    // `with_column` replaces an existing column of the same name and appends
    // a column with a new name.
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Column::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Column::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    // `unique_stable` with `First` keeps one row per duplicated group.
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunks
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    // vstacking onto an empty frame adopts the appended frame's rows.
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height(), 6)
    }

    #[test]
    // `UniqueKeepStrategy::None` drops every row that has a duplicate;
    // only the value 3 (which occurs once) survives.
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    // `apply` replacing a column with a different dtype must update the schema.
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}