polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::UnitVec;
10use polars_utils::itertools::Itertools;
11use rayon::prelude::*;
12
13use crate::chunked_array::flags::StatisticsFlags;
14#[cfg(feature = "algorithm_group_by")]
15use crate::chunked_array::ops::unique::is_unique_helper;
16use crate::prelude::*;
17#[cfg(feature = "row_hash")]
18use crate::utils::split_df;
19use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
20use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
21
22#[cfg(feature = "dataframe_arithmetic")]
23mod arithmetic;
24pub mod builder;
25mod chunks;
26pub use chunks::chunk_df_for_writing;
27pub mod column;
28pub mod explode;
29mod from;
30#[cfg(feature = "algorithm_group_by")]
31pub mod group_by;
32pub(crate) mod horizontal;
33#[cfg(feature = "proptest")]
34pub mod proptest;
35#[cfg(any(feature = "rows", feature = "object"))]
36pub mod row;
37mod top_k;
38mod upstream_traits;
39mod validation;
40
41use arrow::record_batch::{RecordBatch, RecordBatchT};
42use polars_utils::pl_str::PlSmallStr;
43#[cfg(feature = "serde")]
44use serde::{Deserialize, Serialize};
45use strum_macros::IntoStaticStr;
46
47use crate::POOL;
48#[cfg(feature = "row_hash")]
49use crate::hashing::_df_rows_to_hashes_threaded_vertical;
50use crate::prelude::sort::arg_sort;
51use crate::series::IsSorted;
52
53#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
54#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
55#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
56#[strum(serialize_all = "snake_case")]
57pub enum UniqueKeepStrategy {
58 /// Keep the first unique row.
59 First,
60 /// Keep the last unique row.
61 Last,
62 /// Keep None of the unique rows.
63 None,
64 /// Keep any of the unique rows.
65 /// This allows more optimizations.
66 #[default]
67 Any,
68}
69
70fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
71where
72 F: for<'a> FnMut(&'a T) -> &'a str,
73{
74 // Always unique.
75 if items.len() <= 1 {
76 return Ok(());
77 }
78
79 if items.len() <= 4 {
80 // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
81 for i in 0..items.len() - 1 {
82 let name = get_name(&items[i]);
83 for other in items.iter().skip(i + 1) {
84 if name == get_name(other) {
85 polars_bail!(duplicate = name);
86 }
87 }
88 }
89 } else {
90 let mut names = PlHashSet::with_capacity(items.len());
91 for item in items {
92 let name = get_name(item);
93 if !names.insert(name) {
94 polars_bail!(duplicate = name);
95 }
96 }
97 }
98 Ok(())
99}
100
101/// A contiguous growable collection of `Series` that have the same length.
102///
103/// ## Use declarations
104///
105/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
106///
107/// ```rust
108/// use polars_core::prelude::*; // if the crate polars-core is used directly
109/// // use polars::prelude::*; if the crate polars is used
110/// ```
111///
112/// # Initialization
113/// ## Default
114///
115/// A `DataFrame` can be initialized empty:
116///
117/// ```rust
118/// # use polars_core::prelude::*;
119/// let df = DataFrame::default();
120/// assert!(df.is_empty());
121/// ```
122///
123/// ## Wrapping a `Vec<Series>`
124///
125/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
126///
127/// ```rust
128/// # use polars_core::prelude::*;
129/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
130/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
131///
132/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
133/// ```
134///
135/// ## Using a macro
136///
137/// The [`df!`] macro is a convenient method:
138///
139/// ```rust
140/// # use polars_core::prelude::*;
141/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
142/// "Color" => ["Red", "Yellow", "Green"]);
143/// ```
144///
145/// ## Using a CSV file
146///
147/// See the `polars_io::csv::CsvReader`.
148///
149/// # Indexing
150/// ## By a number
151///
152/// The `Index<usize>` is implemented for the `DataFrame`.
153///
154/// ```rust
155/// # use polars_core::prelude::*;
156/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
157/// "Color" => ["Red", "Yellow", "Green"])?;
158///
159/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
160/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
161/// # Ok::<(), PolarsError>(())
162/// ```
163///
164/// ## By a `Series` name
165///
166/// ```rust
167/// # use polars_core::prelude::*;
168/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
169/// "Color" => ["Red", "Yellow", "Green"])?;
170///
171/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
172/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
173/// # Ok::<(), PolarsError>(())
174/// ```
175#[derive(Clone)]
176pub struct DataFrame {
177 height: usize,
178 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
179 pub(crate) columns: Vec<Column>,
180
181 /// A cached schema. This might not give correct results if the DataFrame was modified in place
182 /// between caching the schema and reading it.
183 cached_schema: OnceLock<SchemaRef>,
184}
185
186impl DataFrame {
187 pub fn clear_schema(&mut self) {
188 self.cached_schema = OnceLock::new();
189 }
190
191 #[inline]
192 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
193 self.columns.iter()
194 }
195
196 #[inline]
197 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
198 self.columns.iter().map(Column::as_materialized_series)
199 }
200
201 #[inline]
202 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
203 self.columns.par_iter().map(Column::as_materialized_series)
204 }
205
206 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
207 ///
208 /// # Implementation
209 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
210 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
211 /// sum of the sizes computed from this function. In particular, a [`StructArray`]'s size is an upper bound.
212 ///
213 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
214 /// However, this function will yield a smaller number. This is because this function returns
215 /// the visible size of the buffer, not its total capacity.
216 ///
217 /// FFI buffers are included in this estimation.
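    ///
    /// # Example
    ///
    /// A rough sketch of what to expect; the exact number depends on the buffers backing the
    /// columns, so only a lower bound is asserted here.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("values" => [1i32, 2, 3])?;
    /// // Three i32 values need at least 12 bytes of buffer space.
    /// assert!(df.estimated_size() >= 3 * std::mem::size_of::<i32>());
    /// # Ok::<(), PolarsError>(())
    /// ```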
218 pub fn estimated_size(&self) -> usize {
219 self.columns.iter().map(Column::estimated_size).sum()
220 }
221
222 // Reduce monomorphization.
223 fn try_apply_columns(
224 &self,
225 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
226 ) -> PolarsResult<Vec<Column>> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
231 self.columns.iter().map(func).collect()
232 }
233 // Reduce monomorphization.
234 fn try_apply_columns_par(
235 &self,
236 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
237 ) -> PolarsResult<Vec<Column>> {
238 POOL.install(|| self.columns.par_iter().map(func).collect())
239 }
240 // Reduce monomorphization.
241 pub fn _apply_columns_par(
242 &self,
243 func: &(dyn Fn(&Column) -> Column + Send + Sync),
244 ) -> Vec<Column> {
245 POOL.install(|| self.columns.par_iter().map(func).collect())
246 }
247
248 /// Get the index of the column.
249 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
250 self.get_column_index(name)
251 .ok_or_else(|| polars_err!(col_not_found = name))
252 }
253
254 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
255 polars_ensure!(
256 self.columns.iter().all(|s| s.name().as_str() != name),
257 Duplicate: "column with name {:?} is already present in the DataFrame", name
258 );
259 Ok(())
260 }
261
262 /// Reserve additional slots into the chunks of the series.
263 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
264 for s in &mut self.columns {
265 if let Column::Series(s) = s {
266 // SAFETY:
267 // do not modify the data, simply resize.
268 unsafe { s.chunks_mut().reserve(additional) }
269 }
270 }
271 }
272
273 /// Create a DataFrame from a Vector of Columns.
274 ///
275 /// Errors if the column names are not unique, or if the column heights are not all equal.
276 ///
277 /// # Example
278 ///
279 /// ```
280 /// # use polars_core::prelude::*;
281 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
282 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
283 ///
284 /// let df = DataFrame::new(vec![s0, s1])?;
285 /// # Ok::<(), PolarsError>(())
286 /// ```
287 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
288 DataFrame::validate_columns_slice(&columns)
289 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
290 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
291 }
292
293 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
294 for col in &columns {
295 polars_ensure!(
296 col.len() == height,
297 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
298 columns[0].name(), height, col.name(), col.len()
299 );
300 }
301
302 ensure_names_unique(&columns, |s| s.name().as_str())?;
303
304 Ok(DataFrame {
305 height,
306 columns,
307 cached_schema: OnceLock::new(),
308 })
309 }
310
311 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
312 /// columns to match the other columns.
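    ///
    /// # Example
    ///
    /// A minimal sketch of the broadcasting behaviour:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let a = Column::new("a".into(), [1, 2, 3]);
    /// let b = Column::new("b".into(), [10]); // length 1, broadcast to length 3
    /// let df = DataFrame::new_with_broadcast(vec![a, b])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```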
313 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
314 // The length of the longest non-unit length column determines the
315 // broadcast length. If all columns are unit-length the broadcast length
316 // is one.
317 let broadcast_len = columns
318 .iter()
319 .map(|s| s.len())
320 .filter(|l| *l != 1)
321 .max()
322 .unwrap_or(1);
323 Self::new_with_broadcast_len(columns, broadcast_len)
324 }
325
326 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
327 /// columns to broadcast_len.
328 pub fn new_with_broadcast_len(
329 columns: Vec<Column>,
330 broadcast_len: usize,
331 ) -> PolarsResult<Self> {
332 ensure_names_unique(&columns, |s| s.name().as_str())?;
333 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
334 }
335
336 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
337 /// columns to match the other columns.
338 ///
339 /// # Safety
340 /// Does not check that the column names are unique (which they must be).
341 pub unsafe fn new_with_broadcast_no_namecheck(
342 mut columns: Vec<Column>,
343 broadcast_len: usize,
344 ) -> PolarsResult<Self> {
345 for col in &mut columns {
346 // Length not equal to the broadcast len, needs broadcast or is an error.
347 let len = col.len();
348 if len != broadcast_len {
349 if len != 1 {
350 let name = col.name().to_owned();
351 let extra_info =
352 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
353 format!(" (matching column '{}')", c.name())
354 } else {
355 String::new()
356 };
357 polars_bail!(
358 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
359 );
360 }
361 *col = col.new_from_index(0, broadcast_len);
362 }
363 }
364
365 let length = if columns.is_empty() { 0 } else { broadcast_len };
366
367 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
368 }
369
370 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
371 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
372 unsafe { Self::new_no_checks(height, cols.collect()) }
373 }
374
375 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
376 ///
377 /// # Example
378 ///
379 /// ```rust
380 /// use polars_core::prelude::DataFrame;
381 /// static EMPTY: DataFrame = DataFrame::empty();
382 /// ```
383 pub const fn empty() -> Self {
384 Self::empty_with_height(0)
385 }
386
387 /// Creates an empty `DataFrame` with a specific `height`.
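    ///
    /// # Example
    ///
    /// A small sketch; the frame has no columns, but the height is still tracked:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = DataFrame::empty_with_height(10);
    /// assert_eq!(df.shape(), (10, 0));
    /// ```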
388 pub const fn empty_with_height(height: usize) -> Self {
389 DataFrame {
390 height,
391 columns: vec![],
392 cached_schema: OnceLock::new(),
393 }
394 }
395
396 /// Create an empty `DataFrame` with empty columns as per the `schema`.
397 pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
398 let mut df = Self::empty_with_schema(&schema);
399 df.cached_schema = OnceLock::from(schema);
400 df
401 }
402
403 /// Create an empty `DataFrame` with empty columns as per the `schema`.
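    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![
    ///     Field::new("id".into(), DataType::Int64),
    ///     Field::new("name".into(), DataType::String),
    /// ]);
    /// let df = DataFrame::empty_with_schema(&schema);
    /// assert_eq!(df.shape(), (0, 2));
    /// assert_eq!(df.dtypes(), &[DataType::Int64, DataType::String]);
    /// ```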
404 pub fn empty_with_schema(schema: &Schema) -> Self {
405 let cols = schema
406 .iter()
407 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
408 .collect();
409 unsafe { DataFrame::new_no_checks(0, cols) }
410 }
411
412 /// Create an empty `DataFrame` with empty columns as per the `schema`.
413 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
414 let cols = schema
415 .iter_values()
416 .map(|fld| {
417 Column::from(Series::new_empty(
418 fld.name.clone(),
419 &(DataType::from_arrow_field(fld)),
420 ))
421 })
422 .collect();
423 unsafe { DataFrame::new_no_checks(0, cols) }
424 }
425
426 /// Create a new `DataFrame` with the given schema, only containing nulls.
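    ///
    /// # Example
    ///
    /// A small sketch; every value in the resulting frame should be null:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
    /// let df = DataFrame::full_null(&schema, 3);
    /// assert_eq!(df.shape(), (3, 1));
    /// assert_eq!(df.column("x")?.null_count(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```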
427 pub fn full_null(schema: &Schema, height: usize) -> Self {
428 let columns = schema
429 .iter_fields()
430 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
431 .collect();
432 unsafe { DataFrame::new_no_checks(height, columns) }
433 }
434
435 /// Ensure this DataFrame matches the given schema. Casts null columns to
436 /// the expected schema if necessary (but nothing else).
437 pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
438 let mut any_needed_cast = false;
439 for (col, (name, dt)) in self.columns.iter_mut().zip(schema.iter()) {
440 polars_ensure!(
441 col.name() == name,
442 SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
443 name,
444 col.name()
445 );
446
447 let needs_cast = col.dtype().matches_schema_type(dt)?;
448 any_needed_cast |= needs_cast;
449 if needs_cast {
450 *col = col.cast(dt)?;
451 }
452 }
453 if any_needed_cast {
454 self.clear_schema();
455 }
456 Ok(())
457 }
458
459 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
460 ///
461 /// # Example
462 ///
463 /// ```rust
464 /// # use polars_core::prelude::*;
465 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
466 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
467 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
468 ///
469 /// assert_eq!(df.pop(), Some(s2));
470 /// assert_eq!(df.pop(), Some(s1));
471 /// assert_eq!(df.pop(), None);
472 /// assert!(df.is_empty());
473 /// # Ok::<(), PolarsError>(())
474 /// ```
475 pub fn pop(&mut self) -> Option<Column> {
476 self.clear_schema();
477
478 self.columns.pop()
479 }
480
481 /// Add a new column at index 0 that counts the rows.
482 ///
483 /// # Example
484 ///
485 /// ```
486 /// # use polars_core::prelude::*;
487 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
488 /// assert_eq!(df1.shape(), (4, 1));
489 ///
490 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
491 /// assert_eq!(df2.shape(), (4, 2));
492 /// println!("{}", df2);
493 ///
494 /// # Ok::<(), PolarsError>(())
495 /// ```
496 ///
497 /// Output:
498 ///
499 /// ```text
500 /// shape: (4, 2)
501 /// +-----+----------+
502 /// | Id | Name |
503 /// | --- | --- |
504 /// | u32 | str |
505 /// +=====+==========+
506 /// | 0 | James |
507 /// +-----+----------+
508 /// | 1 | Mary |
509 /// +-----+----------+
510 /// | 2 | John |
511 /// +-----+----------+
512 /// | 3 | Patricia |
513 /// +-----+----------+
514 /// ```
515 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
516 let mut columns = Vec::with_capacity(self.columns.len() + 1);
517 let offset = offset.unwrap_or(0);
518
519 let col = Column::new_row_index(name, offset, self.height())?;
520 columns.push(col);
521 columns.extend_from_slice(&self.columns);
522 DataFrame::new(columns)
523 }
524
525 /// Add a row index column in place.
526 ///
527 /// # Safety
528 /// The caller should ensure the DataFrame does not already contain a column with the given name.
529 ///
530 /// # Panics
531 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
532 pub unsafe fn with_row_index_mut(
533 &mut self,
534 name: PlSmallStr,
535 offset: Option<IdxSize>,
536 ) -> &mut Self {
538 debug_assert!(
539 self.columns.iter().all(|c| c.name() != &name),
540 "with_row_index_mut(): column with name {} already exists",
541 &name
542 );
543
544 let offset = offset.unwrap_or(0);
545 let col = Column::new_row_index(name, offset, self.height()).unwrap();
546
547 self.clear_schema();
548 self.columns.insert(0, col);
549 self
550 }
551
552 /// Create a new `DataFrame` without checking the length or duplicate occurrence of the
553 /// `Series`.
554 ///
555 /// Calculates the height from the first column or `0` if no columns are given.
556 ///
557 /// # Safety
558 ///
559 /// It is the caller's responsibility to uphold the contract of all `Series`
560 /// having an equal length and a unique name; if not, this may panic down the line.
561 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
562 let height = columns.first().map_or(0, Column::len);
563 unsafe { Self::new_no_checks(height, columns) }
564 }
565
566 /// Create a new `DataFrame` without checking the length or duplicate occurrence of the
567 /// `Series`.
568 ///
569 /// It is advised to use [DataFrame::new] in favor of this method.
570 ///
571 /// # Safety
572 ///
573 /// It is the caller's responsibility to uphold the contract of all `Series`
574 /// having an equal length and a unique name; if not, this may panic down the line.
575 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
576 if cfg!(debug_assertions) {
577 DataFrame::validate_columns_slice(&columns).unwrap();
578 }
579
580 unsafe { Self::_new_no_checks_impl(height, columns) }
581 }
582
583 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
584 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
585 /// constructed with this method is generally highly unsafe and should not be long-lived.
586 #[allow(clippy::missing_safety_doc)]
587 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
588 DataFrame {
589 height,
590 columns,
591 cached_schema: OnceLock::new(),
592 }
593 }
594
595 /// Shrink the capacity of this DataFrame to fit its length.
596 pub fn shrink_to_fit(&mut self) {
597 // Don't parallelize this. Memory overhead
598 for s in &mut self.columns {
599 s.shrink_to_fit();
600 }
601 }
602
603 /// Aggregate all the chunks in the DataFrame to a single chunk.
604 pub fn as_single_chunk(&mut self) -> &mut Self {
605 // Don't parallelize this. Memory overhead
606 for s in &mut self.columns {
607 *s = s.rechunk();
608 }
609 self
610 }
611
612 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
613 /// This may lead to more peak memory consumption.
614 pub fn as_single_chunk_par(&mut self) -> &mut Self {
615 if self.columns.iter().any(|c| c.n_chunks() > 1) {
616 self.columns = self._apply_columns_par(&|s| s.rechunk());
617 }
618 self
619 }
620
621 /// Rechunks all columns to only have a single chunk.
622 pub fn rechunk_mut(&mut self) {
623 // SAFETY: We never adjust the length or names of the columns.
624 let columns = unsafe { self.get_columns_mut() };
625
626 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
627 *col = col.rechunk();
628 }
629 }
630
631 pub fn _deshare_views_mut(&mut self) {
632 // SAFETY: We never adjust the length or names of the columns.
633 unsafe {
634 let columns = self.get_columns_mut();
635 for col in columns {
636 let Column::Series(s) = col else { continue };
637
638 if let Ok(ca) = s.binary() {
639 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
640 *col = Column::from(gc_ca.into_series());
641 } else if let Ok(ca) = s.str() {
642 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
643 *col = Column::from(gc_ca.into_series());
644 }
645 }
646 }
647 }
648
649 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
650 pub fn rechunk_to_record_batch(
651 self,
652 compat_level: CompatLevel,
653 ) -> RecordBatchT<Box<dyn Array>> {
654 let height = self.height();
655
656 let (schema, arrays) = self
657 .columns
658 .into_iter()
659 .map(|col| {
660 let mut series = col.take_materialized_series();
661 // Rechunk to one chunk if necessary
662 if series.n_chunks() > 1 {
663 series = series.rechunk();
664 }
665 (
666 series.field().to_arrow(compat_level),
667 series.to_arrow(0, compat_level),
668 )
669 })
670 .collect();
671
672 RecordBatchT::new(height, Arc::new(schema), arrays)
673 }
674
675 /// Returns true if the chunks of the columns do not align and re-chunking should be done
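    ///
    /// A sketch of when this triggers (chunk boundaries differing between columns):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut a = df!("a" => [1, 2, 3])?;
    /// a.vstack_mut(&df!("a" => [4, 5])?)?; // column "a" now has chunks of 3 + 2 rows
    /// let df = a.hstack(df!("b" => [1, 2, 3, 4, 5])?.get_columns())?; // "b" is one chunk of 5
    /// assert!(df.should_rechunk());
    /// # Ok::<(), PolarsError>(())
    /// ```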
676 pub fn should_rechunk(&self) -> bool {
677 // Fast check. It is also needed for correctness, as code below doesn't check if the number
678 // of chunks is equal.
679 if !self
680 .get_columns()
681 .iter()
682 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
683 .all_equal()
684 {
685 return true;
686 }
687
688 // From here we check chunk lengths.
689 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
690 match chunk_lengths.next() {
691 None => false,
692 Some(first_column_chunk_lengths) => {
693 // Fast Path for single Chunk Series
694 if first_column_chunk_lengths.size_hint().0 == 1 {
695 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
696 }
697 // Always rechunk if we have more chunks than rows.
698 // except when we have an empty df containing a single chunk
699 let height = self.height();
700 let n_chunks = first_column_chunk_lengths.size_hint().0;
701 if n_chunks > height && !(height == 0 && n_chunks == 1) {
702 return true;
703 }
704 // Slow Path for multi Chunk series
705 let v: Vec<_> = first_column_chunk_lengths.collect();
706 for cl in chunk_lengths {
707 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
708 return true;
709 }
710 }
711 false
712 },
713 }
714 }
715
716 /// Ensure all the chunks in the [`DataFrame`] are aligned.
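    ///
    /// Continuing the sketch from [`DataFrame::should_rechunk`]: when the chunks are misaligned,
    /// this re-chunks every column so they line up again.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut a = df!("a" => [1, 2, 3])?;
    /// a.vstack_mut(&df!("a" => [4, 5])?)?;
    /// let mut df = a.hstack(df!("b" => [1, 2, 3, 4, 5])?.get_columns())?;
    /// df.align_chunks_par();
    /// assert_eq!(df.max_n_chunks(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```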
717 pub fn align_chunks_par(&mut self) -> &mut Self {
718 if self.should_rechunk() {
719 self.as_single_chunk_par()
720 } else {
721 self
722 }
723 }
724
725 pub fn align_chunks(&mut self) -> &mut Self {
726 if self.should_rechunk() {
727 self.as_single_chunk()
728 } else {
729 self
730 }
731 }
732
733 /// Get the [`DataFrame`] schema.
734 ///
735 /// # Example
736 ///
737 /// ```rust
738 /// # use polars_core::prelude::*;
739 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
740 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
741 ///
742 /// let f1: Field = Field::new("Thing".into(), DataType::String);
743 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
744 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
745 ///
746 /// assert_eq!(&**df.schema(), &sc);
747 /// # Ok::<(), PolarsError>(())
748 /// ```
749 pub fn schema(&self) -> &SchemaRef {
750 let out = self.cached_schema.get_or_init(|| {
751 Arc::new(
752 self.columns
753 .iter()
754 .map(|x| (x.name().clone(), x.dtype().clone()))
755 .collect(),
756 )
757 });
758
759 debug_assert_eq!(out.len(), self.width());
760
761 out
762 }
763
764 /// Get a reference to the [`DataFrame`] columns.
765 ///
766 /// # Example
767 ///
768 /// ```rust
769 /// # use polars_core::prelude::*;
770 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
771 /// "Symbol" => ["A", "C", "G", "T"])?;
772 /// let columns: &[Column] = df.get_columns();
773 ///
774 /// assert_eq!(columns[0].name(), "Name");
775 /// assert_eq!(columns[1].name(), "Symbol");
776 /// # Ok::<(), PolarsError>(())
777 /// ```
778 #[inline]
779 pub fn get_columns(&self) -> &[Column] {
780 &self.columns
781 }
782
783 #[inline]
784 /// Get mutable access to the underlying columns.
785 ///
786 /// # Safety
787 ///
788 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
789 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
790 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
791 /// calling [`DataFrame::clear_schema`].
792 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
793 &mut self.columns
794 }
795
796 #[inline]
797 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
798 pub fn clear_columns(&mut self) {
799 unsafe { self.get_columns_mut() }.clear();
800 self.clear_schema();
801 }
802
803 #[inline]
804 /// Extend the columns without checking for name collisions or height.
805 ///
806 /// # Safety
807 ///
808 /// The caller needs to ensure that:
809 /// - Column names are unique within the resulting [`DataFrame`].
810 /// - The length of each appended column matches the height of the [`DataFrame`]. For
811 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
812 /// with [`DataFrame::set_height`].
813 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
814 unsafe { self.get_columns_mut() }.extend(iter);
815 self.clear_schema();
816 }
817
818 /// Take ownership of the underlying columns vec.
819 pub fn take_columns(self) -> Vec<Column> {
820 self.columns
821 }
822
823 /// Iterator over the columns as [`Series`].
824 ///
825 /// # Example
826 ///
827 /// ```rust
828 /// # use polars_core::prelude::*;
829 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
830 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
831 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
832 ///
833 /// let mut iterator = df.iter();
834 ///
835 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
836 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
837 /// assert_eq!(iterator.next(), None);
838 /// # Ok::<(), PolarsError>(())
839 /// ```
840 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
841 self.materialized_column_iter()
842 }
843
844 /// # Example
845 ///
846 /// ```rust
847 /// # use polars_core::prelude::*;
848 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
849 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
850 ///
851 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
852 /// # Ok::<(), PolarsError>(())
853 /// ```
854 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
855 self.columns.iter().map(|s| s.name()).collect()
856 }
857
858 /// Get the [`Vec<PlSmallStr>`] representing the column names.
859 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
860 self.columns.iter().map(|s| s.name().clone()).collect()
861 }
862
863 pub fn get_column_names_str(&self) -> Vec<&str> {
864 self.columns.iter().map(|s| s.name().as_str()).collect()
865 }
866
867 /// Set the column names.
868 /// # Example
869 ///
870 /// ```rust
871 /// # use polars_core::prelude::*;
872 /// let mut df: DataFrame = df!("Mathematical set" => ["ā", "ā¤", "š»", "ā", "ā", "ā"])?;
873 /// df.set_column_names(["Set"])?;
874 ///
875 /// assert_eq!(df.get_column_names(), &["Set"]);
876 /// # Ok::<(), PolarsError>(())
877 /// ```
878 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
879 where
880 I: IntoIterator<Item = S>,
881 S: Into<PlSmallStr>,
882 {
883 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
884 self._set_column_names_impl(names.as_slice())
885 }
886
887 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
888 polars_ensure!(
889 names.len() == self.width(),
890 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
891 names.len(), self.width()
892 );
893 ensure_names_unique(names, |s| s.as_str())?;
894
895 let columns = mem::take(&mut self.columns);
896 self.columns = columns
897 .into_iter()
898 .zip(names)
899 .map(|(s, name)| {
900 let mut s = s;
901 s.rename(name.clone());
902 s
903 })
904 .collect();
905 self.clear_schema();
906 Ok(())
907 }
908
909 /// Get the data types of the columns in the [`DataFrame`].
910 ///
911 /// # Example
912 ///
913 /// ```rust
914 /// # use polars_core::prelude::*;
915 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
916 /// "Fraction" => [0.965, 0.035])?;
917 ///
918 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
919 /// # Ok::<(), PolarsError>(())
920 /// ```
921 pub fn dtypes(&self) -> Vec<DataType> {
922 self.columns.iter().map(|s| s.dtype().clone()).collect()
923 }
924
925 pub(crate) fn first_series_column(&self) -> Option<&Series> {
926 self.columns.iter().find_map(|col| col.as_series())
927 }
928
929 /// The number of chunks for the first column.
930 pub fn first_col_n_chunks(&self) -> usize {
931 match self.first_series_column() {
932 None if self.columns.is_empty() => 0,
933 None => 1,
934 Some(s) => s.n_chunks(),
935 }
936 }
937
938 /// The highest number of chunks for any column.
939 pub fn max_n_chunks(&self) -> usize {
940 self.columns
941 .iter()
942 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
943 .max()
944 .unwrap_or(0)
945 }
946
947 /// Get a reference to the schema fields of the [`DataFrame`].
948 ///
949 /// # Example
950 ///
951 /// ```rust
952 /// # use polars_core::prelude::*;
953 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
954 /// "Fraction" => [0.708, 0.292])?;
955 ///
956 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
957 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
958 ///
959 /// assert_eq!(earth.fields(), &[f1, f2]);
960 /// # Ok::<(), PolarsError>(())
961 /// ```
962 pub fn fields(&self) -> Vec<Field> {
963 self.columns
964 .iter()
965 .map(|s| s.field().into_owned())
966 .collect()
967 }
968
969 /// Get (height, width) of the [`DataFrame`].
970 ///
971 /// # Example
972 ///
973 /// ```rust
974 /// # use polars_core::prelude::*;
975 /// let df0: DataFrame = DataFrame::default();
976 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
977 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
978 /// "2" => [1, 2, 3, 4, 5])?;
979 ///
980 /// assert_eq!(df0.shape(), (0 ,0));
981 /// assert_eq!(df1.shape(), (5, 1));
982 /// assert_eq!(df2.shape(), (5, 2));
983 /// # Ok::<(), PolarsError>(())
984 /// ```
985 pub fn shape(&self) -> (usize, usize) {
986 (self.height, self.columns.len())
987 }
988
989 /// Get the width of the [`DataFrame`] which is the number of columns.
990 ///
991 /// # Example
992 ///
993 /// ```rust
994 /// # use polars_core::prelude::*;
995 /// let df0: DataFrame = DataFrame::default();
996 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
997 /// let df2: DataFrame = df!("Series 1" => [0; 0],
998 /// "Series 2" => [0; 0])?;
999 ///
1000 /// assert_eq!(df0.width(), 0);
1001 /// assert_eq!(df1.width(), 1);
1002 /// assert_eq!(df2.width(), 2);
1003 /// # Ok::<(), PolarsError>(())
1004 /// ```
1005 pub fn width(&self) -> usize {
1006 self.columns.len()
1007 }
1008
1009 /// Get the height of the [`DataFrame`] which is the number of rows.
1010 ///
1011 /// # Example
1012 ///
1013 /// ```rust
1014 /// # use polars_core::prelude::*;
1015 /// let df0: DataFrame = DataFrame::default();
1016 /// let df1: DataFrame = df!("Currency" => ["ā¬", "$"])?;
1017 /// let df2: DataFrame = df!("Currency" => ["ā¬", "$", "Ā„", "Ā£", "āæ"])?;
1018 ///
1019 /// assert_eq!(df0.height(), 0);
1020 /// assert_eq!(df1.height(), 2);
1021 /// assert_eq!(df2.height(), 5);
1022 /// # Ok::<(), PolarsError>(())
1023 /// ```
1024 pub fn height(&self) -> usize {
1025 self.height
1026 }
1027
1028 /// Returns the size as number of rows * number of columns
1029 pub fn size(&self) -> usize {
1030 let s = self.shape();
1031 s.0 * s.1
1032 }
1033
1034 /// Returns `true` if the [`DataFrame`] contains no rows.
1035 ///
1036 /// # Example
1037 ///
1038 /// ```rust
1039 /// # use polars_core::prelude::*;
1040 /// let df1: DataFrame = DataFrame::default();
1041 /// assert!(df1.is_empty());
1042 ///
1043 /// let df2: DataFrame = df!("First name" => ["Forever"],
1044 /// "Last name" => ["Alone"])?;
1045 /// assert!(!df2.is_empty());
1046 /// # Ok::<(), PolarsError>(())
1047 /// ```
1048 pub fn is_empty(&self) -> bool {
1049 matches!(self.shape(), (0, _) | (_, 0))
1050 }
1051
1052 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1053 ///
1054 /// # Safety
1055 ///
1056 /// This needs to be equal to the length of all the columns.
1057 pub unsafe fn set_height(&mut self, height: usize) {
1058 self.height = height;
1059 }
1060
1061 /// Add multiple columns to a [`DataFrame`].
1062 /// The added columns must have the same height as the `DataFrame`.
1063 ///
1064 /// # Example
1065 ///
1066 /// ```rust
1067 /// # use polars_core::prelude::*;
1068 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1069 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1070 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1071 ///
1072 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1073 /// assert_eq!(df2.shape(), (3, 3));
1074 /// println!("{}", df2);
1075 /// # Ok::<(), PolarsError>(())
1076 /// ```
1077 ///
1078 /// Output:
1079 ///
1080 /// ```text
1081 /// shape: (3, 3)
1082 /// +---------+--------+----------+
1083 /// | Element | Proton | Electron |
1084 /// | --- | --- | --- |
1085 /// | str | i32 | i32 |
1086 /// +=========+========+==========+
1087 /// | Copper | 29 | 29 |
1088 /// +---------+--------+----------+
1089 /// | Silver | 47 | 47 |
1090 /// +---------+--------+----------+
1091 /// | Gold | 79 | 79 |
1092 /// +---------+--------+----------+
1093 /// ```
1094 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1095 let mut new_cols = self.columns.clone();
1096 new_cols.extend_from_slice(columns);
1097 DataFrame::new(new_cols)
1098 }
1099
1100 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1101 ///
1102 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1103 ///
1104 /// # Example
1105 ///
1106 /// ```rust
1107 /// # use polars_core::prelude::*;
1108 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1109 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1110 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1111 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1112 ///
1113 /// let df3: DataFrame = df1.vstack(&df2)?;
1114 ///
1115 /// assert_eq!(df3.shape(), (5, 2));
1116 /// println!("{}", df3);
1117 /// # Ok::<(), PolarsError>(())
1118 /// ```
1119 ///
1120 /// Output:
1121 ///
1122 /// ```text
1123 /// shape: (5, 2)
1124 /// +-----------+-------------------+
1125 /// | Element | Melting Point (K) |
1126 /// | --- | --- |
1127 /// | str | f64 |
1128 /// +===========+===================+
1129 /// | Copper | 1357.77 |
1130 /// +-----------+-------------------+
1131 /// | Silver | 1234.93 |
1132 /// +-----------+-------------------+
1133 /// | Gold | 1337.33 |
1134 /// +-----------+-------------------+
1135 /// | Platinum | 2041.4 |
1136 /// +-----------+-------------------+
1137 /// | Palladium | 1828.05 |
1138 /// +-----------+-------------------+
1139 /// ```
1140 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1141 let mut df = self.clone();
1142 df.vstack_mut(other)?;
1143 Ok(df)
1144 }
1145
1146 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1147 ///
1148 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1149 ///
1150 /// # Example
1151 ///
1152 /// ```rust
1153 /// # use polars_core::prelude::*;
1154 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1155 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1156 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1157 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1158 ///
1159 /// df1.vstack_mut(&df2)?;
1160 ///
1161 /// assert_eq!(df1.shape(), (5, 2));
1162 /// println!("{}", df1);
1163 /// # Ok::<(), PolarsError>(())
1164 /// ```
1165 ///
1166 /// Output:
1167 ///
1168 /// ```text
1169 /// shape: (5, 2)
1170 /// +-----------+-------------------+
1171 /// | Element | Melting Point (K) |
1172 /// | --- | --- |
1173 /// | str | f64 |
1174 /// +===========+===================+
1175 /// | Copper | 1357.77 |
1176 /// +-----------+-------------------+
1177 /// | Silver | 1234.93 |
1178 /// +-----------+-------------------+
1179 /// | Gold | 1337.33 |
1180 /// +-----------+-------------------+
1181 /// | Platinum | 2041.4 |
1182 /// +-----------+-------------------+
1183 /// | Palladium | 1828.05 |
1184 /// +-----------+-------------------+
1185 /// ```
1186 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1187 if self.width() != other.width() {
1188 polars_ensure!(
1189 self.width() == 0,
1190 ShapeMismatch:
1191 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1192 self.width(), other.width(),
1193 );
1194 self.columns.clone_from(&other.columns);
1195 self.height = other.height;
1196 return Ok(self);
1197 }
1198
1199 self.columns
1200 .iter_mut()
1201 .zip(other.columns.iter())
1202 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1203 ensure_can_extend(&*left, right)?;
1204 left.append(right).map_err(|e| {
1205 e.context(format!("failed to vstack column '{}'", right.name()).into())
1206 })?;
1207 Ok(())
1208 })?;
1209 self.height += other.height;
1210 Ok(self)
1211 }
1212
1213 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1214 if self.width() != other.width() {
1215 polars_ensure!(
1216 self.width() == 0,
1217 ShapeMismatch:
1218 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1219 self.width(), other.width(),
1220 );
1221 self.columns = other.columns;
1222 self.height = other.height;
1223 return Ok(self);
1224 }
1225
1226 self.columns
1227 .iter_mut()
1228 .zip(other.columns.into_iter())
1229 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1230 ensure_can_extend(&*left, &right)?;
1231 let right_name = right.name().clone();
1232 left.append_owned(right).map_err(|e| {
1233 e.context(format!("failed to vstack column '{right_name}'").into())
1234 })?;
1235 Ok(())
1236 })?;
1237 self.height += other.height;
1238 Ok(self)
1239 }
1240
1241 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1242 ///
1243 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1244 ///
1245 /// # Panics
1246 /// Panics if the schemas don't match.
1247 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1248 self.columns
1249 .iter_mut()
1250 .zip(other.columns.iter())
1251 .for_each(|(left, right)| {
1252 left.append(right)
1253 .map_err(|e| {
1254 e.context(format!("failed to vstack column '{}'", right.name()).into())
1255 })
1256 .expect("should not fail");
1257 });
1258 self.height += other.height;
1259 }
1260
1261 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1262 ///
1263 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1264 ///
1265 /// # Panics
1266 /// Panics if the schemas don't match.
1267 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1268 self.columns
1269 .iter_mut()
1270 .zip(other.columns)
1271 .for_each(|(left, right)| {
1272 left.append_owned(right).expect("should not fail");
1273 });
1274 self.height += other.height;
1275 }
1276
1277 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1278 ///
1279 /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1280 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1281 ///
1282 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1283 /// and thus will yield faster queries.
1284 ///
1285 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1286 /// online operations where you add `n` rows and rerun a query.
1287 ///
1288 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1289 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1290 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
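    ///
    /// # Example
    ///
    /// A minimal sketch of the difference; the remark on chunking is illustrative, not a
    /// guarantee:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2])?;
    /// let other = df!("x" => [3, 4])?;
    /// df.extend(&other)?;
    /// assert_eq!(df.height(), 4);
    /// // With `vstack` the column would have gained an extra chunk instead of growing in place.
    /// # Ok::<(), PolarsError>(())
    /// ```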
1291 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1292 polars_ensure!(
1293 self.width() == other.width(),
1294 ShapeMismatch:
1295 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1296 self.width(), other.width(),
1297 );
1298
1299 self.columns
1300 .iter_mut()
1301 .zip(other.columns.iter())
1302 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1303 ensure_can_extend(&*left, right)?;
1304 left.extend(right).map_err(|e| {
1305 e.context(format!("failed to extend column '{}'", right.name()).into())
1306 })?;
1307 Ok(())
1308 })?;
1309 self.height += other.height;
1310 self.clear_schema();
1311 Ok(())
1312 }
1313
1314 /// Remove a column by name and return the column removed.
1315 ///
1316 /// # Example
1317 ///
1318 /// ```rust
1319 /// # use polars_core::prelude::*;
1320 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1321 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1322 ///
1323 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1324 /// assert!(s1.is_err());
1325 ///
1326 /// let s2: Column = df.drop_in_place("Animal")?;
1327 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1328 /// # Ok::<(), PolarsError>(())
1329 /// ```
1330 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1331 let idx = self.check_name_to_idx(name)?;
1332 self.clear_schema();
1333 Ok(self.columns.remove(idx))
1334 }
1335
1336 /// Return a new [`DataFrame`] where all null values are dropped.
1337 ///
1338 /// # Example
1339 ///
1340 /// ```no_run
1341 /// # use polars_core::prelude::*;
1342 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1343 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1344 /// assert_eq!(df1.shape(), (3, 2));
1345 ///
1346 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1347 /// assert_eq!(df2.shape(), (1, 2));
1348 /// println!("{}", df2);
1349 /// # Ok::<(), PolarsError>(())
1350 /// ```
1351 ///
1352 /// Output:
1353 ///
1354 /// ```text
1355 /// shape: (1, 2)
1356 /// +---------+---------------------+
1357 /// | Country | Tax revenue (% GDP) |
1358 /// | --- | --- |
1359 /// | str | f64 |
1360 /// +=========+=====================+
1361 /// | Malta | 32.7 |
1362 /// +---------+---------------------+
1363 /// ```
1364 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1365 where
1366 for<'a> &'a S: Into<PlSmallStr>,
1367 {
1368 if let Some(v) = subset {
1369 let v = self.select_columns(v)?;
1370 self._drop_nulls_impl(v.as_slice())
1371 } else {
1372 self._drop_nulls_impl(self.columns.as_slice())
1373 }
1374 }
1375
1376 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1377 // fast path for no nulls in df
1378 if subset.iter().all(|s| !s.has_nulls()) {
1379 return Ok(self.clone());
1380 }
1381
1382 let mut iter = subset.iter();
1383
1384 let mask = iter
1385 .next()
1386 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1387 let mut mask = mask.is_not_null();
1388
1389 for c in iter {
1390 mask = mask & c.is_not_null();
1391 }
1392 self.filter(&mask)
1393 }
1394
1395 /// Drop a column by name.
1396 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1397 /// the current one in place.
1398 ///
1399 /// # Example
1400 ///
1401 /// ```rust
1402 /// # use polars_core::prelude::*;
1403 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1404 /// let df2: DataFrame = df1.drop("Ray type")?;
1405 ///
1406 /// assert!(df2.is_empty());
1407 /// # Ok::<(), PolarsError>(())
1408 /// ```
1409 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1410 let idx = self.check_name_to_idx(name)?;
1411 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1412
1413 self.columns.iter().enumerate().for_each(|(i, s)| {
1414 if i != idx {
1415 new_cols.push(s.clone())
1416 }
1417 });
1418
1419 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1420 }
1421
1422 /// Drop columns that are in `names`.
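    ///
    /// # Example
    ///
    /// A small sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
    /// let df2 = df.drop_many(["a", "c"]);
    /// assert_eq!(df2.get_column_names(), &["b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```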
1423 pub fn drop_many<I, S>(&self, names: I) -> Self
1424 where
1425 I: IntoIterator<Item = S>,
1426 S: Into<PlSmallStr>,
1427 {
1428 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1429 self.drop_many_amortized(&names)
1430 }
1431
1432 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1433 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1434 if names.is_empty() {
1435 return self.clone();
1436 }
1437 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1438 self.columns.iter().for_each(|s| {
1439 if !names.contains(s.name()) {
1440 new_cols.push(s.clone())
1441 }
1442 });
1443
1444 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1445 }
1446
1447 /// Insert a new column at a given index without checking for duplicates.
1448 /// This can leave the [`DataFrame`] in an invalid state.
1449 fn insert_column_no_name_check(
1450 &mut self,
1451 index: usize,
1452 column: Column,
1453 ) -> PolarsResult<&mut Self> {
1454 polars_ensure!(
1455 self.width() == 0 || column.len() == self.height(),
1456 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1457 column.len(), self.height(),
1458 );
1459
1460 if self.width() == 0 {
1461 self.height = column.len();
1462 }
1463
1464 self.columns.insert(index, column);
1465 self.clear_schema();
1466 Ok(self)
1467 }
1468
1469 /// Insert a new column at a given index.
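    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("b" => [2, 4, 6])?;
    /// df.insert_column(0, Column::new("a".into(), [1, 2, 3]))?;
    /// assert_eq!(df.get_column_names(), &["a", "b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```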
1470 pub fn insert_column<S: IntoColumn>(
1471 &mut self,
1472 index: usize,
1473 column: S,
1474 ) -> PolarsResult<&mut Self> {
1475 let column = column.into_column();
1476 self.check_already_present(column.name().as_str())?;
1477 self.insert_column_no_name_check(index, column)
1478 }
1479
1480 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1481 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1482 self.replace_column(idx, column)?;
1483 } else {
1484 if self.width() == 0 {
1485 self.height = column.len();
1486 }
1487
1488 self.columns.push(column);
1489 self.clear_schema();
1490 }
1491 Ok(())
1492 }
1493
1494 /// Add a new column to this [`DataFrame`] or replace an existing one.
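    ///
    /// # Example
    ///
    /// A minimal sketch; note that a length-1 column is broadcast to the height of the frame:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2, 3])?;
    /// df.with_column(Column::new("b".into(), [4, 5, 6]))?; // add a new column
    /// df.with_column(Column::new("a".into(), [0]))?;       // replace "a" with a broadcast literal
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```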
1495 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1496 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1497 let height = df.height();
1498 if column.len() == 1 && height > 1 {
1499 column = column.new_from_index(0, height);
1500 }
1501
1502 if column.len() == height || df.get_columns().is_empty() {
1503 df.add_column_by_search(column)?;
1504 Ok(df)
1505 }
1506 // special case for literals
1507 else if height == 0 && column.len() == 1 {
1508 let s = column.clear();
1509 df.add_column_by_search(s)?;
1510 Ok(df)
1511 } else {
1512 polars_bail!(
1513 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1514 column.len(), height,
1515 );
1516 }
1517 }
1518 let column = column.into_column();
1519 inner(self, column)
1520 }
1521
1522 /// Adds a column to the [`DataFrame`] without doing any checks
1523 /// on length or duplicates.
1524 ///
1525 /// # Safety
1526 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1527 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1528 debug_assert!(self.width() == 0 || self.height() == column.len());
1529 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1530
1531 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1532 // properly for `width` == 0.
1533 if self.width() == 0 {
1534 unsafe { self.set_height(column.len()) };
1535 }
1536 unsafe { self.get_columns_mut() }.push(column);
1537 self.clear_schema();
1538
1539 self
1540 }
1541
1542 // Note: Schema can be either the input or the output schema.
1543 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1544 let name = c.name();
1545 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1546 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1547 // Given schema is output_schema and we can push.
1548 if idx == self.columns.len() {
1549 if self.width() == 0 {
1550 self.height = c.len();
1551 }
1552
1553 self.columns.push(c);
1554 self.clear_schema();
1555 }
1556 // Schema is incorrect; fall back to search.
1557 else {
1558 debug_assert!(false);
1559 self.add_column_by_search(c)?;
1560 }
1561 } else {
1562 self.replace_column(idx, c)?;
1563 }
1564 } else {
1565 if self.width() == 0 {
1566 self.height = c.len();
1567 }
1568
1569 self.columns.push(c);
1570 self.clear_schema();
1571 }
1572
1573 Ok(())
1574 }
1575
1576 // Note: Schema can be either the input or the output schema.
1577 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1578 for (i, s) in series.into_iter().enumerate() {
1579 // we need to branch here
1580 // because users can add multiple columns with the same name
1581 if i == 0 || schema.get(s.name().as_str()).is_some() {
1582 self.with_column_and_schema(s.into_column(), schema)?;
1583 } else {
1584 self.with_column(s.clone().into_column())?;
1585 }
1586 }
1587 Ok(())
1588 }
1589
1590 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1591 for (i, s) in columns.into_iter().enumerate() {
1592 // we need to branch here
1593 // because users can add multiple columns with the same name
1594 if i == 0 || schema.get(s.name().as_str()).is_some() {
1595 self.with_column_and_schema(s, schema)?;
1596 } else {
1597 self.with_column(s.clone())?;
1598 }
1599 }
1600
1601 Ok(())
1602 }
1603
1604 /// Add a new column to this [`DataFrame`] or replace an existing one.
1605 /// Uses an existing schema to amortize lookups.
1606 /// If the schema is incorrect, we will fall back to a linear search.
1607 ///
1608 /// Note: Schema can be both input or output_schema
1609 pub fn with_column_and_schema<C: IntoColumn>(
1610 &mut self,
1611 column: C,
1612 schema: &Schema,
1613 ) -> PolarsResult<&mut Self> {
1614 let mut column = column.into_column();
1615
1616 let height = self.height();
1617 if column.len() == 1 && height > 1 {
1618 column = column.new_from_index(0, height);
1619 }
1620
1621 if column.len() == height || self.columns.is_empty() {
1622 self.add_column_by_schema(column, schema)?;
1623 Ok(self)
1624 }
1625 // special case for literals
1626 else if height == 0 && column.len() == 1 {
1627 let s = column.clear();
1628 self.add_column_by_schema(s, schema)?;
1629 Ok(self)
1630 } else {
1631 polars_bail!(
1632 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1633 column.len(), height,
1634 );
1635 }
1636 }
1637
1638 /// Get a row in the [`DataFrame`]. Beware this is slow.
1639 ///
1640 /// # Example
1641 ///
1642 /// ```
1643 /// # use polars_core::prelude::*;
1644 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1645 /// df.get(idx)
1646 /// }
1647 /// ```
1648 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1649 match self.columns.first() {
1650 Some(s) => {
1651 if s.len() <= idx {
1652 return None;
1653 }
1654 },
1655 None => return None,
1656 }
1657 // SAFETY: we just checked bounds
1658 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1659 }
1660
1661 /// Select a [`Series`] by index.
1662 ///
1663 /// # Example
1664 ///
1665 /// ```rust
1666 /// # use polars_core::prelude::*;
1667 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1668 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1669 ///
1670 /// let s1: Option<&Column> = df.select_at_idx(0);
1671 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1672 ///
1673 /// assert_eq!(s1, Some(&s2));
1674 /// # Ok::<(), PolarsError>(())
1675 /// ```
1676 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1677 self.columns.get(idx)
1678 }
1679
1680 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1681 ///
1682 /// # Examples
1683 ///
1684 /// ```rust
1685 /// # use polars_core::prelude::*;
1686 /// let df = df! {
1687 /// "0" => [0, 0, 0],
1688 /// "1" => [1, 1, 1],
1689 /// "2" => [2, 2, 2]
1690 /// }?;
1691 ///
1692 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1693 /// assert!(df.equals(&df.select_by_range(..)?));
1694 /// # Ok::<(), PolarsError>(())
1695 /// ```
1696 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1697 where
1698 R: ops::RangeBounds<usize>,
1699 {
1700 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1701 // because it is a nightly-only feature. We should switch to it once it is stabilized.
1702 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1703 where
1704 R: ops::RangeBounds<usize>,
1705 {
1706 let len = bounds.end;
1707
1708 let start: ops::Bound<&usize> = range.start_bound();
1709 let start = match start {
1710 ops::Bound::Included(&start) => start,
1711 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1712 panic!("attempted to index slice from after maximum usize");
1713 }),
1714 ops::Bound::Unbounded => 0,
1715 };
1716
1717 let end: ops::Bound<&usize> = range.end_bound();
1718 let end = match end {
1719 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1720 panic!("attempted to index slice up to maximum usize");
1721 }),
1722 ops::Bound::Excluded(&end) => end,
1723 ops::Bound::Unbounded => len,
1724 };
1725
1726 if start > end {
1727 panic!("slice index starts at {start} but ends at {end}");
1728 }
1729 if end > len {
1730 panic!("range end index {end} out of range for slice of length {len}",);
1731 }
1732
1733 ops::Range { start, end }
1734 }
1735
1736 let colnames = self.get_column_names_owned();
1737 let range = get_range(range, ..colnames.len());
1738
1739 self._select_impl(&colnames[range])
1740 }
1741
1742 /// Get column index of a [`Series`] by name.
1743 /// # Example
1744 ///
1745 /// ```rust
1746 /// # use polars_core::prelude::*;
1747 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1748 /// "Health" => [100, 200, 500],
1749 /// "Mana" => [250, 100, 0],
1750 /// "Strength" => [30, 150, 300])?;
1751 ///
1752 /// assert_eq!(df.get_column_index("Name"), Some(0));
1753 /// assert_eq!(df.get_column_index("Health"), Some(1));
1754 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1755 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1756 /// assert_eq!(df.get_column_index("Haste"), None);
1757 /// # Ok::<(), PolarsError>(())
1758 /// ```
1759 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1760 let schema = self.schema();
1761 if let Some(idx) = schema.index_of(name) {
1762 if self
1763 .get_columns()
1764 .get(idx)
1765 .is_some_and(|c| c.name() == name)
1766 {
1767 return Some(idx);
1768 }
1769 }
1770
1771 self.columns.iter().position(|s| s.name().as_str() == name)
1772 }
1773
1774 /// Get column index of a [`Series`] by name.
1775 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1776 self.get_column_index(name)
1777 .ok_or_else(|| polars_err!(col_not_found = name))
1778 }
1779
1780 /// Select a single column by name.
1781 ///
1782 /// # Example
1783 ///
1784 /// ```rust
1785 /// # use polars_core::prelude::*;
1786 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1787 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1788 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1789 ///
1790 /// assert_eq!(df.column("Password")?, &s1);
1791 /// # Ok::<(), PolarsError>(())
1792 /// ```
1793 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1794 let idx = self.try_get_column_index(name)?;
1795 Ok(self.select_at_idx(idx).unwrap())
1796 }
1797
1798 /// Select multiple columns by name.
1799 ///
1800 /// # Example
1801 ///
1802 /// ```rust
1803 /// # use polars_core::prelude::*;
1804 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1805 /// "Max weight (kg)" => [16.0, 35.89])?;
1806 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1807 ///
1808 /// assert_eq!(&df[0], sv[0]);
1809 /// assert_eq!(&df[1], sv[1]);
1810 /// # Ok::<(), PolarsError>(())
1811 /// ```
1812 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1813 where
1814 I: IntoIterator<Item = S>,
1815 S: AsRef<str>,
1816 {
1817 names
1818 .into_iter()
1819 .map(|name| self.column(name.as_ref()))
1820 .collect()
1821 }
1822
1823 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1824 ///
1825 /// # Examples
1826 ///
1827 /// ```
1828 /// # use polars_core::prelude::*;
1829 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1830 /// df.select(["foo", "bar"])
1831 /// }
1832 /// ```
1833 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1834 where
1835 I: IntoIterator<Item = S>,
1836 S: Into<PlSmallStr>,
1837 {
1838 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1839 self._select_impl(cols.as_slice())
1840 }
1841
1842 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1843 ensure_names_unique(cols, |s| s.as_str())?;
1844 self._select_impl_unchecked(cols)
1845 }
1846
1847 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1848 let selected = self.select_columns_impl(cols)?;
1849 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1850 }
1851
1852 /// Select with a known schema. The schema names must match the column names of this DataFrame.
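///
/// # Example
///
/// A minimal compile-only sketch; `schema` is assumed to describe the same columns as
/// `df`, and `"foo"`/`"bar"` are placeholder column names:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame, schema: &SchemaRef) -> PolarsResult<DataFrame> {
///     df.select_with_schema(["foo", "bar"], schema)
/// }
/// ```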
1853 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1854 where
1855 I: IntoIterator<Item = S>,
1856 S: Into<PlSmallStr>,
1857 {
1858 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1859 self._select_with_schema_impl(&cols, schema, true)
1860 }
1861
1862 /// Select with a known schema without checking for duplicates in `selection`.
1863 /// The schema names must match the column names of this DataFrame.
1864 pub fn select_with_schema_unchecked<I, S>(
1865 &self,
1866 selection: I,
1867 schema: &Schema,
1868 ) -> PolarsResult<Self>
1869 where
1870 I: IntoIterator<Item = S>,
1871 S: Into<PlSmallStr>,
1872 {
1873 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1874 self._select_with_schema_impl(&cols, schema, false)
1875 }
1876
1877 /// The schema names must match the column names of this DataFrame.
1878 pub fn _select_with_schema_impl(
1879 &self,
1880 cols: &[PlSmallStr],
1881 schema: &Schema,
1882 check_duplicates: bool,
1883 ) -> PolarsResult<Self> {
1884 if check_duplicates {
1885 ensure_names_unique(cols, |s| s.as_str())?;
1886 }
1887
1888 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1889 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1890 }
1891
1892 /// A non generic implementation to reduce compiler bloat.
1893 fn select_columns_impl_with_schema(
1894 &self,
1895 cols: &[PlSmallStr],
1896 schema: &Schema,
1897 ) -> PolarsResult<Vec<Column>> {
1898 if cfg!(debug_assertions) {
1899 ensure_matching_schema_names(schema, self.schema())?;
1900 }
1901
1902 cols.iter()
1903 .map(|name| {
1904 let index = schema.try_get_full(name.as_str())?.0;
1905 Ok(self.columns[index].clone())
1906 })
1907 .collect()
1908 }
1909
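/// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`] with the
/// selected columns converted to their physical representation.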
1910 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1911 where
1912 I: IntoIterator<Item = S>,
1913 S: Into<PlSmallStr>,
1914 {
1915 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1916 self.select_physical_impl(&cols)
1917 }
1918
1919 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1920 ensure_names_unique(cols, |s| s.as_str())?;
1921 let selected = self.select_columns_physical_impl(cols)?;
1922 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1923 }
1924
1925 /// # Safety
1926 /// Dtypes must match, as the provided schema becomes the cached schema of the result.
1927 pub unsafe fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1928 let mut df = unsafe { self.project_names(to.iter_names())? };
1929 df.cached_schema = to.into();
1930 Ok(df)
1931 }
1932
1933 /// # Safety
1934 /// This does not check for duplicate names.
1935 pub unsafe fn project_names(
1936 &self,
1937 names: impl IntoIterator<Item = impl AsRef<str>>,
1938 ) -> PolarsResult<Self> {
1939 let from = self.schema();
1940 let columns = names
1941 .into_iter()
1942 .map(|name| Ok(self.columns[from.try_index_of(name.as_ref())?].clone()))
1943 .collect::<PolarsResult<_>>()?;
1944 let df = unsafe { Self::new_no_checks(self.height(), columns) };
1945 Ok(df)
1946 }
1947
1948 /// Select column(s) from this [`DataFrame`] and return them as a [`Vec`].
1949 ///
1950 /// # Example
1951 ///
1952 /// ```rust
1953 /// # use polars_core::prelude::*;
1954 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1955 /// "Carbon" => [1, 2, 3],
1956 /// "Hydrogen" => [4, 6, 8])?;
1957 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1958 ///
1959 /// assert_eq!(df["Carbon"], sv[0]);
1960 /// assert_eq!(df["Hydrogen"], sv[1]);
1961 /// # Ok::<(), PolarsError>(())
1962 /// ```
1963 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1964 let cols = selection.into_vec();
1965 self.select_columns_impl(&cols)
1966 }
1967
1968 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1969 self.columns
1970 .iter()
1971 .enumerate()
1972 .map(|(i, s)| (s.name().as_str(), i))
1973 .collect()
1974 }
1975
1976 /// A non generic implementation to reduce compiler bloat.
1977 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1978 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1979 let name_to_idx = self._names_to_idx_map();
1980 cols.iter()
1981 .map(|name| {
1982 let idx = *name_to_idx
1983 .get(name.as_str())
1984 .ok_or_else(|| polars_err!(col_not_found = name))?;
1985 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1986 })
1987 .collect::<PolarsResult<Vec<_>>>()?
1988 } else {
1989 cols.iter()
1990 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1991 .collect::<PolarsResult<Vec<_>>>()?
1992 };
1993
1994 Ok(selected)
1995 }
1996
1997 /// A non generic implementation to reduce compiler bloat.
1998 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1999 let selected = if cols.len() > 1 && self.columns.len() > 10 {
2000 // We hash because there are users that have millions of columns.
2001 // See: https://github.com/pola-rs/polars/issues/1023
2002 let name_to_idx = self._names_to_idx_map();
2003
2004 cols.iter()
2005 .map(|name| {
2006 let idx = *name_to_idx
2007 .get(name.as_str())
2008 .ok_or_else(|| polars_err!(col_not_found = name))?;
2009 Ok(self.select_at_idx(idx).unwrap().clone())
2010 })
2011 .collect::<PolarsResult<Vec<_>>>()?
2012 } else {
2013 cols.iter()
2014 .map(|c| self.column(c.as_str()).cloned())
2015 .collect::<PolarsResult<Vec<_>>>()?
2016 };
2017
2018 Ok(selected)
2019 }
2020
2021 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
2022 // If there is a filtered column, its length is the new height.
2023 if let Some(fst) = filtered.first() {
2024 return fst.len();
2025 }
2026
2027 // Otherwise, count the number of values that pass the mask and use that as the height.
2028 let num_trues = mask.num_trues();
2029 if mask.len() == self.height() {
2030 num_trues
2031 } else {
2032 // This is for broadcasting masks
2033 debug_assert!(num_trues == 0 || num_trues == 1);
2034 self.height() * num_trues
2035 }
2036 }
2037
2038 /// Take the [`DataFrame`] rows by a boolean mask.
2039 ///
2040 /// # Example
2041 ///
2042 /// ```
2043 /// # use polars_core::prelude::*;
2044 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2045 /// let mask = df.column("sepal_width")?.is_not_null();
2046 /// df.filter(&mask)
2047 /// }
2048 /// ```
2049 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2050 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2051 let height = self.filter_height(&new_col, mask);
2052
2053 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2054 }
2055
2056 /// Same as `filter` but does not parallelize.
2057 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2058 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2059 let height = self.filter_height(&new_col, mask);
2060
2061 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2062 }
2063
2064 /// Take [`DataFrame`] rows by index values.
2065 ///
2066 /// # Example
2067 ///
2068 /// ```
2069 /// # use polars_core::prelude::*;
2070 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2071 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2072 /// df.take(&idx)
2073 /// }
2074 /// ```
2075 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2076 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2077
2078 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2079 }
2080
2081 /// # Safety
2082 /// The indices must be in-bounds.
2083 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2084 self.take_unchecked_impl(idx, true)
2085 }
2086
2087 /// # Safety
2088 /// The indices must be in-bounds.
2089 pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
2090 match group {
2091 GroupsIndicator::Idx((_, indices)) => unsafe {
2092 self.take_slice_unchecked_impl(indices.as_slice(), false)
2093 },
2094 GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
2095 }
2096 }
2097
2098 /// # Safety
2099 /// The indices must be in-bounds.
2100 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2101 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2102 POOL.install(|| {
2103 if POOL.current_num_threads() > self.width() {
2104 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2105 if self.len() / stride >= 2 {
2106 self._apply_columns_par(&|c| {
2107 // Nested types initiate a rechunk in their take_unchecked implementation.
2108 // If we do not rechunk, it will result in rechunk storms downstream.
2109 let c = if c.dtype().is_nested() {
2110 &c.rechunk()
2111 } else {
2112 c
2113 };
2114
2115 (0..idx.len().div_ceil(stride))
2116 .into_par_iter()
2117 .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
2118 .reduce(
2119 || Column::new_empty(c.name().clone(), c.dtype()),
2120 |mut a, b| {
2121 a.append_owned(b).unwrap();
2122 a
2123 },
2124 )
2125 })
2126 } else {
2127 self._apply_columns_par(&|c| c.take_unchecked(idx))
2128 }
2129 } else {
2130 self._apply_columns_par(&|c| c.take_unchecked(idx))
2131 }
2132 })
2133 } else {
2134 self._apply_columns(&|s| s.take_unchecked(idx))
2135 };
2136 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2137 }
2138
2139 /// # Safety
2140 /// The indices must be in-bounds.
2141 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2142 self.take_slice_unchecked_impl(idx, true)
2143 }
2144
2145 /// # Safety
2146 /// The indices must be in-bounds.
2147 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2148 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2149 POOL.install(|| {
2150 if POOL.current_num_threads() > self.width() {
2151 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2152 if self.len() / stride >= 2 {
2153 self._apply_columns_par(&|c| {
2154 // Nested types initiate a rechunk in their take_unchecked implementation.
2155 // If we do not rechunk, it will result in rechunk storms downstream.
2156 let c = if c.dtype().is_nested() {
2157 &c.rechunk()
2158 } else {
2159 c
2160 };
2161
2162 (0..idx.len().div_ceil(stride))
2163 .into_par_iter()
2164 .map(|i| {
2165 let idx = &idx[i * stride..];
2166 let idx = &idx[..idx.len().min(stride)];
2167 c.take_slice_unchecked(idx)
2168 })
2169 .reduce(
2170 || Column::new_empty(c.name().clone(), c.dtype()),
2171 |mut a, b| {
2172 a.append_owned(b).unwrap();
2173 a
2174 },
2175 )
2176 })
2177 } else {
2178 self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2179 }
2180 } else {
2181 self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2182 }
2183 })
2184 } else {
2185 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2186 };
2187 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2188 }
2189
2190 /// Rename a column in the [`DataFrame`].
2191 ///
2192 /// Should not be called in a loop as that can lead to quadratic behavior.
2193 ///
2194 /// # Example
2195 ///
2196 /// ```
2197 /// # use polars_core::prelude::*;
2198 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2199 /// let original_name = "foo";
2200 /// let new_name = "bar";
2201 /// df.rename(original_name, new_name.into())
2202 /// }
2203 /// ```
2204 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2205 if column == name.as_str() {
2206 return Ok(self);
2207 }
2208 polars_ensure!(
2209 !self.schema().contains(&name),
2210 Duplicate: "column rename attempted with already existing name \"{name}\""
2211 );
2212
2213 self.get_column_index(column)
2214 .and_then(|idx| self.columns.get_mut(idx))
2215 .ok_or_else(|| polars_err!(col_not_found = column))
2216 .map(|c| c.rename(name))?;
2217 self.clear_schema();
2218
2219 Ok(self)
2220 }
2221
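/// Rename multiple columns at once.
///
/// Every `(from, to)` pair behaves like [`DataFrame::rename`], but the cached schema is
/// rebuilt only once, so prefer this over calling `rename` in a loop.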
2222 pub fn rename_many<'a>(
2223 &mut self,
2224 renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
2225 ) -> PolarsResult<&mut Self> {
2226 let mut schema = self.schema().as_ref().clone();
2227 self.clear_schema();
2228
2229 for (from, to) in renames {
2230 if from == to.as_str() {
2231 continue;
2232 }
2233
2234 polars_ensure!(
2235 !schema.contains(&to),
2236 Duplicate: "column rename attempted with already existing name \"{to}\""
2237 );
2238
2239 match schema.get_full(from) {
2240 None => polars_bail!(col_not_found = from),
2241 Some((idx, _, _)) => {
2242 let (n, _) = schema.get_at_index_mut(idx).unwrap();
2243 *n = to.clone();
2244 self.columns.get_mut(idx).unwrap().rename(to);
2245 },
2246 }
2247 }
2248
2249 self.cached_schema = OnceLock::from(Arc::new(schema));
2250 Ok(self)
2251 }
2252
2253 /// Sort [`DataFrame`] in place.
2254 ///
2255 /// See [`DataFrame::sort`] for more instruction.
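///
/// # Example
///
/// A minimal sketch; `"sepal_width"` is a placeholder column name:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
///     df.sort_in_place(["sepal_width"], SortMultipleOptions::default())
/// }
/// ```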
2256 pub fn sort_in_place(
2257 &mut self,
2258 by: impl IntoVec<PlSmallStr>,
2259 sort_options: SortMultipleOptions,
2260 ) -> PolarsResult<&mut Self> {
2261 let by_column = self.select_columns(by)?;
2262 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2263 Ok(self)
2264 }
2265
2266 #[doc(hidden)]
2267 /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2268 pub fn sort_impl(
2269 &self,
2270 by_column: Vec<Column>,
2271 sort_options: SortMultipleOptions,
2272 slice: Option<(i64, usize)>,
2273 ) -> PolarsResult<Self> {
2274 if by_column.is_empty() {
2275 // If no columns selected, any order (including original order) is correct.
2276 return if let Some((offset, len)) = slice {
2277 Ok(self.slice(offset, len))
2278 } else {
2279 Ok(self.clone())
2280 };
2281 }
2282
2283 // Note that the by_column argument may also contain evaluated expressions from
2284 // polars-lazy that are not present in this dataframe. Therefore, when we try to
2285 // set the first column as sorted, we ignore the error, as those expressions are
2286 // not present (they are renamed to _POLARS_SORT_COLUMN_i).
2287 let first_descending = sort_options.descending[0];
2288 let first_by_column = by_column[0].name().to_string();
2289
2290 let set_sorted = |df: &mut DataFrame| {
2291 // Mark the first sort column as sorted; if the column does not exist it
2292 // is ok, because we sorted by an expression not present in the dataframe
2293 let _ = df.apply(&first_by_column, |s| {
2294 let mut s = s.clone();
2295 if first_descending {
2296 s.set_sorted_flag(IsSorted::Descending)
2297 } else {
2298 s.set_sorted_flag(IsSorted::Ascending)
2299 }
2300 s
2301 });
2302 };
2303 if self.is_empty() {
2304 let mut out = self.clone();
2305 set_sorted(&mut out);
2306 return Ok(out);
2307 }
2308
2309 if let Some((0, k)) = slice {
2310 if k < self.len() {
2311 return self.bottom_k_impl(k, by_column, sort_options);
2312 }
2313 }
2314 // Check if the required column is already sorted; if so we can exit early.
2315 // We only do this when there is a single column to sort by; for multiple
2316 // columns it would be considerably more complicated.
2317 #[cfg(feature = "dtype-categorical")]
2318 let is_not_categorical_enum =
2319 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2320 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2321
2322 #[cfg(not(feature = "dtype-categorical"))]
2323 #[allow(non_upper_case_globals)]
2324 const is_not_categorical_enum: bool = true;
2325
2326 if by_column.len() == 1 && is_not_categorical_enum {
2327 let required_sorting = if sort_options.descending[0] {
2328 IsSorted::Descending
2329 } else {
2330 IsSorted::Ascending
2331 };
2332 // If the null count is 0 then nulls_last doesn't matter.
2333 // Safe to get the value at the last position since the dataframe is not empty (checked above).
2334 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2335 && ((by_column[0].null_count() == 0)
2336 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2337 == sort_options.nulls_last[0]);
2338
2339 if no_sorting_required {
2340 return if let Some((offset, len)) = slice {
2341 Ok(self.slice(offset, len))
2342 } else {
2343 Ok(self.clone())
2344 };
2345 }
2346 }
2347
2348 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2349 let allow_threads = sort_options.multithreaded;
2350
2351 // a lot of indirection in both sorting and take
2352 let mut df = self.clone();
2353 let df = df.as_single_chunk_par();
2354 let mut take = match (by_column.len(), has_nested) {
2355 (1, false) => {
2356 let s = &by_column[0];
2357 let options = SortOptions {
2358 descending: sort_options.descending[0],
2359 nulls_last: sort_options.nulls_last[0],
2360 multithreaded: sort_options.multithreaded,
2361 maintain_order: sort_options.maintain_order,
2362 limit: sort_options.limit,
2363 };
2364 // fast path for a frame with a single series
2365 // no need to compute the sort indices and then take by these indices
2366 // simply sort and return as frame
2367 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2368 let mut out = s.sort_with(options)?;
2369 if let Some((offset, len)) = slice {
2370 out = out.slice(offset, len);
2371 }
2372 return Ok(out.into_frame());
2373 }
2374 s.arg_sort(options)
2375 },
2376 _ => arg_sort(&by_column, sort_options)?,
2377 };
2378
2379 if let Some((offset, len)) = slice {
2380 take = take.slice(offset, len);
2381 }
2382
2383 // SAFETY:
2384 // the created indices are in bounds
2385 let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
2386 set_sorted(&mut df);
2387 Ok(df)
2388 }
2389
2390 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2391 ///
2392 /// This dataframe does not necessarily have a specified schema and may be changed at any
2393 /// point. It is primarily used for debugging.
2394 pub fn _to_metadata(&self) -> DataFrame {
2395 let num_columns = self.columns.len();
2396
2397 let mut column_names =
2398 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2399 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2400 let mut sorted_asc_ca =
2401 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2402 let mut sorted_dsc_ca =
2403 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2404 let mut fast_explode_list_ca =
2405 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2406 let mut materialized_at_ca =
2407 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2408
2409 for col in &self.columns {
2410 let flags = col.get_flags();
2411
2412 let (repr, materialized_at) = match col {
2413 Column::Series(s) => ("series", s.materialized_at()),
2414 Column::Scalar(_) => ("scalar", None),
2415 };
2416 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2417 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2418 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2419
2420 column_names.append_value(col.name().clone());
2421 repr_ca.append_value(repr);
2422 sorted_asc_ca.append_value(sorted_asc);
2423 sorted_dsc_ca.append_value(sorted_dsc);
2424 fast_explode_list_ca.append_value(fast_explode_list);
2425 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2426 }
2427
2428 unsafe {
2429 DataFrame::new_no_checks(
2430 self.width(),
2431 vec![
2432 column_names.finish().into_column(),
2433 repr_ca.finish().into_column(),
2434 sorted_asc_ca.finish().into_column(),
2435 sorted_dsc_ca.finish().into_column(),
2436 fast_explode_list_ca.finish().into_column(),
2437 materialized_at_ca.finish().into_column(),
2438 ],
2439 )
2440 }
2441 }
2442
2443 /// Return a sorted clone of this [`DataFrame`].
2444 ///
2445 /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2446 /// # Example
2447 ///
2448 /// Sort by a single column with default options:
2449 /// ```
2450 /// # use polars_core::prelude::*;
2451 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2452 /// df.sort(["sepal_width"], Default::default())
2453 /// }
2454 /// ```
2455 /// Sort by a single column with specific order:
2456 /// ```
2457 /// # use polars_core::prelude::*;
2458 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2459 /// df.sort(
2460 /// ["sepal_width"],
2461 /// SortMultipleOptions::new()
2462 /// .with_order_descending(descending)
2463 /// )
2464 /// }
2465 /// ```
2466 /// Sort by multiple columns with specifying order for each column:
2467 /// ```
2468 /// # use polars_core::prelude::*;
2469 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2470 /// df.sort(
2471 /// ["sepal_width", "sepal_length"],
2472 /// SortMultipleOptions::new()
2473 /// .with_order_descending_multi([false, true])
2474 /// )
2475 /// }
2476 /// ```
2477 /// See [`SortMultipleOptions`] for more options.
2478 ///
2479 /// Also see [`DataFrame::sort_in_place`].
2480 pub fn sort(
2481 &self,
2482 by: impl IntoVec<PlSmallStr>,
2483 sort_options: SortMultipleOptions,
2484 ) -> PolarsResult<Self> {
2485 let mut df = self.clone();
2486 df.sort_in_place(by, sort_options)?;
2487 Ok(df)
2488 }
2489
2490 /// Replace a column with a [`Series`].
2491 ///
2492 /// # Example
2493 ///
2494 /// ```rust
2495 /// # use polars_core::prelude::*;
2496 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2497 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2498 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2499 ///
2500 /// assert!(df.replace("Nation", s.clone()).is_err());
2501 /// assert!(df.replace("Country", s).is_ok());
2502 /// # Ok::<(), PolarsError>(())
2503 /// ```
2504 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2505 self.apply(column, |_| new_col.into_series())
2506 }
2507
2508 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2509 /// is that the value of `column` determines the name of the column and not the name
2510 /// of the `Series` passed to this method.
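///
/// # Example
///
/// A minimal compile-only sketch; `"values"` is a placeholder column name:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
///     // The column is added (or replaced) under the name "values",
///     // regardless of the name of the passed Series.
///     df.replace_or_add("values".into(), Series::new("ignored".into(), [1, 2, 3]))
/// }
/// ```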
2511 pub fn replace_or_add<S: IntoSeries>(
2512 &mut self,
2513 column: PlSmallStr,
2514 new_col: S,
2515 ) -> PolarsResult<&mut Self> {
2516 let mut new_col = new_col.into_series();
2517 new_col.rename(column);
2518 self.with_column(new_col)
2519 }
2520
2521 /// Replace column at index `idx` with a [`Series`].
2522 ///
2523 /// # Example
2524 ///
2525 /// ```ignore
2526 /// # use polars_core::prelude::*;
2527 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2528 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2529 /// let mut df = DataFrame::new(vec![s0, s1])?;
2530 ///
2531 /// // Add 32 to get lowercase ascii values
2532 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2533 /// # Ok::<(), PolarsError>(())
2534 /// ```
2535 pub fn replace_column<C: IntoColumn>(
2536 &mut self,
2537 index: usize,
2538 new_column: C,
2539 ) -> PolarsResult<&mut Self> {
2540 polars_ensure!(
2541 index < self.width(),
2542 ShapeMismatch:
2543 "unable to replace at index {}, the DataFrame has only {} columns",
2544 index, self.width(),
2545 );
2546 let mut new_column = new_column.into_column();
2547 polars_ensure!(
2548 new_column.len() == self.height(),
2549 ShapeMismatch:
2550 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2551 new_column.len(), self.height(),
2552 );
2553 let old_col = &mut self.columns[index];
2554 mem::swap(old_col, &mut new_column);
2555 self.clear_schema();
2556 Ok(self)
2557 }
2558
2559 /// Apply a closure to a column. This is the recommended way to do in place modification.
2560 ///
2561 /// # Example
2562 ///
2563 /// ```rust
2564 /// # use polars_core::prelude::*;
2565 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2566 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2567 /// let mut df = DataFrame::new(vec![s0, s1])?;
2568 ///
2569 /// fn str_to_len(str_val: &Column) -> Column {
2570 /// str_val.str()
2571 /// .unwrap()
2572 /// .into_iter()
2573 /// .map(|opt_name: Option<&str>| {
2574 /// opt_name.map(|name: &str| name.len() as u32)
2575 /// })
2576 /// .collect::<UInt32Chunked>()
2577 /// .into_column()
2578 /// }
2579 ///
2580 /// // Replace the names column by the length of the names.
2581 /// df.apply("names", str_to_len);
2582 /// # Ok::<(), PolarsError>(())
2583 /// ```
2584 /// Results in:
2585 ///
2586 /// ```text
2587 /// +--------+-------+
2588 /// | foo    | names |
2589 /// | ---    | ---   |
2590 /// | str | u32 |
2591 /// +========+=======+
2592 /// | "ham" | 4 |
2593 /// +--------+-------+
2594 /// | "spam" | 6 |
2595 /// +--------+-------+
2596 /// | "egg" | 3 |
2597 /// +--------+-------+
2598 /// ```
2599 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2600 where
2601 F: FnOnce(&Column) -> C,
2602 C: IntoColumn,
2603 {
2604 let idx = self.check_name_to_idx(name)?;
2605 self.apply_at_idx(idx, f)?;
2606 Ok(self)
2607 }
2608
2609 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2610 /// modification.
2611 ///
2612 /// # Example
2613 ///
2614 /// ```rust
2615 /// # use polars_core::prelude::*;
2616 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2617 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2618 /// let mut df = DataFrame::new(vec![s0, s1])?;
2619 ///
2620 /// // Add 32 to get lowercase ascii values
2621 /// df.apply_at_idx(1, |s| s + 32);
2622 /// # Ok::<(), PolarsError>(())
2623 /// ```
2624 /// Results in:
2625 ///
2626 /// ```text
2627 /// +--------+-------+
2628 /// | foo | ascii |
2629 /// | --- | --- |
2630 /// | str | i32 |
2631 /// +========+=======+
2632 /// | "ham" | 102 |
2633 /// +--------+-------+
2634 /// | "spam" | 111 |
2635 /// +--------+-------+
2636 /// | "egg" | 111 |
2637 /// +--------+-------+
2638 /// ```
2639 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2640 where
2641 F: FnOnce(&Column) -> C,
2642 C: IntoColumn,
2643 {
2644 let df_height = self.height();
2645 let width = self.width();
2646 let col = self.columns.get_mut(idx).ok_or_else(|| {
2647 polars_err!(
2648 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2649 idx, width
2650 )
2651 })?;
2652 let name = col.name().clone();
2653 let dtype_before = col.dtype().clone();
2654 let new_col = f(col).into_column();
2655 match new_col.len() {
2656 1 => {
2657 let new_col = new_col.new_from_index(0, df_height);
2658 let _ = mem::replace(col, new_col);
2659 },
2660 len if (len == df_height) => {
2661 let _ = mem::replace(col, new_col);
2662 },
2663 len => polars_bail!(
2664 ShapeMismatch:
2665 "resulting Series has length {} while the DataFrame has height {}",
2666 len, df_height
2667 ),
2668 }
2669
2670 // make sure the name remains the same after applying the closure
2671 unsafe {
2672 let col = self.columns.get_unchecked_mut(idx);
2673 col.rename(name);
2674
2675 if col.dtype() != &dtype_before {
2676 self.clear_schema();
2677 }
2678 }
2679 Ok(self)
2680 }
2681
2682 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2683 /// modification.
2684 ///
2685 /// # Example
2686 ///
2687 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2688 ///
2689 /// ```rust
2690 /// # use polars_core::prelude::*;
2691 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2692 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2693 /// let mut df = DataFrame::new(vec![s0, s1])?;
2694 ///
2695 /// let idx = vec![0, 1, 4];
2696 ///
2697 /// df.try_apply("foo", |c| {
2698 /// c.str()?
2699 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2700 /// });
2701 /// # Ok::<(), PolarsError>(())
2702 /// ```
2703 /// Results in:
2704 ///
2705 /// ```text
2706 /// +---------------------+--------+
2707 /// | foo | values |
2708 /// | --- | --- |
2709 /// | str | i32 |
2710 /// +=====================+========+
2711 /// | "ham-is-modified" | 1 |
2712 /// +---------------------+--------+
2713 /// | "spam-is-modified" | 2 |
2714 /// +---------------------+--------+
2715 /// | "egg" | 3 |
2716 /// +---------------------+--------+
2717 /// | "bacon" | 4 |
2718 /// +---------------------+--------+
2719 /// | "quack-is-modified" | 5 |
2720 /// +---------------------+--------+
2721 /// ```
2722 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2723 where
2724 F: FnOnce(&Column) -> PolarsResult<C>,
2725 C: IntoColumn,
2726 {
2727 let width = self.width();
2728 let col = self.columns.get_mut(idx).ok_or_else(|| {
2729 polars_err!(
2730 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2731 idx, width
2732 )
2733 })?;
2734 let name = col.name().clone();
2735
2736 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2737
2738 // make sure the name remains the same after applying the closure
2739 unsafe {
2740 let col = self.columns.get_unchecked_mut(idx);
2741 col.rename(name);
2742 }
2743 Ok(self)
2744 }
2745
2746 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2747 /// modification.
2748 ///
2749 /// # Example
2750 ///
2751 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2752 ///
2753 /// ```rust
2754 /// # use polars_core::prelude::*;
2755 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2756 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2757 /// let mut df = DataFrame::new(vec![s0, s1])?;
2758 ///
2759 /// // create a mask
2760 /// let values = df.column("values")?.as_materialized_series();
2761 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2762 ///
2763 /// df.try_apply("foo", |c| {
2764 /// c.str()?
2765 /// .set(&mask, Some("not_within_bounds"))
2766 /// });
2767 /// # Ok::<(), PolarsError>(())
2768 /// ```
2769 /// Results in:
2770 ///
2771 /// ```text
2772 /// +---------------------+--------+
2773 /// | foo | values |
2774 /// | --- | --- |
2775 /// | str | i32 |
2776 /// +=====================+========+
2777 /// | "not_within_bounds" | 1 |
2778 /// +---------------------+--------+
2779 /// | "spam" | 2 |
2780 /// +---------------------+--------+
2781 /// | "egg" | 3 |
2782 /// +---------------------+--------+
2783 /// | "bacon" | 4 |
2784 /// +---------------------+--------+
2785 /// | "not_within_bounds" | 5 |
2786 /// +---------------------+--------+
2787 /// ```
2788 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2789 where
2790 F: FnOnce(&Series) -> PolarsResult<C>,
2791 C: IntoColumn,
2792 {
2793 let idx = self.try_get_column_index(column)?;
2794 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2795 }
2796
2797 /// Slice the [`DataFrame`] along the rows.
2798 ///
2799 /// # Example
2800 ///
2801 /// ```rust
2802 /// # use polars_core::prelude::*;
2803 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2804 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2805 /// let sl: DataFrame = df.slice(2, 3);
2806 ///
2807 /// assert_eq!(sl.shape(), (3, 2));
2808 /// println!("{}", sl);
2809 /// # Ok::<(), PolarsError>(())
2810 /// ```
2811 /// Output:
2812 /// ```text
2813 /// shape: (3, 2)
2814 /// +-------+-------+
2815 /// | Fruit | Color |
2816 /// | --- | --- |
2817 /// | str | str |
2818 /// +=======+=======+
2819 /// | Grape | White |
2820 /// +-------+-------+
2821 /// | Fig | White |
2822 /// +-------+-------+
2823 /// | Fig | Red |
2824 /// +-------+-------+
2825 /// ```
2826 #[must_use]
2827 pub fn slice(&self, offset: i64, length: usize) -> Self {
2828 if offset == 0 && length == self.height() {
2829 return self.clone();
2830 }
2831 if length == 0 {
2832 return self.clear();
2833 }
2834 let cols = self
2835 .columns
2836 .iter()
2837 .map(|s| s.slice(offset, length))
2838 .collect::<Vec<_>>();
2839
2840 let height = if let Some(fst) = cols.first() {
2841 fst.len()
2842 } else {
2843 let (_, length) = slice_offsets(offset, length, self.height());
2844 length
2845 };
2846
2847 unsafe { DataFrame::new_no_checks(height, cols) }
2848 }
2849
2850 /// Split [`DataFrame`] at the given `offset`.
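///
/// # Example
///
/// A minimal sketch of the behaviour:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3, 4])?;
/// let (left, right) = df.split_at(1);
/// assert_eq!(left.height(), 1);
/// assert_eq!(right.height(), 3);
/// # Ok::<(), PolarsError>(())
/// ```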
2851 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2852 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2853
2854 let (idx, _) = slice_offsets(offset, 0, self.height());
2855
2856 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2857 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2858 (a, b)
2859 }
2860
2861 #[must_use]
2862 pub fn clear(&self) -> Self {
2863 let cols = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2864 unsafe { DataFrame::new_no_checks(0, cols) }
2865 }
2866
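/// Parallel variant of [`DataFrame::slice`] that slices the columns in parallel.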
2867 #[must_use]
2868 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2869 if offset == 0 && length == self.height() {
2870 return self.clone();
2871 }
2872 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2873 unsafe { DataFrame::new_no_checks(length, columns) }
2874 }
2875
2876 #[must_use]
2877 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2878 if offset == 0 && length == self.height() {
2879 return self.clone();
2880 }
2881 // @scalar-opt
2882 let columns = self._apply_columns(&|s| {
2883 let mut out = s.slice(offset, length);
2884 out.shrink_to_fit();
2885 out
2886 });
2887 unsafe { DataFrame::new_no_checks(length, columns) }
2888 }
2889
2890 /// Get the head of the [`DataFrame`].
2891 ///
2892 /// # Example
2893 ///
2894 /// ```rust
2895 /// # use polars_core::prelude::*;
2896 /// let countries: DataFrame =
2897 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2898 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2899 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2900 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2901 /// assert_eq!(countries.shape(), (5, 4));
2902 ///
2903 /// println!("{}", countries.head(Some(3)));
2904 /// # Ok::<(), PolarsError>(())
2905 /// ```
2906 ///
2907 /// Output:
2908 ///
2909 /// ```text
2910 /// shape: (3, 4)
2911 /// +--------------------+---------------+---------------+------------+
2912 /// | Rank by GDP (2021) | Continent | Country | Capital |
2913 /// | --- | --- | --- | --- |
2914 /// | i32 | str | str | str |
2915 /// +====================+===============+===============+============+
2916 /// | 1 | North America | United States | Washington |
2917 /// +--------------------+---------------+---------------+------------+
2918 /// | 2 | Asia | China | Beijing |
2919 /// +--------------------+---------------+---------------+------------+
2920 /// | 3 | Asia | Japan | Tokyo |
2921 /// +--------------------+---------------+---------------+------------+
2922 /// ```
2923 #[must_use]
2924 pub fn head(&self, length: Option<usize>) -> Self {
2925 let cols = self
2926 .columns
2927 .iter()
2928 .map(|c| c.head(length))
2929 .collect::<Vec<_>>();
2930
2931 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2932 let height = usize::min(height, self.height());
2933 unsafe { DataFrame::new_no_checks(height, cols) }
2934 }
2935
2936 /// Get the tail of the [`DataFrame`].
2937 ///
2938 /// # Example
2939 ///
2940 /// ```rust
2941 /// # use polars_core::prelude::*;
2942 /// let countries: DataFrame =
2943 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2944 /// "Apple Price (ā¬/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2945 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2946 /// assert_eq!(countries.shape(), (5, 3));
2947 ///
2948 /// println!("{}", countries.tail(Some(2)));
2949 /// # Ok::<(), PolarsError>(())
2950 /// ```
2951 ///
2952 /// Output:
2953 ///
2954 /// ```text
2955 /// shape: (2, 3)
2956 /// +-------------+--------------------+---------+
2957 /// | Rank (2021) | Apple Price (€/kg) | Country |
2958 /// | --- | --- | --- |
2959 /// | i32 | f64 | str |
2960 /// +=============+====================+=========+
2961 /// | 108 | 0.65 | Syria |
2962 /// +-------------+--------------------+---------+
2963 /// | 109 | 0.52 | Turkey |
2964 /// +-------------+--------------------+---------+
2965 /// ```
2966 #[must_use]
2967 pub fn tail(&self, length: Option<usize>) -> Self {
2968 let cols = self
2969 .columns
2970 .iter()
2971 .map(|c| c.tail(length))
2972 .collect::<Vec<_>>();
2973
2974 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2975 let height = usize::min(height, self.height());
2976 unsafe { DataFrame::new_no_checks(height, cols) }
2977 }
2978
2979 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2980 ///
2981 /// # Panics
2982 ///
2983 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2984 ///
2985 /// This responsibility is left to the caller as we don't want to take mutable references here,
2986 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2987 /// as well.
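///
/// # Example
///
/// A minimal compile-only sketch (assumes `df` is already rechunked, as required above):
///
/// ```
/// # use polars_core::prelude::*;
/// fn count_batches(df: &DataFrame) -> usize {
///     df.iter_chunks(CompatLevel::newest(), false).count()
/// }
/// ```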
2988 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2989 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2990 // We only allow parallelism if we must convert binview columns to the older
2991 // arrow string/binary representation, as that requires allocating new buffers.
2992 let must_convert = compat_level.0 == 0;
2993 let parallel = parallel
2994 && must_convert
2995 && self.columns.len() > 1
2996 && self
2997 .columns
2998 .iter()
2999 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
3000
3001 RecordBatchIter {
3002 columns: &self.columns,
3003 schema: Arc::new(
3004 self.columns
3005 .iter()
3006 .map(|c| c.field().to_arrow(compat_level))
3007 .collect(),
3008 ),
3009 idx: 0,
3010 n_chunks: self.first_col_n_chunks(),
3011 compat_level,
3012 parallel,
3013 }
3014 }
3015
3016 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
3017 ///
3018 /// # Panics
3019 ///
3020 /// Panics if the [`DataFrame`] that is passed is not rechunked.
3021 ///
3022 /// This responsibility is left to the caller as we don't want to take mutable references here,
3023 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
3024 /// as well.
3025 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
3026 debug_assert!(!self.should_rechunk());
3027 PhysRecordBatchIter {
3028 schema: Arc::new(
3029 self.get_columns()
3030 .iter()
3031 .map(|c| c.field().to_arrow(CompatLevel::newest()))
3032 .collect(),
3033 ),
3034 arr_iters: self
3035 .materialized_column_iter()
3036 .map(|s| s.chunks().iter())
3037 .collect(),
3038 }
3039 }
3040
3041 /// Get a [`DataFrame`] with all the columns in reversed order.
3042 #[must_use]
3043 pub fn reverse(&self) -> Self {
3044 let cols = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
3045 unsafe { DataFrame::new_no_checks(self.height(), cols) }
3046 }
3047
3048 /// Shift the values by a given period and fill the parts that will be empty due to this operation
3049 /// with `Nones`.
3050 ///
3051 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
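///
/// # Example
///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// let shifted = df.shift(1);
/// // The first value is now null.
/// assert_eq!(shifted.column("a")?.null_count(), 1);
/// # Ok::<(), PolarsError>(())
/// ```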
3052 #[must_use]
3053 pub fn shift(&self, periods: i64) -> Self {
3054 let col = self._apply_columns_par(&|s| s.shift(periods));
3055 unsafe { DataFrame::new_no_checks(self.height(), col) }
3056 }
3057
3058 /// Replace None values with one of the following strategies:
3059 /// * Forward fill (replace None with the previous value)
3060 /// * Backward fill (replace None with the next value)
3061 /// * Mean fill (replace None with the mean of the whole array)
3062 /// * Min fill (replace None with the minimum of the whole array)
3063 /// * Max fill (replace None with the maximum of the whole array)
3064 ///
3065 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
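///
/// # Example
///
/// A minimal sketch using the max-fill strategy listed above:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)])?;
/// let filled = df.fill_null(FillNullStrategy::Max)?;
/// assert_eq!(filled.column("a")?.null_count(), 0);
/// # Ok::<(), PolarsError>(())
/// ```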
3066 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3067 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3068
3069 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3070 }
3071
3072 /// Pipe different functions/closures that work on a DataFrame together.
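///
/// # Example
///
/// A minimal sketch; `shape_of` is an arbitrary illustrative function:
///
/// ```
/// # use polars_core::prelude::*;
/// fn shape_of(df: DataFrame) -> PolarsResult<(usize, usize)> {
///     Ok(df.shape())
/// }
///
/// fn example(df: DataFrame) -> PolarsResult<(usize, usize)> {
///     df.pipe(shape_of)
/// }
/// ```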
3073 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3074 where
3075 F: Fn(DataFrame) -> PolarsResult<B>,
3076 {
3077 f(self)
3078 }
3079
3080 /// Pipe different functions/closures that work on a DataFrame together.
3081 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3082 where
3083 F: Fn(&mut DataFrame) -> PolarsResult<B>,
3084 {
3085 f(self)
3086 }
3087
3088 /// Pipe different functions/closures that work on a DataFrame together.
3089 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3090 where
3091 F: Fn(DataFrame, Args) -> PolarsResult<B>,
3092 {
3093 f(self, args)
3094 }
3095
3096 /// Drop duplicate rows from a [`DataFrame`].
3097 /// *This fails when there is a column of type List in the DataFrame.*
3098 ///
3099 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3100 ///
3101 /// # Example
3102 ///
3103 /// ```no_run
3104 /// # use polars_core::prelude::*;
3105 /// let df = df! {
3106 /// "flt" => [1., 1., 2., 2., 3., 3.],
3107 /// "int" => [1, 1, 2, 2, 3, 3, ],
3108 /// "str" => ["a", "a", "b", "b", "c", "c"]
3109 /// }?;
3110 ///
3111 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3112 /// # Ok::<(), PolarsError>(())
3113 /// ```
3114 /// Returns
3115 ///
3116 /// ```text
3117 /// +-----+-----+-----+
3118 /// | flt | int | str |
3119 /// | --- | --- | --- |
3120 /// | f64 | i32 | str |
3121 /// +=====+=====+=====+
3122 /// | 1 | 1 | "a" |
3123 /// +-----+-----+-----+
3124 /// | 2 | 2 | "b" |
3125 /// +-----+-----+-----+
3126 /// | 3 | 3 | "c" |
3127 /// +-----+-----+-----+
3128 /// ```
3129 #[cfg(feature = "algorithm_group_by")]
3130 pub fn unique_stable(
3131 &self,
3132 subset: Option<&[String]>,
3133 keep: UniqueKeepStrategy,
3134 slice: Option<(i64, usize)>,
3135 ) -> PolarsResult<DataFrame> {
3136 self.unique_impl(
3137 true,
3138 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3139 keep,
3140 slice,
3141 )
3142 }
3143
3144 /// Unstable distinct. See [`DataFrame::unique_stable`].
3145 #[cfg(feature = "algorithm_group_by")]
3146 pub fn unique<I, S>(
3147 &self,
3148 subset: Option<&[String]>,
3149 keep: UniqueKeepStrategy,
3150 slice: Option<(i64, usize)>,
3151 ) -> PolarsResult<DataFrame> {
3152 self.unique_impl(
3153 false,
3154 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3155 keep,
3156 slice,
3157 )
3158 }
3159
3160 #[cfg(feature = "algorithm_group_by")]
3161 pub fn unique_impl(
3162 &self,
3163 maintain_order: bool,
3164 subset: Option<Vec<PlSmallStr>>,
3165 keep: UniqueKeepStrategy,
3166 slice: Option<(i64, usize)>,
3167 ) -> PolarsResult<Self> {
3168 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3169 let mut df = self.clone();
3170 // take on multiple chunks is terrible
3171 df.as_single_chunk_par();
3172
3173 let columns = match (keep, maintain_order) {
3174 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3175 let gb = df.group_by_stable(names)?;
3176 let groups = gb.get_groups();
3177 let (offset, len) = slice.unwrap_or((0, groups.len()));
3178 let groups = groups.slice(offset, len);
3179 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3180 },
3181 (UniqueKeepStrategy::Last, true) => {
3182 // We need to maintain order based on the last occurrence, so the stable groups
3183 // cannot be used directly as they are ordered by the first occurrence.
3184 let gb = df.group_by_stable(names)?;
3185 let groups = gb.get_groups();
3186
3187 let last_idx: NoNull<IdxCa> = groups
3188 .iter()
3189 .map(|g| match g {
3190 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3191 GroupsIndicator::Slice([first, len]) => first + len - 1,
3192 })
3193 .collect();
3194
3195 let mut last_idx = last_idx.into_inner().sort(false);
3196
3197 if let Some((offset, len)) = slice {
3198 last_idx = last_idx.slice(offset, len);
3199 }
3200
3201 let last_idx = NoNull::new(last_idx);
3202 let out = unsafe { df.take_unchecked(&last_idx) };
3203 return Ok(out);
3204 },
3205 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3206 let gb = df.group_by(names)?;
3207 let groups = gb.get_groups();
3208 let (offset, len) = slice.unwrap_or((0, groups.len()));
3209 let groups = groups.slice(offset, len);
3210 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3211 },
3212 (UniqueKeepStrategy::Last, false) => {
3213 let gb = df.group_by(names)?;
3214 let groups = gb.get_groups();
3215 let (offset, len) = slice.unwrap_or((0, groups.len()));
3216 let groups = groups.slice(offset, len);
3217 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3218 },
3219 (UniqueKeepStrategy::None, _) => {
3220 let df_part = df.select(names)?;
3221 let mask = df_part.is_unique()?;
3222 let mut filtered = df.filter(&mask)?;
3223
3224 if let Some((offset, len)) = slice {
3225 filtered = filtered.slice(offset, len);
3226 }
3227 return Ok(filtered);
3228 },
3229 };
3230 let height = Self::infer_height(&columns);
3231 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3232 }
3233
3234 /// Get a mask of all the unique rows in the [`DataFrame`].
3235 ///
3236 /// # Example
3237 ///
3238 /// ```no_run
3239 /// # use polars_core::prelude::*;
3240 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3241 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3242 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3243 ///
3244 /// assert!(ca.all());
3245 /// # Ok::<(), PolarsError>(())
3246 /// ```
3247 #[cfg(feature = "algorithm_group_by")]
3248 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3249 let gb = self.group_by(self.get_column_names_owned())?;
3250 let groups = gb.get_groups();
3251 Ok(is_unique_helper(
3252 groups,
3253 self.height() as IdxSize,
3254 true,
3255 false,
3256 ))
3257 }
3258
3259 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3260 ///
3261 /// # Example
3262 ///
3263 /// ```no_run
3264 /// # use polars_core::prelude::*;
3265 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3266 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3267 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3268 ///
3269 /// assert!(!ca.all());
3270 /// # Ok::<(), PolarsError>(())
3271 /// ```
3272 #[cfg(feature = "algorithm_group_by")]
3273 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3274 let gb = self.group_by(self.get_column_names_owned())?;
3275 let groups = gb.get_groups();
3276 Ok(is_unique_helper(
3277 groups,
3278 self.height() as IdxSize,
3279 false,
3280 true,
3281 ))
3282 }
3283
3284 /// Create a new [`DataFrame`] that shows the null counts per column.
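///
/// # Example
///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)])?;
/// // One row, with one count per column.
/// assert_eq!(df.null_count().shape(), (1, 1));
/// # Ok::<(), PolarsError>(())
/// ```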
3285 #[must_use]
3286 pub fn null_count(&self) -> Self {
3287 let cols = self
3288 .columns
3289 .iter()
3290 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3291 .collect();
3292 unsafe { Self::new_no_checks(1, cols) }
3293 }
3294
3295 /// Hash and combine the row values
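///
/// # Example
///
/// A minimal compile-only sketch (using the default hasher):
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &mut DataFrame) -> PolarsResult<UInt64Chunked> {
///     df.hash_rows(None)
/// }
/// ```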
3296 #[cfg(feature = "row_hash")]
3297 pub fn hash_rows(
3298 &mut self,
3299 hasher_builder: Option<PlSeedableRandomStateQuality>,
3300 ) -> PolarsResult<UInt64Chunked> {
3301 let dfs = split_df(self, POOL.current_num_threads(), false);
3302 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3303
3304 let mut iter = cas.into_iter();
3305 let mut acc_ca = iter.next().unwrap();
3306 for ca in iter {
3307 acc_ca.append(&ca)?;
3308 }
3309 Ok(acc_ca.rechunk().into_owned())
3310 }
3311
3312 /// Get the supertype of the columns in this DataFrame
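///
/// # Example
///
/// A minimal sketch:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1, 2], "floats" => [1.0, 2.0])?;
/// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```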
3313 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3314 self.columns
3315 .iter()
3316 .map(|s| Ok(s.dtype().clone()))
3317 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3318 }
3319
3320 /// Take by index values given by the slice `idx`.
3321 /// # Warning
3322 /// Be careful with allowing threads when calling this in a large hot loop:
3323 /// every thread split may end up on the rayon stack and lead to a stack overflow (SO).
3324 #[doc(hidden)]
3325 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3326 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3327 }
3328
3329 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3330 /// if the index values in `idx` are sorted. This will maintain the sorted flags.
3331 ///
3332 /// # Warning
3333 /// Be careful with allowing threads when calling this in a large hot loop:
3334 /// every thread split may end up on the rayon stack and lead to a stack overflow (SO).
3335 #[doc(hidden)]
3336 pub unsafe fn _take_unchecked_slice_sorted(
3337 &self,
3338 idx: &[IdxSize],
3339 allow_threads: bool,
3340 sorted: IsSorted,
3341 ) -> Self {
3342 #[cfg(debug_assertions)]
3343 {
3344 if idx.len() > 2 {
3345 match sorted {
3346 IsSorted::Ascending => {
3347 assert!(idx[0] <= idx[idx.len() - 1]);
3348 },
3349 IsSorted::Descending => {
3350 assert!(idx[0] >= idx[idx.len() - 1]);
3351 },
3352 _ => {},
3353 }
3354 }
3355 }
3356 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3357 ca.set_sorted_flag(sorted);
3358 self.take_unchecked_impl(&ca, allow_threads)
3359 }
3360
3361 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3362 #[doc(hidden)]
3363 pub fn _partition_by_impl(
3364 &self,
3365 cols: &[PlSmallStr],
3366 stable: bool,
3367 include_key: bool,
3368 parallel: bool,
3369 ) -> PolarsResult<Vec<DataFrame>> {
3370 let selected_keys = self.select_columns(cols.iter().cloned())?;
3371 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3372 let groups = groups.into_groups();
3373
3374 // drop key columns prior to calculation if requested
3375 let df = if include_key {
3376 self.clone()
3377 } else {
3378 self.drop_many(cols.iter().cloned())
3379 };
3380
3381 if parallel {
3382 // Parallelize over the groups, but don't parallelize the gather itself:
3383 // there is a lot of parallelization in take and nesting it may easily cause a stack overflow.
3384 POOL.install(|| {
3385 match groups.as_ref() {
3386 GroupsType::Idx(idx) => {
3387 // Rechunk as the gather may rechunk for every group #17562.
3388 let mut df = df.clone();
3389 df.as_single_chunk_par();
3390 Ok(idx
3391 .into_par_iter()
3392 .map(|(_, group)| {
3393 // groups are in bounds
3394 unsafe {
3395 df._take_unchecked_slice_sorted(
3396 group,
3397 false,
3398 IsSorted::Ascending,
3399 )
3400 }
3401 })
3402 .collect())
3403 },
3404 GroupsType::Slice { groups, .. } => Ok(groups
3405 .into_par_iter()
3406 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3407 .collect()),
3408 }
3409 })
3410 } else {
3411 match groups.as_ref() {
3412 GroupsType::Idx(idx) => {
3413 // Rechunk as the gather may rechunk for every group #17562.
3414 let mut df = df;
3415 df.as_single_chunk();
3416 Ok(idx
3417 .into_iter()
3418 .map(|(_, group)| {
3419 // groups are in bounds
3420 unsafe {
3421 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3422 }
3423 })
3424 .collect())
3425 },
3426 GroupsType::Slice { groups, .. } => Ok(groups
3427 .iter()
3428 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3429 .collect()),
3430 }
3431 }
3432 }
3433
3434 /// Split into multiple DataFrames partitioned by groups.
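///
/// # Example
///
/// A minimal compile-only sketch; `"category"` is a placeholder key column:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<Vec<DataFrame>> {
///     // One DataFrame per distinct value of "category"; the key column is kept in the output.
///     df.partition_by(["category"], true)
/// }
/// ```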
3435 #[cfg(feature = "partition_by")]
3436 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3437 where
3438 I: IntoIterator<Item = S>,
3439 S: Into<PlSmallStr>,
3440 {
3441 let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
3442 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3443 }
3444
3445 /// Split into multiple DataFrames partitioned by groups.
3446 /// The order of the groups is maintained.
3447 #[cfg(feature = "partition_by")]
3448 pub fn partition_by_stable<I, S>(
3449 &self,
3450 cols: I,
3451 include_key: bool,
3452 ) -> PolarsResult<Vec<DataFrame>>
3453 where
3454 I: IntoIterator<Item = S>,
3455 S: Into<PlSmallStr>,
3456 {
3457 let cols: UnitVec<PlSmallStr> = cols.into_iter().map(Into::into).collect();
3458 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3459 }
3460
3461 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3462 /// inserted as columns.
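///
/// # Example
///
/// A minimal compile-only sketch; `"json"` is a placeholder name for an existing `Struct` column:
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     // Promote the fields of the struct column "json" to top-level columns.
///     df.unnest(["json"], None)
/// }
/// ```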
3463 #[cfg(feature = "dtype-struct")]
3464 pub fn unnest<I: IntoVec<PlSmallStr>>(
3465 &self,
3466 cols: I,
3467 separator: Option<&str>,
3468 ) -> PolarsResult<DataFrame> {
3469 let cols = cols.into_vec();
3470 self.unnest_impl(cols.into_iter().collect(), separator)
3471 }
3472
3473 #[cfg(feature = "dtype-struct")]
3474 fn unnest_impl(
3475 &self,
3476 cols: PlHashSet<PlSmallStr>,
3477 separator: Option<&str>,
3478 ) -> PolarsResult<DataFrame> {
3479 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3480 let mut count = 0;
3481 for s in &self.columns {
3482 if cols.contains(s.name()) {
3483 let ca = s.struct_()?.clone();
3484 new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3485 if let Some(separator) = &separator {
3486 f.rename(polars_utils::format_pl_smallstr!(
3487 "{}{}{}",
3488 s.name(),
3489 separator,
3490 f.name()
3491 ));
3492 }
3493 Column::from(f)
3494 }));
3495 count += 1;
3496 } else {
3497 new_cols.push(s.clone())
3498 }
3499 }
3500 if count != cols.len() {
3501 // one or more columns not found
3502 // the code below will return an error with the missing name
3503 let schema = self.schema();
3504 for col in cols {
3505 let _ = schema
3506 .get(col.as_str())
3507 .ok_or_else(|| polars_err!(col_not_found = col))?;
3508 }
3509 }
3510 DataFrame::new(new_cols)
3511 }
3512
3513 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3514 cols.first().map_or(0, Column::len)
3515 }
3516
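/// Append an Arrow [`RecordBatchT`] to this [`DataFrame`].
///
/// The schema of the record batch must match the schema of `self`.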
3517 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
        // `append_chunk` method or something like it; it is just quite difficult to make that safe.
3520 let df = DataFrame::from(rb);
        polars_ensure!(
            self.schema() == df.schema(),
            SchemaMismatch: "cannot append record batch with different schema\n\ngot: {:?}\nexpected: {:?}",
            df.schema(), self.schema(),
        );
3526 self.vstack_mut_owned_unchecked(df);
3527 Ok(())
3528 }
3529
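    /// Consume the `DataFrame` and return its columns as a `Vec<Column>`.
    ///
    /// A minimal usage sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3]).unwrap();
    /// let columns: Vec<Column> = df.into_columns();
    /// assert_eq!(columns.len(), 1);
    /// ```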
3530 pub fn into_columns(self) -> Vec<Column> {
3531 self.columns
3532 }
3533}
3534
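/// Iterator that converts each chunk of a `DataFrame`'s columns into an Arrow
/// [`RecordBatch`], optionally materializing the arrays in parallel.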
3535pub struct RecordBatchIter<'a> {
3536 columns: &'a Vec<Column>,
3537 schema: ArrowSchemaRef,
3538 idx: usize,
3539 n_chunks: usize,
3540 compat_level: CompatLevel,
3541 parallel: bool,
3542}
3543
3544impl Iterator for RecordBatchIter<'_> {
3545 type Item = RecordBatch;
3546
3547 fn next(&mut self) -> Option<Self::Item> {
3548 if self.idx >= self.n_chunks {
3549 return None;
3550 }
3551
3552 // Create a batch of the columns with the same chunk no.
3553 let batch_cols: Vec<ArrayRef> = if self.parallel {
3554 let iter = self
3555 .columns
3556 .par_iter()
3557 .map(Column::as_materialized_series)
3558 .map(|s| s.to_arrow(self.idx, self.compat_level));
3559 POOL.install(|| iter.collect())
3560 } else {
3561 self.columns
3562 .iter()
3563 .map(Column::as_materialized_series)
3564 .map(|s| s.to_arrow(self.idx, self.compat_level))
3565 .collect()
3566 };
3567 self.idx += 1;
3568
3569 let length = batch_cols.first().map_or(0, |arr| arr.len());
3570 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3571 }
3572
3573 fn size_hint(&self) -> (usize, Option<usize>) {
3574 let n = self.n_chunks - self.idx;
3575 (n, Some(n))
3576 }
3577}
3578
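/// Iterator that yields a `DataFrame`'s existing (physical) chunks as Arrow
/// [`RecordBatch`]es, cloning the underlying arrays without any conversion.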
3579pub struct PhysRecordBatchIter<'a> {
3580 schema: ArrowSchemaRef,
3581 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3582}
3583
3584impl Iterator for PhysRecordBatchIter<'_> {
3585 type Item = RecordBatch;
3586
3587 fn next(&mut self) -> Option<Self::Item> {
3588 let arrs = self
3589 .arr_iters
3590 .iter_mut()
3591 .map(|phys_iter| phys_iter.next().cloned())
3592 .collect::<Option<Vec<_>>>()?;
3593
3594 let length = arrs.first().map_or(0, |arr| arr.len());
3595 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3596 }
3597
3598 fn size_hint(&self) -> (usize, Option<usize>) {
3599 if let Some(iter) = self.arr_iters.first() {
3600 iter.size_hint()
3601 } else {
3602 (0, None)
3603 }
3604 }
3605}
3606
3607impl Default for DataFrame {
3608 fn default() -> Self {
3609 DataFrame::empty()
3610 }
3611}
3612
3613impl From<DataFrame> for Vec<Column> {
3614 fn from(df: DataFrame) -> Self {
3615 df.columns
3616 }
3617}
3618
3619// utility to test if we can vstack/extend the columns
3620fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3621 polars_ensure!(
3622 left.name() == right.name(),
3623 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3624 left.name(), right.name(),
3625 );
3626 Ok(())
3627}
3628
3629#[cfg(test)]
3630mod test {
3631 use super::*;
3632
3633 fn create_frame() -> DataFrame {
3634 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3635 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3636 DataFrame::new(vec![s0, s1]).unwrap()
3637 }
3638
3639 #[test]
3640 #[cfg_attr(miri, ignore)]
3641 fn test_recordbatch_iterator() {
3642 let df = df!(
3643 "foo" => [1, 2, 3, 4, 5]
3644 )
3645 .unwrap();
3646 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3647 assert_eq!(5, iter.next().unwrap().len());
3648 assert!(iter.next().is_none());
3649 }
3650
3651 #[test]
3652 #[cfg_attr(miri, ignore)]
3653 fn test_select() {
3654 let df = create_frame();
3655 assert_eq!(
3656 df.column("days")
3657 .unwrap()
3658 .as_series()
3659 .unwrap()
3660 .equal(1)
3661 .unwrap()
3662 .sum(),
3663 Some(1)
3664 );
3665 }
3666
3667 #[test]
3668 #[cfg_attr(miri, ignore)]
3669 fn test_filter_broadcast_on_string_col() {
3670 let col_name = "some_col";
3671 let v = vec!["test".to_string()];
3672 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3673 let mut df = DataFrame::new(vec![s0]).unwrap();
3674
3675 df = df
3676 .filter(
3677 &df.column(col_name)
3678 .unwrap()
3679 .as_materialized_series()
3680 .equal("")
3681 .unwrap(),
3682 )
3683 .unwrap();
3684 assert_eq!(
3685 df.column(col_name)
3686 .unwrap()
3687 .as_materialized_series()
3688 .n_chunks(),
3689 1
3690 );
3691 }
3692
3693 #[test]
3694 #[cfg_attr(miri, ignore)]
3695 fn test_filter_broadcast_on_list_col() {
3696 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3697 let ll: ListChunked = [&s1].iter().copied().collect();
3698
3699 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3700 let new = ll.filter(&mask).unwrap();
3701
3702 assert_eq!(new.chunks.len(), 1);
3703 assert_eq!(new.len(), 0);
3704 }
3705
3706 #[test]
3707 fn slice() {
3708 let df = create_frame();
3709 let sliced_df = df.slice(0, 2);
3710 assert_eq!(sliced_df.shape(), (2, 2));
3711 }
3712
3713 #[test]
3714 fn rechunk_false() {
3715 let df = create_frame();
3716 assert!(!df.should_rechunk())
3717 }
3718
3719 #[test]
3720 fn rechunk_true() -> PolarsResult<()> {
3721 let mut base = df!(
3722 "a" => [1, 2, 3],
3723 "b" => [1, 2, 3]
3724 )?;
3725
3726 // Create a series with multiple chunks
3727 let mut s = Series::new("foo".into(), 0..2);
3728 let s2 = Series::new("bar".into(), 0..1);
3729 s.append(&s2)?;
3730
3731 // Append series to frame
3732 let out = base.with_column(s)?;
3733
3734 // Now we should rechunk
3735 assert!(out.should_rechunk());
3736 Ok(())
3737 }
3738
3739 #[test]
3740 fn test_duplicate_column() {
3741 let mut df = df! {
3742 "foo" => [1, 2, 3]
3743 }
3744 .unwrap();
3745 // check if column is replaced
3746 assert!(
3747 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3748 .is_ok()
3749 );
3750 assert!(
3751 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3752 .is_ok()
3753 );
3754 assert!(df.column("bar").is_ok())
3755 }
3756
3757 #[test]
3758 #[cfg_attr(miri, ignore)]
3759 fn distinct() {
3760 let df = df! {
3761 "flt" => [1., 1., 2., 2., 3., 3.],
3762 "int" => [1, 1, 2, 2, 3, 3, ],
3763 "str" => ["a", "a", "b", "b", "c", "c"]
3764 }
3765 .unwrap();
3766 let df = df
3767 .unique_stable(None, UniqueKeepStrategy::First, None)
3768 .unwrap()
3769 .sort(["flt"], SortMultipleOptions::default())
3770 .unwrap();
3771 let valid = df! {
3772 "flt" => [1., 2., 3.],
3773 "int" => [1, 2, 3],
3774 "str" => ["a", "b", "c"]
3775 }
3776 .unwrap();
3777 assert!(df.equals(&valid));
3778 }
3779
3780 #[test]
3781 fn test_vstack() {
        // check that it does not accidentally rechunk
3783 let mut df = df! {
3784 "flt" => [1., 1., 2., 2., 3., 3.],
3785 "int" => [1, 1, 2, 2, 3, 3, ],
3786 "str" => ["a", "a", "b", "b", "c", "c"]
3787 }
3788 .unwrap();
3789
3790 df.vstack_mut(&df.slice(0, 3)).unwrap();
3791 assert_eq!(df.first_col_n_chunks(), 2)
3792 }
3793
3794 #[test]
3795 fn test_vstack_on_empty_dataframe() {
3796 let mut df = DataFrame::empty();
3797
3798 let df_data = df! {
3799 "flt" => [1., 1., 2., 2., 3., 3.],
3800 "int" => [1, 1, 2, 2, 3, 3, ],
3801 "str" => ["a", "a", "b", "b", "c", "c"]
3802 }
3803 .unwrap();
3804
3805 df.vstack_mut(&df_data).unwrap();
3806 assert_eq!(df.height, 6)
3807 }
3808
3809 #[test]
3810 fn test_replace_or_add() -> PolarsResult<()> {
3811 let mut df = df!(
3812 "a" => [1, 2, 3],
3813 "b" => [1, 2, 3]
3814 )?;
3815
3816 // check that the new column is "c" and not "bar".
3817 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3818
3819 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3820 Ok(())
3821 }
3822
3823 #[test]
3824 fn test_unique_keep_none_with_slice() {
3825 let df = df! {
3826 "x" => [1, 2, 3, 2, 1]
3827 }
3828 .unwrap();
3829 let out = df
3830 .unique_stable(
3831 Some(&["x".to_string()][..]),
3832 UniqueKeepStrategy::None,
3833 Some((0, 2)),
3834 )
3835 .unwrap();
3836 let expected = df! {
3837 "x" => [3]
3838 }
3839 .unwrap();
3840 assert!(out.equals(&expected));
3841 }
3842
3843 #[test]
3844 #[cfg(feature = "dtype-i8")]
3845 fn test_apply_result_schema() {
3846 let mut df = df! {
3847 "x" => [1, 2, 3, 2, 1]
3848 }
3849 .unwrap();
3850
3851 let schema_before = df.schema().clone();
3852 df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
3853 assert_ne!(&schema_before, df.schema());
3854 }
3855}