polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[strum(serialize_all = "snake_case")]
53pub enum UniqueKeepStrategy {
54 /// Keep the first unique row.
55 First,
56 /// Keep the last unique row.
57 Last,
58 /// Keep none of the unique rows.
59 None,
60 /// Keep any of the unique rows.
61 /// This allows more optimizations.
62 #[default]
63 Any,
64}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68 F: for<'a> FnMut(&'a T) -> &'a str,
69{
70 // Always unique.
71 if items.len() <= 1 {
72 return Ok(());
73 }
74
75 if items.len() <= 4 {
76 // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
77 for i in 0..items.len() - 1 {
78 let name = get_name(&items[i]);
79 for other in items.iter().skip(i + 1) {
80 if name == get_name(other) {
81 polars_bail!(duplicate = name);
82 }
83 }
84 }
85 } else {
86 let mut names = PlHashSet::with_capacity(items.len());
87 for item in items {
88 let name = get_name(item);
89 if !names.insert(name) {
90 polars_bail!(duplicate = name);
91 }
92 }
93 }
94 Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*; if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
119/// ## Wrapping a `Vec<Series>`
120///
121/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138/// "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148/// The `Index<usize>` is implemented for the `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153/// "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165/// "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
171#[derive(Clone)]
172pub struct DataFrame {
173 height: usize,
174 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
175 pub(crate) columns: Vec<Column>,
176
177 /// A cached schema. This might not give correct results if the DataFrame was modified in place
178 /// between obtaining the schema and reading it.
179 cached_schema: OnceLock<SchemaRef>,
180}
181
182impl DataFrame {
183 pub fn clear_schema(&mut self) {
184 self.cached_schema = OnceLock::new();
185 }
186
187 #[inline]
188 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189 self.columns.iter()
190 }
191
192 #[inline]
193 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194 self.columns.iter().map(Column::as_materialized_series)
195 }
196
197 #[inline]
198 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199 self.columns.par_iter().map(Column::as_materialized_series)
200 }
201
202 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203 ///
204 /// # Implementation
205 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
206 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
207 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208 ///
209 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
210 /// However, this function will yield a smaller number. This is because this function returns
211 /// the visible size of the buffer, not its total capacity.
212 ///
213 /// FFI buffers are included in this estimation.
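    ///
    /// # Example
    ///
    /// An illustrative sketch; the exact value depends on the backing buffers:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1, 2, 3],
    ///              "y" => ["a", "b", "c"])?;
    ///
    /// assert!(df.estimated_size() > 0);
    /// # Ok::<(), PolarsError>(())
    /// ```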
214 pub fn estimated_size(&self) -> usize {
215 self.columns.iter().map(Column::estimated_size).sum()
216 }
217
218 // Reduce monomorphization.
219 fn try_apply_columns(
220 &self,
221 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222 ) -> PolarsResult<Vec<Column>> {
223 self.columns.iter().map(func).collect()
224 }
225 // Reduce monomorphization.
226 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 fn try_apply_columns_par(
231 &self,
232 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233 ) -> PolarsResult<Vec<Column>> {
234 POOL.install(|| self.columns.par_iter().map(func).collect())
235 }
236 // Reduce monomorphization.
237 pub fn _apply_columns_par(
238 &self,
239 func: &(dyn Fn(&Column) -> Column + Send + Sync),
240 ) -> Vec<Column> {
241 POOL.install(|| self.columns.par_iter().map(func).collect())
242 }
243
244 /// Get the index of the column.
245 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246 self.get_column_index(name)
247 .ok_or_else(|| polars_err!(col_not_found = name))
248 }
249
250 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251 polars_ensure!(
252 self.columns.iter().all(|s| s.name().as_str() != name),
253 Duplicate: "column with name {:?} is already present in the DataFrame", name
254 );
255 Ok(())
256 }
257
258 /// Reserve additional slots into the chunks of the series.
259 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260 for s in &mut self.columns {
261 if let Column::Series(s) = s {
262 // SAFETY:
263 // do not modify the data, simply resize.
264 unsafe { s.chunks_mut().reserve(additional) }
265 }
266 }
267 }
268
269 /// Create a DataFrame from a Vector of Series.
270 ///
271 /// Errors if the column names are not unique, or if the heights are not all equal.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// # use polars_core::prelude::*;
277 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279 ///
280 /// let df = DataFrame::new(vec![s0, s1])?;
281 /// # Ok::<(), PolarsError>(())
282 /// ```
283 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284 DataFrame::validate_columns_slice(&columns)
285 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287 }
288
289 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290 for col in &columns {
291 polars_ensure!(
292 col.len() == height,
293 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294 columns[0].name(), height, col.name(), col.len()
295 );
296 }
297
298 Ok(DataFrame {
299 height,
300 columns,
301 cached_schema: OnceLock::new(),
302 })
303 }
304
305 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306 /// columns to match the other columns.
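    ///
    /// # Example
    ///
    /// A small sketch: the length-1 column is repeated to match the longer column.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let c1 = Column::new("x".into(), [1, 2, 3]);
    /// let c2 = Column::new("y".into(), [0]);
    ///
    /// let df = DataFrame::new_with_broadcast(vec![c1, c2])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```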
307 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308 // The length of the longest non-unit length column determines the
309 // broadcast length. If all columns are unit-length the broadcast length
310 // is one.
311 let broadcast_len = columns
312 .iter()
313 .map(|s| s.len())
314 .filter(|l| *l != 1)
315 .max()
316 .unwrap_or(1);
317 Self::new_with_broadcast_len(columns, broadcast_len)
318 }
319
320 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321 /// columns to broadcast_len.
322 pub fn new_with_broadcast_len(
323 columns: Vec<Column>,
324 broadcast_len: usize,
325 ) -> PolarsResult<Self> {
326 ensure_names_unique(&columns, |s| s.name().as_str())?;
327 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328 }
329
330 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331 /// columns to match the other columns.
332 ///
333 /// # Safety
334 /// Does not check that the column names are unique (which they must be).
335 pub unsafe fn new_with_broadcast_no_namecheck(
336 mut columns: Vec<Column>,
337 broadcast_len: usize,
338 ) -> PolarsResult<Self> {
339 for col in &mut columns {
340 // If the length does not equal the broadcast length, the column must be broadcast or it is an error.
341 let len = col.len();
342 if len != broadcast_len {
343 if len != 1 {
344 let name = col.name().to_owned();
345 let extra_info =
346 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347 format!(" (matching column '{}')", c.name())
348 } else {
349 String::new()
350 };
351 polars_bail!(
352 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353 );
354 }
355 *col = col.new_from_index(0, broadcast_len);
356 }
357 }
358
359 let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362 }
363
364 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
365 ///
366 /// # Example
367 ///
368 /// ```rust
369 /// use polars_core::prelude::DataFrame;
370 /// static EMPTY: DataFrame = DataFrame::empty();
371 /// ```
372 pub const fn empty() -> Self {
373 Self::empty_with_height(0)
374 }
375
376 /// Creates an empty `DataFrame` with a specific `height`.
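    ///
    /// # Example
    ///
    /// A short sketch: the frame has rows but no columns.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = DataFrame::empty_with_height(5);
    /// assert_eq!(df.shape(), (5, 0));
    /// ```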
377 pub const fn empty_with_height(height: usize) -> Self {
378 DataFrame {
379 height,
380 columns: vec![],
381 cached_schema: OnceLock::new(),
382 }
383 }
384
385 /// Create an empty `DataFrame` with empty columns as per the `schema`.
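    ///
    /// # Example
    ///
    /// A brief sketch using a schema built from [`Field`]s:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
    /// let df = DataFrame::empty_with_schema(&schema);
    ///
    /// assert_eq!(df.shape(), (0, 1));
    /// assert_eq!(df.dtypes(), &[DataType::Int64]);
    /// ```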
386 pub fn empty_with_schema(schema: &Schema) -> Self {
387 let cols = schema
388 .iter()
389 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390 .collect();
391 unsafe { DataFrame::new_no_checks(0, cols) }
392 }
393
394 /// Create an empty `DataFrame` with empty columns as per the `schema`.
395 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396 let cols = schema
397 .iter_values()
398 .map(|fld| {
399 Column::from(Series::new_empty(
400 fld.name.clone(),
401 &(DataType::from_arrow_field(fld)),
402 ))
403 })
404 .collect();
405 unsafe { DataFrame::new_no_checks(0, cols) }
406 }
407
408 /// Create a new `DataFrame` with the given schema, only containing nulls.
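    ///
    /// # Example
    ///
    /// A brief sketch: every column is filled with nulls of its dtype.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
    /// let df = DataFrame::full_null(&schema, 3);
    ///
    /// assert_eq!(df.shape(), (3, 1));
    /// assert_eq!(df.column("x")?.dtype(), &DataType::Int32);
    /// # Ok::<(), PolarsError>(())
    /// ```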
409 pub fn full_null(schema: &Schema, height: usize) -> Self {
410 let columns = schema
411 .iter_fields()
412 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413 .collect();
414 unsafe { DataFrame::new_no_checks(height, columns) }
415 }
416
417 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
418 ///
419 /// # Example
420 ///
421 /// ```rust
422 /// # use polars_core::prelude::*;
423 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
424 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
425 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
426 ///
427 /// assert_eq!(df.pop(), Some(s2));
428 /// assert_eq!(df.pop(), Some(s1));
429 /// assert_eq!(df.pop(), None);
430 /// assert!(df.is_empty());
431 /// # Ok::<(), PolarsError>(())
432 /// ```
433 pub fn pop(&mut self) -> Option<Column> {
434 self.clear_schema();
435
436 self.columns.pop()
437 }
438
439 /// Add a new column at index 0 that counts the rows.
440 ///
441 /// # Example
442 ///
443 /// ```
444 /// # use polars_core::prelude::*;
445 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446 /// assert_eq!(df1.shape(), (4, 1));
447 ///
448 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449 /// assert_eq!(df2.shape(), (4, 2));
450 /// println!("{}", df2);
451 ///
452 /// # Ok::<(), PolarsError>(())
453 /// ```
454 ///
455 /// Output:
456 ///
457 /// ```text
458 /// shape: (4, 2)
459 /// +-----+----------+
460 /// | Id | Name |
461 /// | --- | --- |
462 /// | u32 | str |
463 /// +=====+==========+
464 /// | 0 | James |
465 /// +-----+----------+
466 /// | 1 | Mary |
467 /// +-----+----------+
468 /// | 2 | John |
469 /// +-----+----------+
470 /// | 3 | Patricia |
471 /// +-----+----------+
472 /// ```
473 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474 let mut columns = Vec::with_capacity(self.columns.len() + 1);
475 let offset = offset.unwrap_or(0);
476
477 let col = Column::new_row_index(name, offset, self.height())?;
478 columns.push(col);
479 columns.extend_from_slice(&self.columns);
480 DataFrame::new(columns)
481 }
482
483 /// Add a row index column in place.
484 ///
485 /// # Safety
486 /// The caller should ensure the DataFrame does not already contain a column with the given name.
487 ///
488 /// # Panics
489 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
490 pub unsafe fn with_row_index_mut(
491 &mut self,
492 name: PlSmallStr,
493 offset: Option<IdxSize>,
494 ) -> &mut Self {
496 debug_assert!(
497 self.columns.iter().all(|c| c.name() != &name),
498 "with_row_index_mut(): column with name {} already exists",
499 &name
500 );
501
502 let offset = offset.unwrap_or(0);
503 let col = Column::new_row_index(name, offset, self.height()).unwrap();
504
505 self.clear_schema();
506 self.columns.insert(0, col);
507 self
508 }
509
510 /// Create a new `DataFrame` without checking the lengths of the `Series` or for duplicate
511 /// names.
512 ///
513 /// Calculates the height from the first column or `0` if no columns are given.
514 ///
515 /// # Safety
516 ///
517 /// It is the caller's responsibility to uphold the contract that all `Series`
518 /// have an equal length and a unique name; if not, this may panic down the line.
519 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
520 let height = columns.first().map_or(0, Column::len);
521 unsafe { Self::new_no_checks(height, columns) }
522 }
523
524 /// Create a new `DataFrame` without checking the lengths of the `Series` or for duplicate
525 /// names.
526 ///
527 /// It is advised to use [DataFrame::new] in favor of this method.
528 ///
529 /// # Safety
530 ///
531 /// It is the caller's responsibility to uphold the contract that all `Series`
532 /// have an equal length and a unique name; if not, this may panic down the line.
533 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
534 if cfg!(debug_assertions) {
535 DataFrame::validate_columns_slice(&columns).unwrap();
536 }
537
538 unsafe { Self::_new_no_checks_impl(height, columns) }
539 }
540
541 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
542 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
543 /// constructed with this method is generally highly unsafe and should not be long-lived.
544 #[allow(clippy::missing_safety_doc)]
545 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
546 DataFrame {
547 height,
548 columns,
549 cached_schema: OnceLock::new(),
550 }
551 }
552
553 /// Shrink the capacity of this DataFrame to fit its length.
554 pub fn shrink_to_fit(&mut self) {
555 // Don't parallelize this. Memory overhead
556 for s in &mut self.columns {
557 s.shrink_to_fit();
558 }
559 }
560
561 /// Aggregate all the chunks in the DataFrame to a single chunk.
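    ///
    /// # Example
    ///
    /// A small sketch; `vstack` first creates a second chunk:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df1 = df!("x" => [1, 2])?;
    /// let df2 = df!("x" => [3, 4])?;
    ///
    /// let mut df = df1.vstack(&df2)?;
    /// assert_eq!(df.first_col_n_chunks(), 2);
    ///
    /// df.as_single_chunk();
    /// assert_eq!(df.first_col_n_chunks(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```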
562 pub fn as_single_chunk(&mut self) -> &mut Self {
563 // Don't parallelize this. Memory overhead
564 for s in &mut self.columns {
565 *s = s.rechunk();
566 }
567 self
568 }
569
570 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
571 /// This may lead to more peak memory consumption.
572 pub fn as_single_chunk_par(&mut self) -> &mut Self {
573 if self.columns.iter().any(|c| c.n_chunks() > 1) {
574 self.columns = self._apply_columns_par(&|s| s.rechunk());
575 }
576 self
577 }
578
579 /// Rechunks all columns to only have a single chunk.
580 pub fn rechunk_mut(&mut self) {
581 // SAFETY: We never adjust the length or names of the columns.
582 let columns = unsafe { self.get_columns_mut() };
583
584 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
585 *col = col.rechunk();
586 }
587 }
588
589 pub fn _deshare_views_mut(&mut self) {
590 // SAFETY: We never adjust the length or names of the columns.
591 unsafe {
592 let columns = self.get_columns_mut();
593 for col in columns {
594 let Column::Series(s) = col else { continue };
595
596 if let Ok(ca) = s.binary() {
597 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
598 *col = Column::from(gc_ca.into_series());
599 } else if let Ok(ca) = s.str() {
600 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
601 *col = Column::from(gc_ca.into_series());
602 }
603 }
604 }
605 }
606
607 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
608 pub fn rechunk_to_record_batch(
609 self,
610 compat_level: CompatLevel,
611 ) -> RecordBatchT<Box<dyn Array>> {
612 let height = self.height();
613
614 let (schema, arrays) = self
615 .columns
616 .into_iter()
617 .map(|col| {
618 let mut series = col.take_materialized_series();
619 // Rechunk to one chunk if necessary
620 if series.n_chunks() > 1 {
621 series = series.rechunk();
622 }
623 (
624 series.field().to_arrow(compat_level),
625 series.to_arrow(0, compat_level),
626 )
627 })
628 .collect();
629
630 RecordBatchT::new(height, Arc::new(schema), arrays)
631 }
632
633 /// Returns true if the chunks of the columns do not align and re-chunking should be done
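    ///
    /// # Example
    ///
    /// A sketch where one column ends up with more chunks than another:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2])?;
    /// df.vstack_mut(&df!("x" => [3, 4])?)?;
    /// // A single column cannot be misaligned with itself.
    /// assert!(!df.should_rechunk());
    ///
    /// // "x" has two chunks, while the new "y" has one.
    /// let df = df.hstack(&[Column::new("y".into(), [1, 2, 3, 4])])?;
    /// assert!(df.should_rechunk());
    /// # Ok::<(), PolarsError>(())
    /// ```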
634 pub fn should_rechunk(&self) -> bool {
635 // Fast check. It is also needed for correctness, as code below doesn't check if the number
636 // of chunks is equal.
637 if !self
638 .get_columns()
639 .iter()
640 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
641 .all_equal()
642 {
643 return true;
644 }
645
646 // From here we check chunk lengths.
647 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
648 match chunk_lengths.next() {
649 None => false,
650 Some(first_column_chunk_lengths) => {
651 // Fast Path for single Chunk Series
652 if first_column_chunk_lengths.size_hint().0 == 1 {
653 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
654 }
655 // Always rechunk if we have more chunks than rows.
656 // except when we have an empty df containing a single chunk
657 let height = self.height();
658 let n_chunks = first_column_chunk_lengths.size_hint().0;
659 if n_chunks > height && !(height == 0 && n_chunks == 1) {
660 return true;
661 }
662 // Slow Path for multi Chunk series
663 let v: Vec<_> = first_column_chunk_lengths.collect();
664 for cl in chunk_lengths {
665 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
666 return true;
667 }
668 }
669 false
670 },
671 }
672 }
673
674 /// Ensure all the chunks in the [`DataFrame`] are aligned.
675 pub fn align_chunks_par(&mut self) -> &mut Self {
676 if self.should_rechunk() {
677 self.as_single_chunk_par()
678 } else {
679 self
680 }
681 }
682
683 pub fn align_chunks(&mut self) -> &mut Self {
684 if self.should_rechunk() {
685 self.as_single_chunk()
686 } else {
687 self
688 }
689 }
690
691 /// Get the [`DataFrame`] schema.
692 ///
693 /// # Example
694 ///
695 /// ```rust
696 /// # use polars_core::prelude::*;
697 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
698 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
699 ///
700 /// let f1: Field = Field::new("Thing".into(), DataType::String);
701 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
702 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
703 ///
704 /// assert_eq!(&**df.schema(), &sc);
705 /// # Ok::<(), PolarsError>(())
706 /// ```
707 pub fn schema(&self) -> &SchemaRef {
708 let out = self.cached_schema.get_or_init(|| {
709 Arc::new(
710 self.columns
711 .iter()
712 .map(|x| (x.name().clone(), x.dtype().clone()))
713 .collect(),
714 )
715 });
716
717 debug_assert_eq!(out.len(), self.width());
718
719 out
720 }
721
722 /// Get a reference to the [`DataFrame`] columns.
723 ///
724 /// # Example
725 ///
726 /// ```rust
727 /// # use polars_core::prelude::*;
728 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
729 /// "Symbol" => ["A", "C", "G", "T"])?;
730 /// let columns: &[Column] = df.get_columns();
731 ///
732 /// assert_eq!(columns[0].name(), "Name");
733 /// assert_eq!(columns[1].name(), "Symbol");
734 /// # Ok::<(), PolarsError>(())
735 /// ```
736 #[inline]
737 pub fn get_columns(&self) -> &[Column] {
738 &self.columns
739 }
740
741 #[inline]
742 /// Get mutable access to the underlying columns.
743 ///
744 /// # Safety
745 ///
746 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
747 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
748 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
749 /// calling [`DataFrame::clear_schema`].
750 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
751 &mut self.columns
752 }
753
754 #[inline]
755 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
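    ///
    /// # Example
    ///
    /// A short sketch; the height is preserved while all columns are removed.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2, 3])?;
    /// df.clear_columns();
    ///
    /// assert_eq!(df.shape(), (3, 0));
    /// # Ok::<(), PolarsError>(())
    /// ```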
756 pub fn clear_columns(&mut self) {
757 unsafe { self.get_columns_mut() }.clear();
758 self.clear_schema();
759 }
760
761 #[inline]
762 /// Extend the columns without checking for name collisions or height.
763 ///
764 /// # Safety
765 ///
766 /// The caller needs to ensure that:
767 /// - Column names are unique within the resulting [`DataFrame`].
768 /// - The length of each appended column matches the height of the [`DataFrame`]. For
769 ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
770 /// with [`DataFrame::set_height`].
771 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
772 unsafe { self.get_columns_mut() }.extend(iter);
773 self.clear_schema();
774 }
775
776 /// Take ownership of the underlying columns vec.
777 pub fn take_columns(self) -> Vec<Column> {
778 self.columns
779 }
780
781 /// Iterator over the columns as [`Series`].
782 ///
783 /// # Example
784 ///
785 /// ```rust
786 /// # use polars_core::prelude::*;
787 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
788 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
789 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
790 ///
791 /// let mut iterator = df.iter();
792 ///
793 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
794 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
795 /// assert_eq!(iterator.next(), None);
796 /// # Ok::<(), PolarsError>(())
797 /// ```
798 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
799 self.materialized_column_iter()
800 }
801
802 /// # Example
803 ///
804 /// ```rust
805 /// # use polars_core::prelude::*;
806 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
807 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
808 ///
809 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
810 /// # Ok::<(), PolarsError>(())
811 /// ```
812 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
813 self.columns.iter().map(|s| s.name()).collect()
814 }
815
816 /// Get the [`Vec<PlSmallStr>`] representing the column names.
817 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
818 self.columns.iter().map(|s| s.name().clone()).collect()
819 }
820
821 pub fn get_column_names_str(&self) -> Vec<&str> {
822 self.columns.iter().map(|s| s.name().as_str()).collect()
823 }
824
825 /// Set the column names.
826 /// # Example
827 ///
828 /// ```rust
829 /// # use polars_core::prelude::*;
830 /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
831 /// df.set_column_names(["Set"])?;
832 ///
833 /// assert_eq!(df.get_column_names(), &["Set"]);
834 /// # Ok::<(), PolarsError>(())
835 /// ```
836 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
837 where
838 I: IntoIterator<Item = S>,
839 S: Into<PlSmallStr>,
840 {
841 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
842 self._set_column_names_impl(names.as_slice())
843 }
844
845 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
846 polars_ensure!(
847 names.len() == self.width(),
848 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
849 names.len(), self.width()
850 );
851 ensure_names_unique(names, |s| s.as_str())?;
852
853 let columns = mem::take(&mut self.columns);
854 self.columns = columns
855 .into_iter()
856 .zip(names)
857 .map(|(s, name)| {
858 let mut s = s;
859 s.rename(name.clone());
860 s
861 })
862 .collect();
863 self.clear_schema();
864 Ok(())
865 }
866
867 /// Get the data types of the columns in the [`DataFrame`].
868 ///
869 /// # Example
870 ///
871 /// ```rust
872 /// # use polars_core::prelude::*;
873 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
874 /// "Fraction" => [0.965, 0.035])?;
875 ///
876 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
877 /// # Ok::<(), PolarsError>(())
878 /// ```
879 pub fn dtypes(&self) -> Vec<DataType> {
880 self.columns.iter().map(|s| s.dtype().clone()).collect()
881 }
882
883 pub(crate) fn first_series_column(&self) -> Option<&Series> {
884 self.columns.iter().find_map(|col| col.as_series())
885 }
886
887 /// The number of chunks for the first column.
888 pub fn first_col_n_chunks(&self) -> usize {
889 match self.first_series_column() {
890 None if self.columns.is_empty() => 0,
891 None => 1,
892 Some(s) => s.n_chunks(),
893 }
894 }
895
896 /// The highest number of chunks for any column.
897 pub fn max_n_chunks(&self) -> usize {
898 self.columns
899 .iter()
900 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
901 .max()
902 .unwrap_or(0)
903 }
904
905 /// Get a reference to the schema fields of the [`DataFrame`].
906 ///
907 /// # Example
908 ///
909 /// ```rust
910 /// # use polars_core::prelude::*;
911 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
912 /// "Fraction" => [0.708, 0.292])?;
913 ///
914 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
915 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
916 ///
917 /// assert_eq!(earth.fields(), &[f1, f2]);
918 /// # Ok::<(), PolarsError>(())
919 /// ```
920 pub fn fields(&self) -> Vec<Field> {
921 self.columns
922 .iter()
923 .map(|s| s.field().into_owned())
924 .collect()
925 }
926
927 /// Get (height, width) of the [`DataFrame`].
928 ///
929 /// # Example
930 ///
931 /// ```rust
932 /// # use polars_core::prelude::*;
933 /// let df0: DataFrame = DataFrame::default();
934 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
935 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
936 /// "2" => [1, 2, 3, 4, 5])?;
937 ///
938 /// assert_eq!(df0.shape(), (0 ,0));
939 /// assert_eq!(df1.shape(), (5, 1));
940 /// assert_eq!(df2.shape(), (5, 2));
941 /// # Ok::<(), PolarsError>(())
942 /// ```
943 pub fn shape(&self) -> (usize, usize) {
944 (self.height, self.columns.len())
945 }
946
947 /// Get the width of the [`DataFrame`] which is the number of columns.
948 ///
949 /// # Example
950 ///
951 /// ```rust
952 /// # use polars_core::prelude::*;
953 /// let df0: DataFrame = DataFrame::default();
954 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
955 /// let df2: DataFrame = df!("Series 1" => [0; 0],
956 /// "Series 2" => [0; 0])?;
957 ///
958 /// assert_eq!(df0.width(), 0);
959 /// assert_eq!(df1.width(), 1);
960 /// assert_eq!(df2.width(), 2);
961 /// # Ok::<(), PolarsError>(())
962 /// ```
963 pub fn width(&self) -> usize {
964 self.columns.len()
965 }
966
967 /// Get the height of the [`DataFrame`] which is the number of rows.
968 ///
969 /// # Example
970 ///
971 /// ```rust
972 /// # use polars_core::prelude::*;
973 /// let df0: DataFrame = DataFrame::default();
974 /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
975 /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
976 ///
977 /// assert_eq!(df0.height(), 0);
978 /// assert_eq!(df1.height(), 2);
979 /// assert_eq!(df2.height(), 5);
980 /// # Ok::<(), PolarsError>(())
981 /// ```
982 pub fn height(&self) -> usize {
983 self.height
984 }
985
986 /// Returns the size as number of rows * number of columns
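    ///
    /// # Example
    ///
    /// A short sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1, 2, 3],
    ///              "y" => [4, 5, 6])?;
    ///
    /// // 3 rows * 2 columns
    /// assert_eq!(df.size(), 6);
    /// # Ok::<(), PolarsError>(())
    /// ```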
987 pub fn size(&self) -> usize {
988 let s = self.shape();
989 s.0 * s.1
990 }
991
992 /// Returns `true` if the [`DataFrame`] contains no rows.
993 ///
994 /// # Example
995 ///
996 /// ```rust
997 /// # use polars_core::prelude::*;
998 /// let df1: DataFrame = DataFrame::default();
999 /// assert!(df1.is_empty());
1000 ///
1001 /// let df2: DataFrame = df!("First name" => ["Forever"],
1002 /// "Last name" => ["Alone"])?;
1003 /// assert!(!df2.is_empty());
1004 /// # Ok::<(), PolarsError>(())
1005 /// ```
1006 pub fn is_empty(&self) -> bool {
1007 matches!(self.shape(), (0, _) | (_, 0))
1008 }
1009
1010 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1011 ///
1012 /// # Safety
1013 ///
1014 /// This needs to be equal to the length of all the columns.
1015 pub unsafe fn set_height(&mut self, height: usize) {
1016 self.height = height;
1017 }
1018
1019 /// Add multiple [`Series`] to a [`DataFrame`].
1020 /// The added columns are required to have the same length as the `DataFrame`.
1021 ///
1022 /// # Example
1023 ///
1024 /// ```rust
1025 /// # use polars_core::prelude::*;
1026 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1027 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1028 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1029 ///
1030 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1031 /// assert_eq!(df2.shape(), (3, 3));
1032 /// println!("{}", df2);
1033 /// # Ok::<(), PolarsError>(())
1034 /// ```
1035 ///
1036 /// Output:
1037 ///
1038 /// ```text
1039 /// shape: (3, 3)
1040 /// +---------+--------+----------+
1041 /// | Element | Proton | Electron |
1042 /// | --- | --- | --- |
1043 /// | str | i32 | i32 |
1044 /// +=========+========+==========+
1045 /// | Copper | 29 | 29 |
1046 /// +---------+--------+----------+
1047 /// | Silver | 47 | 47 |
1048 /// +---------+--------+----------+
1049 /// | Gold | 79 | 79 |
1050 /// +---------+--------+----------+
1051 /// ```
1052 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1053 let mut new_cols = self.columns.clone();
1054 new_cols.extend_from_slice(columns);
1055 DataFrame::new(new_cols)
1056 }
1057
1058 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1059 ///
1060 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1061 ///
1062 /// # Example
1063 ///
1064 /// ```rust
1065 /// # use polars_core::prelude::*;
1066 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1067 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1068 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1069 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1070 ///
1071 /// let df3: DataFrame = df1.vstack(&df2)?;
1072 ///
1073 /// assert_eq!(df3.shape(), (5, 2));
1074 /// println!("{}", df3);
1075 /// # Ok::<(), PolarsError>(())
1076 /// ```
1077 ///
1078 /// Output:
1079 ///
1080 /// ```text
1081 /// shape: (5, 2)
1082 /// +-----------+-------------------+
1083 /// | Element | Melting Point (K) |
1084 /// | --- | --- |
1085 /// | str | f64 |
1086 /// +===========+===================+
1087 /// | Copper | 1357.77 |
1088 /// +-----------+-------------------+
1089 /// | Silver | 1234.93 |
1090 /// +-----------+-------------------+
1091 /// | Gold | 1337.33 |
1092 /// +-----------+-------------------+
1093 /// | Platinum | 2041.4 |
1094 /// +-----------+-------------------+
1095 /// | Palladium | 1828.05 |
1096 /// +-----------+-------------------+
1097 /// ```
1098 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1099 let mut df = self.clone();
1100 df.vstack_mut(other)?;
1101 Ok(df)
1102 }
1103
1104 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1105 ///
1106 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1107 ///
1108 /// # Example
1109 ///
1110 /// ```rust
1111 /// # use polars_core::prelude::*;
1112 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1113 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1114 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1115 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1116 ///
1117 /// df1.vstack_mut(&df2)?;
1118 ///
1119 /// assert_eq!(df1.shape(), (5, 2));
1120 /// println!("{}", df1);
1121 /// # Ok::<(), PolarsError>(())
1122 /// ```
1123 ///
1124 /// Output:
1125 ///
1126 /// ```text
1127 /// shape: (5, 2)
1128 /// +-----------+-------------------+
1129 /// | Element | Melting Point (K) |
1130 /// | --- | --- |
1131 /// | str | f64 |
1132 /// +===========+===================+
1133 /// | Copper | 1357.77 |
1134 /// +-----------+-------------------+
1135 /// | Silver | 1234.93 |
1136 /// +-----------+-------------------+
1137 /// | Gold | 1337.33 |
1138 /// +-----------+-------------------+
1139 /// | Platinum | 2041.4 |
1140 /// +-----------+-------------------+
1141 /// | Palladium | 1828.05 |
1142 /// +-----------+-------------------+
1143 /// ```
1144 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1145 if self.width() != other.width() {
1146 polars_ensure!(
1147 self.width() == 0,
1148 ShapeMismatch:
1149 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1150 self.width(), other.width(),
1151 );
1152 self.columns.clone_from(&other.columns);
1153 self.height = other.height;
1154 return Ok(self);
1155 }
1156
1157 self.columns
1158 .iter_mut()
1159 .zip(other.columns.iter())
1160 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1161 ensure_can_extend(&*left, right)?;
1162 left.append(right).map_err(|e| {
1163 e.context(format!("failed to vstack column '{}'", right.name()).into())
1164 })?;
1165 Ok(())
1166 })?;
1167 self.height += other.height;
1168 Ok(self)
1169 }
1170
1171 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1172 if self.width() != other.width() {
1173 polars_ensure!(
1174 self.width() == 0,
1175 ShapeMismatch:
1176 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1177 self.width(), other.width(),
1178 );
1179 self.columns = other.columns;
1180 self.height = other.height;
1181 return Ok(self);
1182 }
1183
1184 self.columns
1185 .iter_mut()
1186 .zip(other.columns.into_iter())
1187 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1188 ensure_can_extend(&*left, &right)?;
1189 let right_name = right.name().clone();
1190 left.append_owned(right).map_err(|e| {
1191 e.context(format!("failed to vstack column '{right_name}'").into())
1192 })?;
1193 Ok(())
1194 })?;
1195 self.height += other.height;
1196 Ok(self)
1197 }
1198
1199 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1200 ///
1201 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1202 ///
1203 /// # Panics
1204 /// Panics if the schemas don't match.
1205 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1206 self.columns
1207 .iter_mut()
1208 .zip(other.columns.iter())
1209 .for_each(|(left, right)| {
1210 left.append(right)
1211 .map_err(|e| {
1212 e.context(format!("failed to vstack column '{}'", right.name()).into())
1213 })
1214 .expect("should not fail");
1215 });
1216 self.height += other.height;
1217 }
1218
1219 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1220 ///
1221 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1222 ///
1223 /// # Panics
1224 /// Panics if the schemas don't match.
1225 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1226 self.columns
1227 .iter_mut()
1228 .zip(other.columns)
1229 .for_each(|(left, right)| {
1230 left.append_owned(right).expect("should not fail");
1231 });
1232 self.height += other.height;
1233 }
1234
1235 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1236 ///
1237 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1238 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1239 ///
1240 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1241 /// and thus will yield faster queries.
1242 ///
1243 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1244 /// online operations where you add `n` rows and rerun a query.
1245 ///
1246 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1247 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1248 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
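    ///
    /// # Example
    ///
    /// A small sketch; contrast this with the `vstack` example above:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df1 = df!("Element" => ["Copper", "Silver"],
    ///                   "Melting Point (K)" => [1357.77, 1234.93])?;
    /// let df2 = df!("Element" => ["Gold"],
    ///               "Melting Point (K)" => [1337.33])?;
    ///
    /// df1.extend(&df2)?;
    /// assert_eq!(df1.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```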
1249 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1250 polars_ensure!(
1251 self.width() == other.width(),
1252 ShapeMismatch:
1253 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1254 self.width(), other.width(),
1255 );
1256
1257 self.columns
1258 .iter_mut()
1259 .zip(other.columns.iter())
1260 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1261 ensure_can_extend(&*left, right)?;
1262 left.extend(right).map_err(|e| {
1263 e.context(format!("failed to extend column '{}'", right.name()).into())
1264 })?;
1265 Ok(())
1266 })?;
1267 self.height += other.height;
1268 self.clear_schema();
1269 Ok(())
1270 }
1271
1272 /// Remove a column by name and return the column removed.
1273 ///
1274 /// # Example
1275 ///
1276 /// ```rust
1277 /// # use polars_core::prelude::*;
1278 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1279 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1280 ///
1281 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1282 /// assert!(s1.is_err());
1283 ///
1284 /// let s2: Column = df.drop_in_place("Animal")?;
1285 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1286 /// # Ok::<(), PolarsError>(())
1287 /// ```
1288 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1289 let idx = self.check_name_to_idx(name)?;
1290 self.clear_schema();
1291 Ok(self.columns.remove(idx))
1292 }
1293
1294 /// Return a new [`DataFrame`] where all null values are dropped.
1295 ///
1296 /// # Example
1297 ///
1298 /// ```no_run
1299 /// # use polars_core::prelude::*;
1300 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1301 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1302 /// assert_eq!(df1.shape(), (3, 2));
1303 ///
1304 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1305 /// assert_eq!(df2.shape(), (1, 2));
1306 /// println!("{}", df2);
1307 /// # Ok::<(), PolarsError>(())
1308 /// ```
1309 ///
1310 /// Output:
1311 ///
1312 /// ```text
1313 /// shape: (1, 2)
1314 /// +---------+---------------------+
1315 /// | Country | Tax revenue (% GDP) |
1316 /// | --- | --- |
1317 /// | str | f64 |
1318 /// +=========+=====================+
1319 /// | Malta | 32.7 |
1320 /// +---------+---------------------+
1321 /// ```
1322 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1323 where
1324 for<'a> &'a S: Into<PlSmallStr>,
1325 {
1326 if let Some(v) = subset {
1327 let v = self.select_columns(v)?;
1328 self._drop_nulls_impl(v.as_slice())
1329 } else {
1330 self._drop_nulls_impl(self.columns.as_slice())
1331 }
1332 }
1333
1334 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1335 // fast path for no nulls in df
1336 if subset.iter().all(|s| !s.has_nulls()) {
1337 return Ok(self.clone());
1338 }
1339
1340 let mut iter = subset.iter();
1341
1342 let mask = iter
1343 .next()
1344 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1345 let mut mask = mask.is_not_null();
1346
1347 for c in iter {
1348 mask = mask & c.is_not_null();
1349 }
1350 self.filter(&mask)
1351 }
1352
1353 /// Drop a column by name.
1354 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1355 /// the current one in place.
1356 ///
1357 /// # Example
1358 ///
1359 /// ```rust
1360 /// # use polars_core::prelude::*;
1361 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1362 /// let df2: DataFrame = df1.drop("Ray type")?;
1363 ///
1364 /// assert!(df2.is_empty());
1365 /// # Ok::<(), PolarsError>(())
1366 /// ```
1367 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1368 let idx = self.check_name_to_idx(name)?;
1369 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1370
1371 self.columns.iter().enumerate().for_each(|(i, s)| {
1372 if i != idx {
1373 new_cols.push(s.clone())
1374 }
1375 });
1376
1377 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1378 }
1379
1380 /// Drop columns that are in `names`.
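    ///
    /// # Example
    ///
    /// A short sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1, 2], "y" => [3, 4], "z" => [5, 6])?;
    /// let dropped = df.drop_many(["x", "z"]);
    ///
    /// assert_eq!(dropped.get_column_names(), &["y"]);
    /// # Ok::<(), PolarsError>(())
    /// ```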
1381 pub fn drop_many<I, S>(&self, names: I) -> Self
1382 where
1383 I: IntoIterator<Item = S>,
1384 S: Into<PlSmallStr>,
1385 {
1386 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1387 self.drop_many_amortized(&names)
1388 }
1389
1390 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1391 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1392 if names.is_empty() {
1393 return self.clone();
1394 }
1395 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1396 self.columns.iter().for_each(|s| {
1397 if !names.contains(s.name()) {
1398 new_cols.push(s.clone())
1399 }
1400 });
1401
1402 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1403 }
1404
1405 /// Insert a new column at a given index without checking for duplicates.
1406 /// This can leave the [`DataFrame`] in an invalid state.
1407 fn insert_column_no_name_check(
1408 &mut self,
1409 index: usize,
1410 column: Column,
1411 ) -> PolarsResult<&mut Self> {
1412 polars_ensure!(
1413 self.width() == 0 || column.len() == self.height(),
1414 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1415 column.len(), self.height(),
1416 );
1417
1418 if self.width() == 0 {
1419 self.height = column.len();
1420 }
1421
1422 self.columns.insert(index, column);
1423 self.clear_schema();
1424 Ok(self)
1425 }
1426
1427 /// Insert a new column at a given index.
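    ///
    /// # Example
    ///
    /// A short sketch inserting a column between two existing ones:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2], "z" => [5, 6])?;
    /// df.insert_column(1, Column::new("y".into(), [3, 4]))?;
    ///
    /// assert_eq!(df.get_column_names(), &["x", "y", "z"]);
    /// # Ok::<(), PolarsError>(())
    /// ```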
1428 pub fn insert_column<S: IntoColumn>(
1429 &mut self,
1430 index: usize,
1431 column: S,
1432 ) -> PolarsResult<&mut Self> {
1433 let column = column.into_column();
1434 self.check_already_present(column.name().as_str())?;
1435 self.insert_column_no_name_check(index, column)
1436 }
1437
1438 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1439 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1440 self.replace_column(idx, column)?;
1441 } else {
1442 if self.width() == 0 {
1443 self.height = column.len();
1444 }
1445
1446 self.columns.push(column);
1447 self.clear_schema();
1448 }
1449 Ok(())
1450 }
1451
1452 /// Add a new column to this [`DataFrame`] or replace an existing one.
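    ///
    /// # Example
    ///
    /// A short sketch; a length-1 column is broadcast to the frame's height:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2, 3])?;
    /// df.with_column(Column::new("y".into(), [0]))?;
    ///
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```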
1453 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1454 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1455 let height = df.height();
1456 if column.len() == 1 && height > 1 {
1457 column = column.new_from_index(0, height);
1458 }
1459
1460 if column.len() == height || df.get_columns().is_empty() {
1461 df.add_column_by_search(column)?;
1462 Ok(df)
1463 }
1464 // special case for literals
1465 else if height == 0 && column.len() == 1 {
1466 let s = column.clear();
1467 df.add_column_by_search(s)?;
1468 Ok(df)
1469 } else {
1470 polars_bail!(
1471 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1472 column.len(), height,
1473 );
1474 }
1475 }
1476 let column = column.into_column();
1477 inner(self, column)
1478 }
1479
1480 /// Adds a column to the [`DataFrame`] without doing any checks
1481 /// on length or duplicates.
1482 ///
1483 /// # Safety
1484 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1485 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1486 debug_assert!(self.width() == 0 || self.height() == column.len());
1487 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1488
1489 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1490 // properly for `width` == 0.
1491 if self.width() == 0 {
1492 unsafe { self.set_height(column.len()) };
1493 }
1494 unsafe { self.get_columns_mut() }.push(column);
1495 self.clear_schema();
1496
1497 self
1498 }
1499
1500 // Note: the schema can be either the input or the output schema.
1501 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1502 let name = c.name();
1503 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1504 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1505 // Given schema is output_schema and we can push.
1506 if idx == self.columns.len() {
1507 if self.width() == 0 {
1508 self.height = c.len();
1509 }
1510
1511 self.columns.push(c);
1512 self.clear_schema();
1513 }
1514 // Schema is incorrect; fall back to search.
1515 else {
1516 debug_assert!(false);
1517 self.add_column_by_search(c)?;
1518 }
1519 } else {
1520 self.replace_column(idx, c)?;
1521 }
1522 } else {
1523 if self.width() == 0 {
1524 self.height = c.len();
1525 }
1526
1527 self.columns.push(c);
1528 self.clear_schema();
1529 }
1530
1531 Ok(())
1532 }
1533
1534 // Note: the schema can be either the input or the output schema.
1535 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1536 for (i, s) in series.into_iter().enumerate() {
1537 // we need to branch here
1538 // because users can add multiple columns with the same name
1539 if i == 0 || schema.get(s.name().as_str()).is_some() {
1540 self.with_column_and_schema(s.into_column(), schema)?;
1541 } else {
1542 self.with_column(s.clone().into_column())?;
1543 }
1544 }
1545 Ok(())
1546 }
1547
1548 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1549 for (i, s) in columns.into_iter().enumerate() {
1550 // we need to branch here
1551 // because users can add multiple columns with the same name
1552 if i == 0 || schema.get(s.name().as_str()).is_some() {
1553 self.with_column_and_schema(s, schema)?;
1554 } else {
1555 self.with_column(s.clone())?;
1556 }
1557 }
1558
1559 Ok(())
1560 }
1561
1562 /// Add a new column to this [`DataFrame`] or replace an existing one.
1563 /// Uses an existing schema to amortize lookups.
1564 /// If the schema is incorrect, we will fall back to a linear search.
1565 ///
1566 /// Note: the schema can be either the input or the output schema.
1567 pub fn with_column_and_schema<C: IntoColumn>(
1568 &mut self,
1569 column: C,
1570 schema: &Schema,
1571 ) -> PolarsResult<&mut Self> {
1572 let mut column = column.into_column();
1573
1574 let height = self.height();
1575 if column.len() == 1 && height > 1 {
1576 column = column.new_from_index(0, height);
1577 }
1578
1579 if column.len() == height || self.columns.is_empty() {
1580 self.add_column_by_schema(column, schema)?;
1581 Ok(self)
1582 }
1583 // special case for literals
1584 else if height == 0 && column.len() == 1 {
1585 let s = column.clear();
1586 self.add_column_by_schema(s, schema)?;
1587 Ok(self)
1588 } else {
1589 polars_bail!(
1590 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1591 column.len(), height,
1592 );
1593 }
1594 }
1595
1596 /// Get a row in the [`DataFrame`]. Beware this is slow.
1597 ///
1598 /// # Example
1599 ///
1600 /// ```
1601 /// # use polars_core::prelude::*;
1602 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1603 /// df.get(idx)
1604 /// }
1605 /// ```
1606 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1607 match self.columns.first() {
1608 Some(s) => {
1609 if s.len() <= idx {
1610 return None;
1611 }
1612 },
1613 None => return None,
1614 }
1615 // SAFETY: we just checked bounds
1616 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1617 }
1618
1619 /// Select a [`Series`] by index.
1620 ///
1621 /// # Example
1622 ///
1623 /// ```rust
1624 /// # use polars_core::prelude::*;
1625 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1626 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1627 ///
1628 /// let s1: Option<&Column> = df.select_at_idx(0);
1629 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1630 ///
1631 /// assert_eq!(s1, Some(&s2));
1632 /// # Ok::<(), PolarsError>(())
1633 /// ```
1634 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1635 self.columns.get(idx)
1636 }
1637
1638 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1639 ///
1640 /// # Examples
1641 ///
1642 /// ```rust
1643 /// # use polars_core::prelude::*;
1644 /// let df = df! {
1645 /// "0" => [0, 0, 0],
1646 /// "1" => [1, 1, 1],
1647 /// "2" => [2, 2, 2]
1648 /// }?;
1649 ///
1650 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1651 /// assert!(df.equals(&df.select_by_range(..)?));
1652 /// # Ok::<(), PolarsError>(())
1653 /// ```
1654 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1655 where
1656 R: ops::RangeBounds<usize>,
1657 {
1658 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1659 // because it is a nightly-only feature. We should switch to it once it is stabilized.
1660 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1661 where
1662 R: ops::RangeBounds<usize>,
1663 {
1664 let len = bounds.end;
1665
1666 let start: ops::Bound<&usize> = range.start_bound();
1667 let start = match start {
1668 ops::Bound::Included(&start) => start,
1669 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1670 panic!("attempted to index slice from after maximum usize");
1671 }),
1672 ops::Bound::Unbounded => 0,
1673 };
1674
1675 let end: ops::Bound<&usize> = range.end_bound();
1676 let end = match end {
1677 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1678 panic!("attempted to index slice up to maximum usize");
1679 }),
1680 ops::Bound::Excluded(&end) => end,
1681 ops::Bound::Unbounded => len,
1682 };
1683
1684 if start > end {
1685 panic!("slice index starts at {start} but ends at {end}");
1686 }
1687 if end > len {
1688 panic!("range end index {end} out of range for slice of length {len}",);
1689 }
1690
1691 ops::Range { start, end }
1692 }
1693
1694 let colnames = self.get_column_names_owned();
1695 let range = get_range(range, ..colnames.len());
1696
1697 self._select_impl(&colnames[range])
1698 }
1699
1700 /// Get column index of a [`Series`] by name.
1701 /// # Example
1702 ///
1703 /// ```rust
1704 /// # use polars_core::prelude::*;
1705 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1706 /// "Health" => [100, 200, 500],
1707 /// "Mana" => [250, 100, 0],
1708 /// "Strength" => [30, 150, 300])?;
1709 ///
1710 /// assert_eq!(df.get_column_index("Name"), Some(0));
1711 /// assert_eq!(df.get_column_index("Health"), Some(1));
1712 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1713 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1714 /// assert_eq!(df.get_column_index("Haste"), None);
1715 /// # Ok::<(), PolarsError>(())
1716 /// ```
1717 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1718 let schema = self.schema();
1719 if let Some(idx) = schema.index_of(name) {
1720 if self
1721 .get_columns()
1722 .get(idx)
1723 .is_some_and(|c| c.name() == name)
1724 {
1725 return Some(idx);
1726 }
1727 }
1728
1729 self.columns.iter().position(|s| s.name().as_str() == name)
1730 }
1731
1732 /// Get column index of a [`Series`] by name.
1733 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1734 self.get_column_index(name)
1735 .ok_or_else(|| polars_err!(col_not_found = name))
1736 }
1737
1738 /// Select a single column by name.
1739 ///
1740 /// # Example
1741 ///
1742 /// ```rust
1743 /// # use polars_core::prelude::*;
1744 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1745 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1746 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1747 ///
1748 /// assert_eq!(df.column("Password")?, &s1);
1749 /// # Ok::<(), PolarsError>(())
1750 /// ```
1751 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1752 let idx = self.try_get_column_index(name)?;
1753 Ok(self.select_at_idx(idx).unwrap())
1754 }
1755
1756 /// Select multiple columns by name.
1757 ///
1758 /// # Example
1759 ///
1760 /// ```rust
1761 /// # use polars_core::prelude::*;
1762 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1763 /// "Max weight (kg)" => [16.0, 35.89])?;
1764 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1765 ///
1766 /// assert_eq!(&df[0], sv[0]);
1767 /// assert_eq!(&df[1], sv[1]);
1768 /// # Ok::<(), PolarsError>(())
1769 /// ```
1770 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1771 where
1772 I: IntoIterator<Item = S>,
1773 S: AsRef<str>,
1774 {
1775 names
1776 .into_iter()
1777 .map(|name| self.column(name.as_ref()))
1778 .collect()
1779 }
1780
1781 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1782 ///
1783 /// # Examples
1784 ///
1785 /// ```
1786 /// # use polars_core::prelude::*;
1787 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1788 /// df.select(["foo", "bar"])
1789 /// }
1790 /// ```
1791 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1792 where
1793 I: IntoIterator<Item = S>,
1794 S: Into<PlSmallStr>,
1795 {
1796 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1797 self._select_impl(cols.as_slice())
1798 }
1799
1800 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1801 ensure_names_unique(cols, |s| s.as_str())?;
1802 self._select_impl_unchecked(cols)
1803 }
1804
1805 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1806 let selected = self.select_columns_impl(cols)?;
1807 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1808 }
1809
1810 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1811 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1812 where
1813 I: IntoIterator<Item = S>,
1814 S: Into<PlSmallStr>,
1815 {
1816 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1817 self._select_with_schema_impl(&cols, schema, true)
1818 }
1819
1820 /// Select with a known schema without checking for duplicates in `selection`.
1821 /// The schema names must match the column names of this DataFrame.
1822 pub fn select_with_schema_unchecked<I, S>(
1823 &self,
1824 selection: I,
1825 schema: &Schema,
1826 ) -> PolarsResult<Self>
1827 where
1828 I: IntoIterator<Item = S>,
1829 S: Into<PlSmallStr>,
1830 {
1831 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1832 self._select_with_schema_impl(&cols, schema, false)
1833 }
1834
1835 /// The schema names must match the column names of this DataFrame.
1836 pub fn _select_with_schema_impl(
1837 &self,
1838 cols: &[PlSmallStr],
1839 schema: &Schema,
1840 check_duplicates: bool,
1841 ) -> PolarsResult<Self> {
1842 if check_duplicates {
1843 ensure_names_unique(cols, |s| s.as_str())?;
1844 }
1845
1846 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1847 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1848 }
1849
1850 /// A non-generic implementation to reduce compiler bloat.
1851 fn select_columns_impl_with_schema(
1852 &self,
1853 cols: &[PlSmallStr],
1854 schema: &Schema,
1855 ) -> PolarsResult<Vec<Column>> {
1856 if cfg!(debug_assertions) {
1857 ensure_matching_schema_names(schema, self.schema())?;
1858 }
1859
1860 cols.iter()
1861 .map(|name| {
1862 let index = schema.try_get_full(name.as_str())?.0;
1863 Ok(self.columns[index].clone())
1864 })
1865 .collect()
1866 }
1867
1868 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1869 where
1870 I: IntoIterator<Item = S>,
1871 S: Into<PlSmallStr>,
1872 {
1873 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1874 self.select_physical_impl(&cols)
1875 }
1876
1877 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1878 ensure_names_unique(cols, |s| s.as_str())?;
1879 let selected = self.select_columns_physical_impl(cols)?;
1880 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1881 }
1882
1883 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1884 ///
1885 /// # Example
1886 ///
1887 /// ```rust
1888 /// # use polars_core::prelude::*;
1889 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1890 /// "Carbon" => [1, 2, 3],
1891 /// "Hydrogen" => [4, 6, 8])?;
1892 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1893 ///
1894 /// assert_eq!(df["Carbon"], sv[0]);
1895 /// assert_eq!(df["Hydrogen"], sv[1]);
1896 /// # Ok::<(), PolarsError>(())
1897 /// ```
1898 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1899 let cols = selection.into_vec();
1900 self.select_columns_impl(&cols)
1901 }
1902
1903 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1904 self.columns
1905 .iter()
1906 .enumerate()
1907 .map(|(i, s)| (s.name().as_str(), i))
1908 .collect()
1909 }
1910
1911 /// A non-generic implementation to reduce compiler bloat.
1912 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1913 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1914 let name_to_idx = self._names_to_idx_map();
1915 cols.iter()
1916 .map(|name| {
1917 let idx = *name_to_idx
1918 .get(name.as_str())
1919 .ok_or_else(|| polars_err!(col_not_found = name))?;
1920 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1921 })
1922 .collect::<PolarsResult<Vec<_>>>()?
1923 } else {
1924 cols.iter()
1925 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1926 .collect::<PolarsResult<Vec<_>>>()?
1927 };
1928
1929 Ok(selected)
1930 }
1931
1932 /// A non-generic implementation to reduce compiler bloat.
1933 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1934 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1935 // We hash because there are users that have millions of columns.
1936 // # https://github.com/pola-rs/polars/issues/1023
1937 let name_to_idx = self._names_to_idx_map();
1938
1939 cols.iter()
1940 .map(|name| {
1941 let idx = *name_to_idx
1942 .get(name.as_str())
1943 .ok_or_else(|| polars_err!(col_not_found = name))?;
1944 Ok(self.select_at_idx(idx).unwrap().clone())
1945 })
1946 .collect::<PolarsResult<Vec<_>>>()?
1947 } else {
1948 cols.iter()
1949 .map(|c| self.column(c.as_str()).cloned())
1950 .collect::<PolarsResult<Vec<_>>>()?
1951 };
1952
1953 Ok(selected)
1954 }
1955
1956 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1957 // If there is a filtered column, just check how many rows are left.
1958 if let Some(fst) = filtered.first() {
1959 return fst.len();
1960 }
1961
1962 // Otherwise, count the number of values that pass the filter and return that height.
1963 let num_trues = mask.num_trues();
1964 if mask.len() == self.height() {
1965 num_trues
1966 } else {
1967 // This is for broadcasting masks
1968 debug_assert!(num_trues == 0 || num_trues == 1);
1969 self.height() * num_trues
1970 }
1971 }
1972
1973 /// Take the [`DataFrame`] rows by a boolean mask.
1974 ///
1975 /// # Example
1976 ///
1977 /// ```
1978 /// # use polars_core::prelude::*;
1979 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1980 /// let mask = df.column("sepal_width")?.is_not_null();
1981 /// df.filter(&mask)
1982 /// }
1983 /// ```
1984 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1985 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1986 let height = self.filter_height(&new_col, mask);
1987
1988 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1989 }
1990
1991 /// Same as `filter` but does not parallelize.
1992 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1993 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1994 let height = self.filter_height(&new_col, mask);
1995
1996 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1997 }
1998
1999 /// Take [`DataFrame`] rows by index values.
2000 ///
2001 /// # Example
2002 ///
2003 /// ```
2004 /// # use polars_core::prelude::*;
2005 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2006 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2007 /// df.take(&idx)
2008 /// }
2009 /// ```
2010 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2011 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2012
2013 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2014 }
2015
2016 /// # Safety
2017 /// The indices must be in-bounds.
2018 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2019 self.take_unchecked_impl(idx, true)
2020 }
2021
2022 /// # Safety
2023 /// The indices must be in-bounds.
2024 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2025 let cols = if allow_threads {
2026 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2027 } else {
2028 self._apply_columns(&|s| s.take_unchecked(idx))
2029 };
2030 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2031 }
2032
2033 /// # Safety
2034 /// The indices must be in-bounds.
2035 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2036 self.take_slice_unchecked_impl(idx, true)
2037 }
2038
2039 /// # Safety
2040 /// The indices must be in-bounds.
2041 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2042 let cols = if allow_threads {
2043 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2044 } else {
2045 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2046 };
2047 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2048 }
2049
2050 /// Rename a column in the [`DataFrame`].
2051 ///
2052 /// # Example
2053 ///
2054 /// ```
2055 /// # use polars_core::prelude::*;
2056 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2057 /// let original_name = "foo";
2058 /// let new_name = "bar";
2059 /// df.rename(original_name, new_name.into())
2060 /// }
2061 /// ```
2062 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2063 if column == name.as_str() {
2064 return Ok(self);
2065 }
2066 polars_ensure!(
2067 !self.schema().contains(&name),
2068 Duplicate: "column rename attempted with already existing name \"{name}\""
2069 );
2070
2071 self.get_column_index(column)
2072 .and_then(|idx| self.columns.get_mut(idx))
2073 .ok_or_else(|| polars_err!(col_not_found = column))
2074 .map(|c| c.rename(name))?;
2075 Ok(self)
2076 }
2077
2078 /// Sort [`DataFrame`] in place.
2079 ///
2080 /// See [`DataFrame::sort`] for more instruction.
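///
/// # Example
///
/// A minimal sketch of typical usage:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [3, 1, 2])?;
/// df.sort_in_place(["a"], SortMultipleOptions::default())?;
/// assert!(df.equals(&df!("a" => [1, 2, 3])?));
/// # Ok::<(), PolarsError>(())
/// ```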
2081 pub fn sort_in_place(
2082 &mut self,
2083 by: impl IntoVec<PlSmallStr>,
2084 sort_options: SortMultipleOptions,
2085 ) -> PolarsResult<&mut Self> {
2086 let by_column = self.select_columns(by)?;
2087 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2088 Ok(self)
2089 }
2090
2091 #[doc(hidden)]
2092 /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2093 pub fn sort_impl(
2094 &self,
2095 by_column: Vec<Column>,
2096 mut sort_options: SortMultipleOptions,
2097 slice: Option<(i64, usize)>,
2098 ) -> PolarsResult<Self> {
2099 if by_column.is_empty() {
2100 // If no columns selected, any order (including original order) is correct.
2101 return if let Some((offset, len)) = slice {
2102 Ok(self.slice(offset, len))
2103 } else {
2104 Ok(self.clone())
2105 };
2106 }
2107
2108 // Note that the by_column argument may also contain evaluated expressions from
2109 // polars-lazy that are not even present in this dataframe. Therefore, when we
2110 // try to set the first column as sorted, we ignore the error, as the
2111 // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2112 let first_descending = sort_options.descending[0];
2113 let first_by_column = by_column[0].name().to_string();
2114
2115 let set_sorted = |df: &mut DataFrame| {
2116 // Mark the first sort column as sorted; if the column does not exist it
2117 // is ok, because we sorted by an expression not present in the dataframe
2118 let _ = df.apply(&first_by_column, |s| {
2119 let mut s = s.clone();
2120 if first_descending {
2121 s.set_sorted_flag(IsSorted::Descending)
2122 } else {
2123 s.set_sorted_flag(IsSorted::Ascending)
2124 }
2125 s
2126 });
2127 };
2128 if self.is_empty() {
2129 let mut out = self.clone();
2130 set_sorted(&mut out);
2131 return Ok(out);
2132 }
2133
2134 if let Some((0, k)) = slice {
2135 if k < self.len() {
2136 return self.bottom_k_impl(k, by_column, sort_options);
2137 }
2138 }
2139 // Check if the required column is already sorted; if so, we can exit early.
2140 // We only do this when there is a single column to sort by; for multiple
2141 // columns it would be complicated to do so.
2142 #[cfg(feature = "dtype-categorical")]
2143 let is_not_categorical_enum =
2144 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2145 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2146
2147 #[cfg(not(feature = "dtype-categorical"))]
2148 #[allow(non_upper_case_globals)]
2149 const is_not_categorical_enum: bool = true;
2150
2151 if by_column.len() == 1 && is_not_categorical_enum {
2152 let required_sorting = if sort_options.descending[0] {
2153 IsSorted::Descending
2154 } else {
2155 IsSorted::Ascending
2156 };
2157 // If the null count is 0, then nulls_last doesn't matter.
2158 // It is safe to get the value at the last position since the dataframe is not empty (handled above).
2159 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2160 && ((by_column[0].null_count() == 0)
2161 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2162 == sort_options.nulls_last[0]);
2163
2164 if no_sorting_required {
2165 return if let Some((offset, len)) = slice {
2166 Ok(self.slice(offset, len))
2167 } else {
2168 Ok(self.clone())
2169 };
2170 }
2171 }
2172
2173 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2174
2175 // a lot of indirection in both sorting and take
2176 let mut df = self.clone();
2177 let df = df.as_single_chunk_par();
2178 let mut take = match (by_column.len(), has_nested) {
2179 (1, false) => {
2180 let s = &by_column[0];
2181 let options = SortOptions {
2182 descending: sort_options.descending[0],
2183 nulls_last: sort_options.nulls_last[0],
2184 multithreaded: sort_options.multithreaded,
2185 maintain_order: sort_options.maintain_order,
2186 limit: sort_options.limit,
2187 };
2188 // fast path for a frame with a single series
2189 // no need to compute the sort indices and then take by these indices
2190 // simply sort and return as frame
2191 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2192 let mut out = s.sort_with(options)?;
2193 if let Some((offset, len)) = slice {
2194 out = out.slice(offset, len);
2195 }
2196 return Ok(out.into_frame());
2197 }
2198 s.arg_sort(options)
2199 },
2200 _ => {
2201 if sort_options.nulls_last.iter().all(|&x| x)
2202 || has_nested
2203 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2204 {
2205 argsort_multiple_row_fmt(
2206 &by_column,
2207 sort_options.descending,
2208 sort_options.nulls_last,
2209 sort_options.multithreaded,
2210 )?
2211 } else {
2212 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2213 first
2214 .as_materialized_series()
2215 .arg_sort_multiple(&other, &sort_options)?
2216 }
2217 },
2218 };
2219
2220 if let Some((offset, len)) = slice {
2221 take = take.slice(offset, len);
2222 }
2223
2224 // SAFETY:
2225 // the created indices are in bounds
2226 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2227 set_sorted(&mut df);
2228 Ok(df)
2229 }
2230
2231 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2232 ///
2233 /// This dataframe does not necessarily have a specified schema and may be changed at any
2234 /// point. It is primarily used for debugging.
2235 pub fn _to_metadata(&self) -> DataFrame {
2236 let num_columns = self.columns.len();
2237
2238 let mut column_names =
2239 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2240 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2241 let mut sorted_asc_ca =
2242 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2243 let mut sorted_dsc_ca =
2244 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2245 let mut fast_explode_list_ca =
2246 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2247 let mut materialized_at_ca =
2248 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2249
2250 for col in &self.columns {
2251 let flags = col.get_flags();
2252
2253 let (repr, materialized_at) = match col {
2254 Column::Series(s) => ("series", s.materialized_at()),
2255 Column::Partitioned(_) => ("partitioned", None),
2256 Column::Scalar(_) => ("scalar", None),
2257 };
2258 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2259 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2260 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2261
2262 column_names.append_value(col.name().clone());
2263 repr_ca.append_value(repr);
2264 sorted_asc_ca.append_value(sorted_asc);
2265 sorted_dsc_ca.append_value(sorted_dsc);
2266 fast_explode_list_ca.append_value(fast_explode_list);
2267 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2268 }
2269
2270 unsafe {
2271 DataFrame::new_no_checks(
2272 self.width(),
2273 vec![
2274 column_names.finish().into_column(),
2275 repr_ca.finish().into_column(),
2276 sorted_asc_ca.finish().into_column(),
2277 sorted_dsc_ca.finish().into_column(),
2278 fast_explode_list_ca.finish().into_column(),
2279 materialized_at_ca.finish().into_column(),
2280 ],
2281 )
2282 }
2283 }
2284
2285 /// Return a sorted clone of this [`DataFrame`].
2286 ///
2287 /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2288 /// # Example
2289 ///
2290 /// Sort by a single column with default options:
2291 /// ```
2292 /// # use polars_core::prelude::*;
2293 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2294 /// df.sort(["sepal_width"], Default::default())
2295 /// }
2296 /// ```
2297 /// Sort by a single column with specific order:
2298 /// ```
2299 /// # use polars_core::prelude::*;
2300 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2301 /// df.sort(
2302 /// ["sepal_width"],
2303 /// SortMultipleOptions::new()
2304 /// .with_order_descending(descending)
2305 /// )
2306 /// }
2307 /// ```
2308 /// Sort by multiple columns with specifying order for each column:
2309 /// ```
2310 /// # use polars_core::prelude::*;
2311 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2312 /// df.sort(
2313 /// ["sepal_width", "sepal_length"],
2314 /// SortMultipleOptions::new()
2315 /// .with_order_descending_multi([false, true])
2316 /// )
2317 /// }
2318 /// ```
2319 /// See [`SortMultipleOptions`] for more options.
2320 ///
2321 /// Also see [`DataFrame::sort_in_place`].
2322 pub fn sort(
2323 &self,
2324 by: impl IntoVec<PlSmallStr>,
2325 sort_options: SortMultipleOptions,
2326 ) -> PolarsResult<Self> {
2327 let mut df = self.clone();
2328 df.sort_in_place(by, sort_options)?;
2329 Ok(df)
2330 }
2331
2332 /// Replace a column with a [`Series`].
2333 ///
2334 /// # Example
2335 ///
2336 /// ```rust
2337 /// # use polars_core::prelude::*;
2338 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2339 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2340 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2341 ///
2342 /// assert!(df.replace("Nation", s.clone()).is_err());
2343 /// assert!(df.replace("Country", s).is_ok());
2344 /// # Ok::<(), PolarsError>(())
2345 /// ```
2346 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2347 self.apply(column, |_| new_col.into_series())
2348 }
2349
2350 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2351 /// is that the `column` argument determines the name of the column, not the name
2352 /// of the `Series` passed to this method.
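///
/// # Example
///
/// A minimal sketch; the passed series is renamed to the `column` argument:
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
/// df.replace_or_add("b".into(), Series::new("anything".into(), [4, 5, 6]))?;
/// assert_eq!(df.get_column_names(), &["a", "b"]);
/// # Ok::<(), PolarsError>(())
/// ```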
2353 pub fn replace_or_add<S: IntoSeries>(
2354 &mut self,
2355 column: PlSmallStr,
2356 new_col: S,
2357 ) -> PolarsResult<&mut Self> {
2358 let mut new_col = new_col.into_series();
2359 new_col.rename(column);
2360 self.with_column(new_col)
2361 }
2362
2363 /// Replace column at index `idx` with a [`Series`].
2364 ///
2365 /// # Example
2366 ///
2367 /// ```ignore
2368 /// # use polars_core::prelude::*;
2369 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2370 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2371 /// let mut df = DataFrame::new(vec![s0, s1])?;
2372 ///
2373 /// // Add 32 to get lowercase ascii values
2374 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2375 /// # Ok::<(), PolarsError>(())
2376 /// ```
2377 pub fn replace_column<C: IntoColumn>(
2378 &mut self,
2379 index: usize,
2380 new_column: C,
2381 ) -> PolarsResult<&mut Self> {
2382 polars_ensure!(
2383 index < self.width(),
2384 ShapeMismatch:
2385 "unable to replace at index {}, the DataFrame has only {} columns",
2386 index, self.width(),
2387 );
2388 let mut new_column = new_column.into_column();
2389 polars_ensure!(
2390 new_column.len() == self.height(),
2391 ShapeMismatch:
2392 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2393 new_column.len(), self.height(),
2394 );
2395 let old_col = &mut self.columns[index];
2396 mem::swap(old_col, &mut new_column);
2397 self.clear_schema();
2398 Ok(self)
2399 }
2400
2401 /// Apply a closure to a column. This is the recommended way to do in place modification.
2402 ///
2403 /// # Example
2404 ///
2405 /// ```rust
2406 /// # use polars_core::prelude::*;
2407 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2408 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2409 /// let mut df = DataFrame::new(vec![s0, s1])?;
2410 ///
2411 /// fn str_to_len(str_val: &Column) -> Column {
2412 /// str_val.str()
2413 /// .unwrap()
2414 /// .into_iter()
2415 /// .map(|opt_name: Option<&str>| {
2416 /// opt_name.map(|name: &str| name.len() as u32)
2417 /// })
2418 /// .collect::<UInt32Chunked>()
2419 /// .into_column()
2420 /// }
2421 ///
2422 /// // Replace the names column by the length of the names.
2423 /// df.apply("names", str_to_len);
2424 /// # Ok::<(), PolarsError>(())
2425 /// ```
2426 /// Results in:
2427 ///
2428 /// ```text
2429 /// +--------+-------+
2430 /// | foo | names |
2431 /// | --- | --- |
2432 /// | str | u32 |
2433 /// +========+=======+
2434 /// | "ham" | 4 |
2435 /// +--------+-------+
2436 /// | "spam" | 6 |
2437 /// +--------+-------+
2438 /// | "egg" | 3 |
2439 /// +--------+-------+
2440 /// ```
2441 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2442 where
2443 F: FnOnce(&Column) -> C,
2444 C: IntoColumn,
2445 {
2446 let idx = self.check_name_to_idx(name)?;
2447 self.apply_at_idx(idx, f)
2448 }
2449
2450 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2451 /// modification.
2452 ///
2453 /// # Example
2454 ///
2455 /// ```rust
2456 /// # use polars_core::prelude::*;
2457 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2458 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2459 /// let mut df = DataFrame::new(vec![s0, s1])?;
2460 ///
2461 /// // Add 32 to get lowercase ascii values
2462 /// df.apply_at_idx(1, |s| s + 32);
2463 /// # Ok::<(), PolarsError>(())
2464 /// ```
2465 /// Results in:
2466 ///
2467 /// ```text
2468 /// +--------+-------+
2469 /// | foo | ascii |
2470 /// | --- | --- |
2471 /// | str | i32 |
2472 /// +========+=======+
2473 /// | "ham" | 102 |
2474 /// +--------+-------+
2475 /// | "spam" | 111 |
2476 /// +--------+-------+
2477 /// | "egg" | 111 |
2478 /// +--------+-------+
2479 /// ```
2480 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2481 where
2482 F: FnOnce(&Column) -> C,
2483 C: IntoColumn,
2484 {
2485 let df_height = self.height();
2486 let width = self.width();
2487 let col = self.columns.get_mut(idx).ok_or_else(|| {
2488 polars_err!(
2489 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2490 idx, width
2491 )
2492 })?;
2493 let name = col.name().clone();
2494 let new_col = f(col).into_column();
2495 match new_col.len() {
2496 1 => {
2497 let new_col = new_col.new_from_index(0, df_height);
2498 let _ = mem::replace(col, new_col);
2499 },
2500 len if (len == df_height) => {
2501 let _ = mem::replace(col, new_col);
2502 },
2503 len => polars_bail!(
2504 ShapeMismatch:
2505 "resulting Series has length {} while the DataFrame has height {}",
2506 len, df_height
2507 ),
2508 }
2509
2510 // make sure the name remains the same after applying the closure
2511 unsafe {
2512 let col = self.columns.get_unchecked_mut(idx);
2513 col.rename(name);
2514 }
2515 Ok(self)
2516 }
2517
2518 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2519 /// modification.
2520 ///
2521 /// # Example
2522 ///
2523 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2524 ///
2525 /// ```rust
2526 /// # use polars_core::prelude::*;
2527 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2528 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2529 /// let mut df = DataFrame::new(vec![s0, s1])?;
2530 ///
2531 /// let idx = vec![0, 1, 4];
2532 ///
2533 /// df.try_apply("foo", |c| {
2534 /// c.str()?
2535 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2536 /// });
2537 /// # Ok::<(), PolarsError>(())
2538 /// ```
2539 /// Results in:
2540 ///
2541 /// ```text
2542 /// +---------------------+--------+
2543 /// | foo | values |
2544 /// | --- | --- |
2545 /// | str | i32 |
2546 /// +=====================+========+
2547 /// | "ham-is-modified" | 1 |
2548 /// +---------------------+--------+
2549 /// | "spam-is-modified" | 2 |
2550 /// +---------------------+--------+
2551 /// | "egg" | 3 |
2552 /// +---------------------+--------+
2553 /// | "bacon" | 4 |
2554 /// +---------------------+--------+
2555 /// | "quack-is-modified" | 5 |
2556 /// +---------------------+--------+
2557 /// ```
2558 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2559 where
2560 F: FnOnce(&Column) -> PolarsResult<C>,
2561 C: IntoColumn,
2562 {
2563 let width = self.width();
2564 let col = self.columns.get_mut(idx).ok_or_else(|| {
2565 polars_err!(
2566 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2567 idx, width
2568 )
2569 })?;
2570 let name = col.name().clone();
2571
2572 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2573
2574 // make sure the name remains the same after applying the closure
2575 unsafe {
2576 let col = self.columns.get_unchecked_mut(idx);
2577 col.rename(name);
2578 }
2579 Ok(self)
2580 }
2581
2582 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2583 /// modification.
2584 ///
2585 /// # Example
2586 ///
2587 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2588 ///
2589 /// ```rust
2590 /// # use polars_core::prelude::*;
2591 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2592 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2593 /// let mut df = DataFrame::new(vec![s0, s1])?;
2594 ///
2595 /// // create a mask
2596 /// let values = df.column("values")?.as_materialized_series();
2597 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2598 ///
2599 /// df.try_apply("foo", |c| {
2600 /// c.str()?
2601 /// .set(&mask, Some("not_within_bounds"))
2602 /// });
2603 /// # Ok::<(), PolarsError>(())
2604 /// ```
2605 /// Results in:
2606 ///
2607 /// ```text
2608 /// +---------------------+--------+
2609 /// | foo | values |
2610 /// | --- | --- |
2611 /// | str | i32 |
2612 /// +=====================+========+
2613 /// | "not_within_bounds" | 1 |
2614 /// +---------------------+--------+
2615 /// | "spam" | 2 |
2616 /// +---------------------+--------+
2617 /// | "egg" | 3 |
2618 /// +---------------------+--------+
2619 /// | "bacon" | 4 |
2620 /// +---------------------+--------+
2621 /// | "not_within_bounds" | 5 |
2622 /// +---------------------+--------+
2623 /// ```
2624 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2625 where
2626 F: FnOnce(&Series) -> PolarsResult<C>,
2627 C: IntoColumn,
2628 {
2629 let idx = self.try_get_column_index(column)?;
2630 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2631 }
2632
2633 /// Slice the [`DataFrame`] along the rows.
2634 ///
2635 /// # Example
2636 ///
2637 /// ```rust
2638 /// # use polars_core::prelude::*;
2639 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2640 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2641 /// let sl: DataFrame = df.slice(2, 3);
2642 ///
2643 /// assert_eq!(sl.shape(), (3, 2));
2644 /// println!("{}", sl);
2645 /// # Ok::<(), PolarsError>(())
2646 /// ```
2647 /// Output:
2648 /// ```text
2649 /// shape: (3, 2)
2650 /// +-------+-------+
2651 /// | Fruit | Color |
2652 /// | --- | --- |
2653 /// | str | str |
2654 /// +=======+=======+
2655 /// | Grape | White |
2656 /// +-------+-------+
2657 /// | Fig | White |
2658 /// +-------+-------+
2659 /// | Fig | Red |
2660 /// +-------+-------+
2661 /// ```
2662 #[must_use]
2663 pub fn slice(&self, offset: i64, length: usize) -> Self {
2664 if offset == 0 && length == self.height() {
2665 return self.clone();
2666 }
2667 if length == 0 {
2668 return self.clear();
2669 }
2670 let col = self
2671 .columns
2672 .iter()
2673 .map(|s| s.slice(offset, length))
2674 .collect::<Vec<_>>();
2675
2676 let height = if let Some(fst) = col.first() {
2677 fst.len()
2678 } else {
2679 let (_, length) = slice_offsets(offset, length, self.height());
2680 length
2681 };
2682
2683 unsafe { DataFrame::new_no_checks(height, col) }
2684 }
2685
2686 /// Split [`DataFrame`] at the given `offset`.
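///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3, 4])?;
/// let (left, right) = df.split_at(3);
/// assert_eq!(left.shape(), (3, 1));
/// assert_eq!(right.shape(), (1, 1));
/// # Ok::<(), PolarsError>(())
/// ```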
2687 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2688 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2689
2690 let (idx, _) = slice_offsets(offset, 0, self.height());
2691
2692 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2693 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2694 (a, b)
2695 }
2696
2697 pub fn clear(&self) -> Self {
2698 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2699 unsafe { DataFrame::new_no_checks(0, col) }
2700 }
2701
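/// Slice the [`DataFrame`] along the rows, slicing the columns in parallel on the thread pool.
///
/// # Example
///
/// A minimal sketch; see [`DataFrame::slice`] for the semantics of `offset` and `length`:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3, 4], "b" => [4, 3, 2, 1])?;
/// assert_eq!(df.slice_par(1, 2).shape(), (2, 2));
/// # Ok::<(), PolarsError>(())
/// ```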
2702 #[must_use]
2703 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2704 if offset == 0 && length == self.height() {
2705 return self.clone();
2706 }
2707 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2708 unsafe { DataFrame::new_no_checks(length, columns) }
2709 }
2710
2711 #[must_use]
2712 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2713 if offset == 0 && length == self.height() {
2714 return self.clone();
2715 }
2716 // @scalar-opt
2717 let columns = self._apply_columns(&|s| {
2718 let mut out = s.slice(offset, length);
2719 out.shrink_to_fit();
2720 out
2721 });
2722 unsafe { DataFrame::new_no_checks(length, columns) }
2723 }
2724
2725 /// Get the head of the [`DataFrame`].
2726 ///
2727 /// # Example
2728 ///
2729 /// ```rust
2730 /// # use polars_core::prelude::*;
2731 /// let countries: DataFrame =
2732 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2733 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2734 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2735 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2736 /// assert_eq!(countries.shape(), (5, 4));
2737 ///
2738 /// println!("{}", countries.head(Some(3)));
2739 /// # Ok::<(), PolarsError>(())
2740 /// ```
2741 ///
2742 /// Output:
2743 ///
2744 /// ```text
2745 /// shape: (3, 4)
2746 /// +--------------------+---------------+---------------+------------+
2747 /// | Rank by GDP (2021) | Continent | Country | Capital |
2748 /// | --- | --- | --- | --- |
2749 /// | i32 | str | str | str |
2750 /// +====================+===============+===============+============+
2751 /// | 1 | North America | United States | Washington |
2752 /// +--------------------+---------------+---------------+------------+
2753 /// | 2 | Asia | China | Beijing |
2754 /// +--------------------+---------------+---------------+------------+
2755 /// | 3 | Asia | Japan | Tokyo |
2756 /// +--------------------+---------------+---------------+------------+
2757 /// ```
2758 #[must_use]
2759 pub fn head(&self, length: Option<usize>) -> Self {
2760 let col = self
2761 .columns
2762 .iter()
2763 .map(|c| c.head(length))
2764 .collect::<Vec<_>>();
2765
2766 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2767 let height = usize::min(height, self.height());
2768 unsafe { DataFrame::new_no_checks(height, col) }
2769 }
2770
2771 /// Get the tail of the [`DataFrame`].
2772 ///
2773 /// # Example
2774 ///
2775 /// ```rust
2776 /// # use polars_core::prelude::*;
2777 /// let countries: DataFrame =
2778 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2779 /// "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2780 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2781 /// assert_eq!(countries.shape(), (5, 3));
2782 ///
2783 /// println!("{}", countries.tail(Some(2)));
2784 /// # Ok::<(), PolarsError>(())
2785 /// ```
2786 ///
2787 /// Output:
2788 ///
2789 /// ```text
2790 /// shape: (2, 3)
2791 /// +-------------+--------------------+---------+
2792 /// | Rank (2021) | Apple Price (€/kg) | Country |
2793 /// | --- | --- | --- |
2794 /// | i32 | f64 | str |
2795 /// +=============+====================+=========+
2796 /// | 108 | 0.65 | Syria |
2797 /// +-------------+--------------------+---------+
2798 /// | 109 | 0.52 | Turkey |
2799 /// +-------------+--------------------+---------+
2800 /// ```
2801 #[must_use]
2802 pub fn tail(&self, length: Option<usize>) -> Self {
2803 let col = self
2804 .columns
2805 .iter()
2806 .map(|c| c.tail(length))
2807 .collect::<Vec<_>>();
2808
2809 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2810 let height = usize::min(height, self.height());
2811 unsafe { DataFrame::new_no_checks(height, col) }
2812 }
2813
2814 /// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches.
2815 ///
2816 /// # Panics
2817 ///
2818 /// Panics if the [`DataFrame`] is not rechunked.
2819 ///
2820 /// This responsibility is left to the caller as we don't want to take mutable references here,
2821 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2822 /// as well.
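///
/// # Example
///
/// A minimal sketch, mirroring the record batch iterator test in this module:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("foo" => [1, 2, 3, 4, 5])?;
/// let mut iter = df.iter_chunks(CompatLevel::newest(), false);
/// assert_eq!(iter.next().unwrap().len(), 5);
/// assert!(iter.next().is_none());
/// # Ok::<(), PolarsError>(())
/// ```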
2823 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2824 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2825 // If any of the columns is binview and we must convert to an older `compat_level`, we allow
2826 // parallelism, as we must allocate and convert to arrow strings/binaries.
2827 let must_convert = compat_level.0 == 0;
2828 let parallel = parallel
2829 && must_convert
2830 && self.columns.len() > 1
2831 && self
2832 .columns
2833 .iter()
2834 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2835
2836 RecordBatchIter {
2837 columns: &self.columns,
2838 schema: Arc::new(
2839 self.columns
2840 .iter()
2841 .map(|c| c.field().to_arrow(compat_level))
2842 .collect(),
2843 ),
2844 idx: 0,
2845 n_chunks: self.first_col_n_chunks(),
2846 compat_level,
2847 parallel,
2848 }
2849 }
2850
2851 /// Iterator over the chunks in this [`DataFrame`] as Arrow RecordBatches of physical values.
2852 ///
2853 /// # Panics
2854 ///
2855 /// Panics if the [`DataFrame`] is not rechunked.
2856 ///
2857 /// This responsibility is left to the caller as we don't want to take mutable references here,
2858 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2859 /// as well.
2860 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2861 PhysRecordBatchIter {
2862 schema: Arc::new(
2863 self.get_columns()
2864 .iter()
2865 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2866 .collect(),
2867 ),
2868 arr_iters: self
2869 .materialized_column_iter()
2870 .map(|s| s.chunks().iter())
2871 .collect(),
2872 }
2873 }
2874
2875 /// Get a [`DataFrame`] with all the rows in reversed order.
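///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// assert!(df.reverse().equals(&df!("a" => [3, 2, 1])?));
/// # Ok::<(), PolarsError>(())
/// ```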
2876 #[must_use]
2877 pub fn reverse(&self) -> Self {
2878 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2879 unsafe { DataFrame::new_no_checks(self.height(), col) }
2880 }
2881
2882 /// Shift the values by a given period and fill the parts that will be empty due to this operation
2883 /// with `None` values.
2884 ///
2885 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
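///
/// # Example
///
/// A minimal sketch (the default integer dtype is assumed for both frames):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// let shifted = df.shift(1);
/// assert!(shifted.equals_missing(&df!("a" => [None, Some(1), Some(2)])?));
/// # Ok::<(), PolarsError>(())
/// ```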
2886 #[must_use]
2887 pub fn shift(&self, periods: i64) -> Self {
2888 let col = self._apply_columns_par(&|s| s.shift(periods));
2889 unsafe { DataFrame::new_no_checks(self.height(), col) }
2890 }
2891
2892 /// Replace None values with one of the following strategies:
2893 /// * Forward fill (replace None with the previous value)
2894 /// * Backward fill (replace None with the next value)
2895 /// * Mean fill (replace None with the mean of the whole array)
2896 /// * Min fill (replace None with the minimum of the whole array)
2897 /// * Max fill (replace None with the maximum of the whole array)
2898 ///
2899 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
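///
/// # Example
///
/// A minimal sketch using the mean fill strategy listed above:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1.0), None, Some(3.0)])?;
/// let filled = df.fill_null(FillNullStrategy::Mean)?;
/// assert_eq!(filled.column("a")?.null_count(), 0);
/// # Ok::<(), PolarsError>(())
/// ```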
2900 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2901 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2902
2903 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2904 }
2905
2906 /// Pipe different functions/closure operations that work on a DataFrame together.
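///
/// # Example
///
/// A minimal sketch with a hypothetical `keep_a` helper:
///
/// ```
/// # use polars_core::prelude::*;
/// fn keep_a(df: DataFrame) -> PolarsResult<DataFrame> {
///     df.select(["a"])
/// }
///
/// let df: DataFrame = df!("a" => [1, 2], "b" => [3, 4])?;
/// let out = df.pipe(keep_a)?;
/// assert_eq!(out.width(), 1);
/// # Ok::<(), PolarsError>(())
/// ```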
2907 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2908 where
2909 F: Fn(DataFrame) -> PolarsResult<B>,
2910 {
2911 f(self)
2912 }
2913
2914 /// Pipe different functions/closure operations that work on a DataFrame together.
2915 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2916 where
2917 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2918 {
2919 f(self)
2920 }
2921
2922 /// Pipe different functions/closure operations that work on a DataFrame together.
2923 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2924 where
2925 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2926 {
2927 f(self, args)
2928 }
2929
2930 /// Drop duplicate rows from a [`DataFrame`].
2931 /// *This fails when there is a column of type List in the DataFrame.*
2932 ///
2933 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2934 ///
2935 /// # Example
2936 ///
2937 /// ```no_run
2938 /// # use polars_core::prelude::*;
2939 /// let df = df! {
2940 /// "flt" => [1., 1., 2., 2., 3., 3.],
2941 /// "int" => [1, 1, 2, 2, 3, 3, ],
2942 /// "str" => ["a", "a", "b", "b", "c", "c"]
2943 /// }?;
2944 ///
2945 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2946 /// # Ok::<(), PolarsError>(())
2947 /// ```
2948 /// Returns
2949 ///
2950 /// ```text
2951 /// +-----+-----+-----+
2952 /// | flt | int | str |
2953 /// | --- | --- | --- |
2954 /// | f64 | i32 | str |
2955 /// +=====+=====+=====+
2956 /// | 1 | 1 | "a" |
2957 /// +-----+-----+-----+
2958 /// | 2 | 2 | "b" |
2959 /// +-----+-----+-----+
2960 /// | 3 | 3 | "c" |
2961 /// +-----+-----+-----+
2962 /// ```
2963 #[cfg(feature = "algorithm_group_by")]
2964 pub fn unique_stable(
2965 &self,
2966 subset: Option<&[String]>,
2967 keep: UniqueKeepStrategy,
2968 slice: Option<(i64, usize)>,
2969 ) -> PolarsResult<DataFrame> {
2970 self.unique_impl(
2971 true,
2972 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2973 keep,
2974 slice,
2975 )
2976 }
2977
2978 /// Unstable distinct. See [`DataFrame::unique_stable`].
2979 #[cfg(feature = "algorithm_group_by")]
2980 pub fn unique<I, S>(
2981 &self,
2982 subset: Option<&[String]>,
2983 keep: UniqueKeepStrategy,
2984 slice: Option<(i64, usize)>,
2985 ) -> PolarsResult<DataFrame> {
2986 self.unique_impl(
2987 false,
2988 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2989 keep,
2990 slice,
2991 )
2992 }
2993
2994 #[cfg(feature = "algorithm_group_by")]
2995 pub fn unique_impl(
2996 &self,
2997 maintain_order: bool,
2998 subset: Option<Vec<PlSmallStr>>,
2999 keep: UniqueKeepStrategy,
3000 slice: Option<(i64, usize)>,
3001 ) -> PolarsResult<Self> {
3002 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3003 let mut df = self.clone();
3004 // take on multiple chunks is terrible
3005 df.as_single_chunk_par();
3006
3007 let columns = match (keep, maintain_order) {
3008 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3009 let gb = df.group_by_stable(names)?;
3010 let groups = gb.get_groups();
3011 let (offset, len) = slice.unwrap_or((0, groups.len()));
3012 let groups = groups.slice(offset, len);
3013 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3014 },
3015 (UniqueKeepStrategy::Last, true) => {
3016 // Maintain order by the last values; the stable-ordered groups are not correct here,
3017 // as they are ordered by the first value.
3018 let gb = df.group_by(names)?;
3019 let groups = gb.get_groups();
3020
3021 let func = |g: GroupsIndicator| match g {
3022 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3023 GroupsIndicator::Slice([first, len]) => first + len - 1,
3024 };
3025
3026 let last_idx: NoNull<IdxCa> = match slice {
3027 None => groups.iter().map(func).collect(),
3028 Some((offset, len)) => {
3029 let (offset, len) = slice_offsets(offset, len, groups.len());
3030 groups.iter().skip(offset).take(len).map(func).collect()
3031 },
3032 };
3033
3034 let last_idx = last_idx.sort(false);
3035 return Ok(unsafe { df.take_unchecked(&last_idx) });
3036 },
3037 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3038 let gb = df.group_by(names)?;
3039 let groups = gb.get_groups();
3040 let (offset, len) = slice.unwrap_or((0, groups.len()));
3041 let groups = groups.slice(offset, len);
3042 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3043 },
3044 (UniqueKeepStrategy::Last, false) => {
3045 let gb = df.group_by(names)?;
3046 let groups = gb.get_groups();
3047 let (offset, len) = slice.unwrap_or((0, groups.len()));
3048 let groups = groups.slice(offset, len);
3049 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3050 },
3051 (UniqueKeepStrategy::None, _) => {
3052 let df_part = df.select(names)?;
3053 let mask = df_part.is_unique()?;
3054 let mask = match slice {
3055 None => mask,
3056 Some((offset, len)) => mask.slice(offset, len),
3057 };
3058 return df.filter(&mask);
3059 },
3060 };
3061
3062 let height = Self::infer_height(&columns);
3063 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3064 }
3065
3066 /// Get a mask of all the unique rows in the [`DataFrame`].
3067 ///
3068 /// # Example
3069 ///
3070 /// ```no_run
3071 /// # use polars_core::prelude::*;
3072 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3073 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3074 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3075 ///
3076 /// assert!(ca.all());
3077 /// # Ok::<(), PolarsError>(())
3078 /// ```
3079 #[cfg(feature = "algorithm_group_by")]
3080 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3081 let gb = self.group_by(self.get_column_names_owned())?;
3082 let groups = gb.get_groups();
3083 Ok(is_unique_helper(
3084 groups,
3085 self.height() as IdxSize,
3086 true,
3087 false,
3088 ))
3089 }
3090
3091 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3092 ///
3093 /// # Example
3094 ///
3095 /// ```no_run
3096 /// # use polars_core::prelude::*;
3097 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3098 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3099 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3100 ///
3101 /// assert!(!ca.all());
3102 /// # Ok::<(), PolarsError>(())
3103 /// ```
3104 #[cfg(feature = "algorithm_group_by")]
3105 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3106 let gb = self.group_by(self.get_column_names_owned())?;
3107 let groups = gb.get_groups();
3108 Ok(is_unique_helper(
3109 groups,
3110 self.height() as IdxSize,
3111 false,
3112 true,
3113 ))
3114 }
3115
3116 /// Create a new [`DataFrame`] that shows the null counts per column.
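///
/// # Example
///
/// A minimal sketch; the result has a single row with one count per column:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)],
///     "b" => [None::<&str>, None, Some("x")])?;
/// // "a" has one missing value, "b" has two.
/// let counts = df.null_count();
/// assert_eq!(counts.shape(), (1, 2));
/// # Ok::<(), PolarsError>(())
/// ```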
3117 #[must_use]
3118 pub fn null_count(&self) -> Self {
3119 let cols = self
3120 .columns
3121 .iter()
3122 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3123 .collect();
3124 unsafe { Self::new_no_checks(1, cols) }
3125 }
3126
3127 /// Hash and combine the row values
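///
/// # Example
///
/// A minimal sketch (assuming the `row_hash` feature is enabled):
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
/// let hashes = df.hash_rows(None)?;
/// assert_eq!(hashes.len(), df.height());
/// # Ok::<(), PolarsError>(())
/// ```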
3128 #[cfg(feature = "row_hash")]
3129 pub fn hash_rows(
3130 &mut self,
3131 hasher_builder: Option<PlSeedableRandomStateQuality>,
3132 ) -> PolarsResult<UInt64Chunked> {
3133 let dfs = split_df(self, POOL.current_num_threads(), false);
3134 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3135
3136 let mut iter = cas.into_iter();
3137 let mut acc_ca = iter.next().unwrap();
3138 for ca in iter {
3139 acc_ca.append(&ca)?;
3140 }
3141 Ok(acc_ca.rechunk().into_owned())
3142 }
3143
3144 /// Get the supertype of the columns in this DataFrame
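///
/// # Example
///
/// A minimal sketch:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1, 2, 3], "floats" => [1.0, 2.0, 3.0])?;
/// let supertype = df.get_supertype().unwrap()?;
/// assert_eq!(supertype, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```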
3145 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3146 self.columns
3147 .iter()
3148 .map(|s| Ok(s.dtype().clone()))
3149 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3150 }
3151
3152 /// Take by index values given by the slice `idx`.
3153 /// # Warning
3154 /// Be careful with allowing threads when calling this in a large hot loop:
3155 /// every thread split may end up on the rayon stack and lead to a stack overflow.
3156 #[doc(hidden)]
3157 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3158 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3159 }
3160
3161 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3162 /// if the index values in `idx` are sorted. This will maintain the sorted flags.
3163 ///
3164 /// # Warning
3165 /// Be careful with allowing threads when calling this in a large hot loop:
3166 /// every thread split may end up on the rayon stack and lead to a stack overflow.
3167 #[doc(hidden)]
3168 pub unsafe fn _take_unchecked_slice_sorted(
3169 &self,
3170 idx: &[IdxSize],
3171 allow_threads: bool,
3172 sorted: IsSorted,
3173 ) -> Self {
3174 #[cfg(debug_assertions)]
3175 {
3176 if idx.len() > 2 {
3177 match sorted {
3178 IsSorted::Ascending => {
3179 assert!(idx[0] <= idx[idx.len() - 1]);
3180 },
3181 IsSorted::Descending => {
3182 assert!(idx[0] >= idx[idx.len() - 1]);
3183 },
3184 _ => {},
3185 }
3186 }
3187 }
3188 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3189 ca.set_sorted_flag(sorted);
3190 self.take_unchecked_impl(&ca, allow_threads)
3191 }
3192
3193 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3194 #[doc(hidden)]
3195 pub fn _partition_by_impl(
3196 &self,
3197 cols: &[PlSmallStr],
3198 stable: bool,
3199 include_key: bool,
3200 parallel: bool,
3201 ) -> PolarsResult<Vec<DataFrame>> {
3202 let selected_keys = self.select_columns(cols.iter().cloned())?;
3203 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3204 let groups = groups.take_groups();
3205
3206 // drop key columns prior to calculation if requested
3207 let df = if include_key {
3208 self.clone()
3209 } else {
3210 self.drop_many(cols.iter().cloned())
3211 };
3212
3213 if parallel {
3214 // don't parallelize this
3215 // there is a lot of parallelization in take and this may easily lead to a stack overflow
3216 POOL.install(|| {
3217 match groups.as_ref() {
3218 GroupsType::Idx(idx) => {
3219 // Rechunk as the gather may rechunk for every group #17562.
3220 let mut df = df.clone();
3221 df.as_single_chunk_par();
3222 Ok(idx
3223 .into_par_iter()
3224 .map(|(_, group)| {
3225 // groups are in bounds
3226 unsafe {
3227 df._take_unchecked_slice_sorted(
3228 group,
3229 false,
3230 IsSorted::Ascending,
3231 )
3232 }
3233 })
3234 .collect())
3235 },
3236 GroupsType::Slice { groups, .. } => Ok(groups
3237 .into_par_iter()
3238 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3239 .collect()),
3240 }
3241 })
3242 } else {
3243 match groups.as_ref() {
3244 GroupsType::Idx(idx) => {
3245 // Rechunk as the gather may rechunk for every group #17562.
3246 let mut df = df.clone();
3247 df.as_single_chunk();
3248 Ok(idx
3249 .into_iter()
3250 .map(|(_, group)| {
3251 // groups are in bounds
3252 unsafe {
3253 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3254 }
3255 })
3256 .collect())
3257 },
3258 GroupsType::Slice { groups, .. } => Ok(groups
3259 .iter()
3260 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3261 .collect()),
3262 }
3263 }
3264 }
3265
3266 /// Split into multiple DataFrames partitioned by groups
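///
/// # Example
///
/// A minimal sketch (assuming the `partition_by` feature is enabled); the order of the
/// returned frames is not guaranteed, see [`DataFrame::partition_by_stable`]:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("groups" => ["a", "a", "b"], "values" => [1, 2, 3])?;
/// let parts = df.partition_by(["groups"], true)?;
/// assert_eq!(parts.len(), 2);
/// # Ok::<(), PolarsError>(())
/// ```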
3267 #[cfg(feature = "partition_by")]
3268 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3269 where
3270 I: IntoIterator<Item = S>,
3271 S: Into<PlSmallStr>,
3272 {
3273 let cols = cols
3274 .into_iter()
3275 .map(Into::into)
3276 .collect::<Vec<PlSmallStr>>();
3277 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3278 }
3279
3280 /// Split into multiple DataFrames partitioned by groups
3281 /// The order of the groups is maintained.
3282 #[cfg(feature = "partition_by")]
3283 pub fn partition_by_stable<I, S>(
3284 &self,
3285 cols: I,
3286 include_key: bool,
3287 ) -> PolarsResult<Vec<DataFrame>>
3288 where
3289 I: IntoIterator<Item = S>,
3290 S: Into<PlSmallStr>,
3291 {
3292 let cols = cols
3293 .into_iter()
3294 .map(Into::into)
3295 .collect::<Vec<PlSmallStr>>();
3296 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3297 }
3298
3299 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3300 /// inserted as columns.
3301 #[cfg(feature = "dtype-struct")]
3302 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3303 let cols = cols.into_vec();
3304 self.unnest_impl(cols.into_iter().collect())
3305 }
3306
3307 #[cfg(feature = "dtype-struct")]
3308 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3309 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3310 let mut count = 0;
3311 for s in &self.columns {
3312 if cols.contains(s.name()) {
3313 let ca = s.struct_()?.clone();
3314 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3315 count += 1;
3316 } else {
3317 new_cols.push(s.clone())
3318 }
3319 }
3320 if count != cols.len() {
3321 // one or more columns not found
3322 // the code below will return an error with the missing name
3323 let schema = self.schema();
3324 for col in cols {
3325 let _ = schema
3326 .get(col.as_str())
3327 .ok_or_else(|| polars_err!(col_not_found = col))?;
3328 }
3329 }
3330 DataFrame::new(new_cols)
3331 }
3332
3333 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3334 cols.first().map_or(0, Column::len)
3335 }
3336
3337 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3338 // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3339 // `append_chunk` or something like it. It is just quite difficult to make that safe.
3340 let df = DataFrame::from(rb);
3341 polars_ensure!(
3342 self.schema() == df.schema(),
3343 SchemaMismatch: "cannot append record batch with different schema",
3344 );
3345 self.vstack_mut_owned_unchecked(df);
3346 Ok(())
3347 }
3348}
3349
3350pub struct RecordBatchIter<'a> {
3351 columns: &'a Vec<Column>,
3352 schema: ArrowSchemaRef,
3353 idx: usize,
3354 n_chunks: usize,
3355 compat_level: CompatLevel,
3356 parallel: bool,
3357}
3358
3359impl Iterator for RecordBatchIter<'_> {
3360 type Item = RecordBatch;
3361
3362 fn next(&mut self) -> Option<Self::Item> {
3363 if self.idx >= self.n_chunks {
3364 return None;
3365 }
3366
3367 // Create a batch of the columns with the same chunk no.
3368 let batch_cols: Vec<ArrayRef> = if self.parallel {
3369 let iter = self
3370 .columns
3371 .par_iter()
3372 .map(Column::as_materialized_series)
3373 .map(|s| s.to_arrow(self.idx, self.compat_level));
3374 POOL.install(|| iter.collect())
3375 } else {
3376 self.columns
3377 .iter()
3378 .map(Column::as_materialized_series)
3379 .map(|s| s.to_arrow(self.idx, self.compat_level))
3380 .collect()
3381 };
3382 self.idx += 1;
3383
3384 let length = batch_cols.first().map_or(0, |arr| arr.len());
3385 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3386 }
3387
3388 fn size_hint(&self) -> (usize, Option<usize>) {
3389 let n = self.n_chunks - self.idx;
3390 (n, Some(n))
3391 }
3392}
3393
3394pub struct PhysRecordBatchIter<'a> {
3395 schema: ArrowSchemaRef,
3396 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3397}
3398
3399impl Iterator for PhysRecordBatchIter<'_> {
3400 type Item = RecordBatch;
3401
3402 fn next(&mut self) -> Option<Self::Item> {
3403 let arrs = self
3404 .arr_iters
3405 .iter_mut()
3406 .map(|phys_iter| phys_iter.next().cloned())
3407 .collect::<Option<Vec<_>>>()?;
3408
3409 let length = arrs.first().map_or(0, |arr| arr.len());
3410 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3411 }
3412
3413 fn size_hint(&self) -> (usize, Option<usize>) {
3414 if let Some(iter) = self.arr_iters.first() {
3415 iter.size_hint()
3416 } else {
3417 (0, None)
3418 }
3419 }
3420}
3421
3422impl Default for DataFrame {
3423 fn default() -> Self {
3424 DataFrame::empty()
3425 }
3426}
3427
3428impl From<DataFrame> for Vec<Column> {
3429 fn from(df: DataFrame) -> Self {
3430 df.columns
3431 }
3432}
3433
3434// utility to test if we can vstack/extend the columns
3435fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3436 polars_ensure!(
3437 left.name() == right.name(),
3438 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3439 left.name(), right.name(),
3440 );
3441 Ok(())
3442}
3443
3444#[cfg(test)]
3445mod test {
3446 use super::*;
3447
3448 fn create_frame() -> DataFrame {
3449 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3450 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3451 DataFrame::new(vec![s0, s1]).unwrap()
3452 }
3453
3454 #[test]
3455 #[cfg_attr(miri, ignore)]
3456 fn test_recordbatch_iterator() {
3457 let df = df!(
3458 "foo" => [1, 2, 3, 4, 5]
3459 )
3460 .unwrap();
3461 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3462 assert_eq!(5, iter.next().unwrap().len());
3463 assert!(iter.next().is_none());
3464 }
3465
3466 #[test]
3467 #[cfg_attr(miri, ignore)]
3468 fn test_select() {
3469 let df = create_frame();
3470 assert_eq!(
3471 df.column("days")
3472 .unwrap()
3473 .as_series()
3474 .unwrap()
3475 .equal(1)
3476 .unwrap()
3477 .sum(),
3478 Some(1)
3479 );
3480 }
3481
3482 #[test]
3483 #[cfg_attr(miri, ignore)]
3484 fn test_filter_broadcast_on_string_col() {
3485 let col_name = "some_col";
3486 let v = vec!["test".to_string()];
3487 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3488 let mut df = DataFrame::new(vec![s0]).unwrap();
3489
3490 df = df
3491 .filter(
3492 &df.column(col_name)
3493 .unwrap()
3494 .as_materialized_series()
3495 .equal("")
3496 .unwrap(),
3497 )
3498 .unwrap();
3499 assert_eq!(
3500 df.column(col_name)
3501 .unwrap()
3502 .as_materialized_series()
3503 .n_chunks(),
3504 1
3505 );
3506 }
3507
3508 #[test]
3509 #[cfg_attr(miri, ignore)]
3510 fn test_filter_broadcast_on_list_col() {
3511 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3512 let ll: ListChunked = [&s1].iter().copied().collect();
3513
3514 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3515 let new = ll.filter(&mask).unwrap();
3516
3517 assert_eq!(new.chunks.len(), 1);
3518 assert_eq!(new.len(), 0);
3519 }
3520
3521 #[test]
3522 fn slice() {
3523 let df = create_frame();
3524 let sliced_df = df.slice(0, 2);
3525 assert_eq!(sliced_df.shape(), (2, 2));
3526 }
3527
3528 #[test]
3529 fn rechunk_false() {
3530 let df = create_frame();
3531 assert!(!df.should_rechunk())
3532 }
3533
3534 #[test]
3535 fn rechunk_true() -> PolarsResult<()> {
3536 let mut base = df!(
3537 "a" => [1, 2, 3],
3538 "b" => [1, 2, 3]
3539 )?;
3540
3541 // Create a series with multiple chunks
3542 let mut s = Series::new("foo".into(), 0..2);
3543 let s2 = Series::new("bar".into(), 0..1);
3544 s.append(&s2)?;
3545
3546 // Append series to frame
3547 let out = base.with_column(s)?;
3548
3549 // Now we should rechunk
3550 assert!(out.should_rechunk());
3551 Ok(())
3552 }
3553
3554 #[test]
3555 fn test_duplicate_column() {
3556 let mut df = df! {
3557 "foo" => [1, 2, 3]
3558 }
3559 .unwrap();
3560 // check if column is replaced
3561 assert!(
3562 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3563 .is_ok()
3564 );
3565 assert!(
3566 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3567 .is_ok()
3568 );
3569 assert!(df.column("bar").is_ok())
3570 }
3571
3572 #[test]
3573 #[cfg_attr(miri, ignore)]
3574 fn distinct() {
3575 let df = df! {
3576 "flt" => [1., 1., 2., 2., 3., 3.],
3577 "int" => [1, 1, 2, 2, 3, 3, ],
3578 "str" => ["a", "a", "b", "b", "c", "c"]
3579 }
3580 .unwrap();
3581 let df = df
3582 .unique_stable(None, UniqueKeepStrategy::First, None)
3583 .unwrap()
3584 .sort(["flt"], SortMultipleOptions::default())
3585 .unwrap();
3586 let valid = df! {
3587 "flt" => [1., 2., 3.],
3588 "int" => [1, 2, 3],
3589 "str" => ["a", "b", "c"]
3590 }
3591 .unwrap();
3592 assert!(df.equals(&valid));
3593 }
3594
3595 #[test]
3596 fn test_vstack() {
3597 // check that it does not accidentally rechunk
3598 let mut df = df! {
3599 "flt" => [1., 1., 2., 2., 3., 3.],
3600 "int" => [1, 1, 2, 2, 3, 3, ],
3601 "str" => ["a", "a", "b", "b", "c", "c"]
3602 }
3603 .unwrap();
3604
3605 df.vstack_mut(&df.slice(0, 3)).unwrap();
3606 assert_eq!(df.first_col_n_chunks(), 2)
3607 }
3608
3609 #[test]
3610 fn test_vstack_on_empty_dataframe() {
3611 let mut df = DataFrame::empty();
3612
3613 let df_data = df! {
3614 "flt" => [1., 1., 2., 2., 3., 3.],
3615 "int" => [1, 1, 2, 2, 3, 3, ],
3616 "str" => ["a", "a", "b", "b", "c", "c"]
3617 }
3618 .unwrap();
3619
3620 df.vstack_mut(&df_data).unwrap();
3621 assert_eq!(df.height, 6)
3622 }
3623
3624 #[test]
3625 fn test_replace_or_add() -> PolarsResult<()> {
3626 let mut df = df!(
3627 "a" => [1, 2, 3],
3628 "b" => [1, 2, 3]
3629 )?;
3630
3631 // check that the new column is "c" and not "bar".
3632 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3633
3634 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3635 Ok(())
3636 }
3637}