polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(feature = "proptest")]
33pub mod proptest;
34#[cfg(any(feature = "rows", feature = "object"))]
35pub mod row;
36mod top_k;
37mod upstream_traits;
38mod validation;
39
40use arrow::record_batch::{RecordBatch, RecordBatchT};
41use polars_utils::pl_str::PlSmallStr;
42#[cfg(feature = "serde")]
43use serde::{Deserialize, Serialize};
44use strum_macros::IntoStaticStr;
45
46use crate::POOL;
47#[cfg(feature = "row_hash")]
48use crate::hashing::_df_rows_to_hashes_threaded_vertical;
49use crate::prelude::sort::arg_sort;
50use crate::series::IsSorted;
51
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the unique rows.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations.
    ///
    /// This is the default strategy.
    #[default]
    Any,
}
68
69fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
70where
71 F: for<'a> FnMut(&'a T) -> &'a str,
72{
73 // Always unique.
74 if items.len() <= 1 {
75 return Ok(());
76 }
77
78 if items.len() <= 4 {
79 // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
80 for i in 0..items.len() - 1 {
81 let name = get_name(&items[i]);
82 for other in items.iter().skip(i + 1) {
83 if name == get_name(other) {
84 polars_bail!(duplicate = name);
85 }
86 }
87 }
88 } else {
89 let mut names = PlHashSet::with_capacity(items.len());
90 for item in items {
91 let name = get_name(item);
92 if !names.insert(name) {
93 polars_bail!(duplicate = name);
94 }
95 }
96 }
97 Ok(())
98}
99
100/// A contiguous growable collection of `Series` that have the same length.
101///
102/// ## Use declarations
103///
104/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
105///
106/// ```rust
107/// use polars_core::prelude::*; // if the crate polars-core is used directly
108/// // use polars::prelude::*; if the crate polars is used
109/// ```
110///
111/// # Initialization
112/// ## Default
113///
114/// A `DataFrame` can be initialized empty:
115///
116/// ```rust
117/// # use polars_core::prelude::*;
118/// let df = DataFrame::default();
119/// assert!(df.is_empty());
120/// ```
121///
122/// ## Wrapping a `Vec<Series>`
123///
124/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
125///
126/// ```rust
127/// # use polars_core::prelude::*;
128/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
129/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
130///
131/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
132/// ```
133///
134/// ## Using a macro
135///
136/// The [`df!`] macro is a convenient method:
137///
138/// ```rust
139/// # use polars_core::prelude::*;
140/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
141/// "Color" => ["Red", "Yellow", "Green"]);
142/// ```
143///
144/// ## Using a CSV file
145///
146/// See the `polars_io::csv::CsvReader`.
147///
148/// # Indexing
149/// ## By a number
150///
151/// The `Index<usize>` is implemented for the `DataFrame`.
152///
153/// ```rust
154/// # use polars_core::prelude::*;
155/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
156/// "Color" => ["Red", "Yellow", "Green"])?;
157///
158/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
159/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
160/// # Ok::<(), PolarsError>(())
161/// ```
162///
163/// ## By a `Series` name
164///
165/// ```rust
166/// # use polars_core::prelude::*;
167/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
168/// "Color" => ["Red", "Yellow", "Green"])?;
169///
170/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
171/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
172/// # Ok::<(), PolarsError>(())
173/// ```
#[derive(Clone)]
pub struct DataFrame {
    // Number of rows.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between caching the schema and reading it.
    cached_schema: OnceLock<SchemaRef>,
}
184
185impl DataFrame {
186 pub fn clear_schema(&mut self) {
187 self.cached_schema = OnceLock::new();
188 }
189
190 #[inline]
191 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
192 self.columns.iter()
193 }
194
195 #[inline]
196 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
197 self.columns.iter().map(Column::as_materialized_series)
198 }
199
200 #[inline]
201 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
202 self.columns.par_iter().map(Column::as_materialized_series)
203 }
204
205 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
206 ///
207 /// # Implementation
208 /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
209 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
210 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
211 ///
212 /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
213 /// However, this function will yield a smaller number. This is because this function returns
214 /// the visible size of the buffer, not its total capacity.
215 ///
216 /// FFI buffers are included in this estimation.
217 pub fn estimated_size(&self) -> usize {
218 self.columns.iter().map(Column::estimated_size).sum()
219 }
220
221 // Reduce monomorphization.
222 fn try_apply_columns(
223 &self,
224 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
225 ) -> PolarsResult<Vec<Column>> {
226 self.columns.iter().map(func).collect()
227 }
228 // Reduce monomorphization.
229 pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
230 self.columns.iter().map(func).collect()
231 }
232 // Reduce monomorphization.
233 fn try_apply_columns_par(
234 &self,
235 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
236 ) -> PolarsResult<Vec<Column>> {
237 POOL.install(|| self.columns.par_iter().map(func).collect())
238 }
239 // Reduce monomorphization.
240 pub fn _apply_columns_par(
241 &self,
242 func: &(dyn Fn(&Column) -> Column + Send + Sync),
243 ) -> Vec<Column> {
244 POOL.install(|| self.columns.par_iter().map(func).collect())
245 }
246
247 /// Get the index of the column.
248 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
249 self.get_column_index(name)
250 .ok_or_else(|| polars_err!(col_not_found = name))
251 }
252
253 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
254 polars_ensure!(
255 self.columns.iter().all(|s| s.name().as_str() != name),
256 Duplicate: "column with name {:?} is already present in the DataFrame", name
257 );
258 Ok(())
259 }
260
261 /// Reserve additional slots into the chunks of the series.
262 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
263 for s in &mut self.columns {
264 if let Column::Series(s) = s {
265 // SAFETY:
266 // do not modify the data, simply resize.
267 unsafe { s.chunks_mut().reserve(additional) }
268 }
269 }
270 }
271
272 /// Create a DataFrame from a Vector of Series.
273 ///
274 /// Errors if a column names are not unique, or if heights are not all equal.
275 ///
276 /// # Example
277 ///
278 /// ```
279 /// # use polars_core::prelude::*;
280 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
281 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
282 ///
283 /// let df = DataFrame::new(vec![s0, s1])?;
284 /// # Ok::<(), PolarsError>(())
285 /// ```
286 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
287 DataFrame::validate_columns_slice(&columns)
288 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
289 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
290 }
291
292 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
293 for col in &columns {
294 polars_ensure!(
295 col.len() == height,
296 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
297 columns[0].name(), height, col.name(), col.len()
298 );
299 }
300
301 ensure_names_unique(&columns, |s| s.name().as_str())?;
302
303 Ok(DataFrame {
304 height,
305 columns,
306 cached_schema: OnceLock::new(),
307 })
308 }
309
310 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
311 /// columns to match the other columns.
312 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
313 // The length of the longest non-unit length column determines the
314 // broadcast length. If all columns are unit-length the broadcast length
315 // is one.
316 let broadcast_len = columns
317 .iter()
318 .map(|s| s.len())
319 .filter(|l| *l != 1)
320 .max()
321 .unwrap_or(1);
322 Self::new_with_broadcast_len(columns, broadcast_len)
323 }
324
325 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
326 /// columns to broadcast_len.
327 pub fn new_with_broadcast_len(
328 columns: Vec<Column>,
329 broadcast_len: usize,
330 ) -> PolarsResult<Self> {
331 ensure_names_unique(&columns, |s| s.name().as_str())?;
332 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
333 }
334
/// Converts a sequence of columns into a DataFrame, broadcasting length-1
/// columns to `broadcast_len`.
///
/// Columns whose length already equals `broadcast_len` are kept as they are. A column whose
/// length is neither `broadcast_len` nor 1 results in a shape-mismatch error. When no columns
/// are given, the resulting height is 0.
///
/// # Safety
/// Does not check that the column names are unique (which they must be).
pub unsafe fn new_with_broadcast_no_namecheck(
    mut columns: Vec<Column>,
    broadcast_len: usize,
) -> PolarsResult<Self> {
    for col in &mut columns {
        // Length not equal to the broadcast len, needs broadcast or is an error.
        let len = col.len();
        if len != broadcast_len {
            if len != 1 {
                let name = col.name().to_owned();
                // For the error message, mention a column that does have the target length
                // (if there is one).
                let extra_info =
                    if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
                        format!(" (matching column '{}')", c.name())
                    } else {
                        String::new()
                    };
                polars_bail!(
                    ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
                );
            }
            // Unit-length column: rebuild it at the target length from its element at index 0.
            *col = col.new_from_index(0, broadcast_len);
        }
    }

    // A frame without columns gets height 0 regardless of `broadcast_len`.
    let length = if columns.is_empty() { 0 } else { broadcast_len };

    Ok(unsafe { DataFrame::new_no_checks(length, columns) })
}
368
369 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
370 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
371 unsafe { Self::new_no_checks(height, cols.collect()) }
372 }
373
374 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
375 ///
376 /// # Example
377 ///
378 /// ```rust
379 /// use polars_core::prelude::DataFrame;
380 /// static EMPTY: DataFrame = DataFrame::empty();
381 /// ```
382 pub const fn empty() -> Self {
383 Self::empty_with_height(0)
384 }
385
386 /// Creates an empty `DataFrame` with a specific `height`.
387 pub const fn empty_with_height(height: usize) -> Self {
388 DataFrame {
389 height,
390 columns: vec![],
391 cached_schema: OnceLock::new(),
392 }
393 }
394
395 /// Create an empty `DataFrame` with empty columns as per the `schema`.
396 pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
397 let mut df = Self::empty_with_schema(&schema);
398 df.cached_schema = OnceLock::from(schema);
399 df
400 }
401
402 /// Create an empty `DataFrame` with empty columns as per the `schema`.
403 pub fn empty_with_schema(schema: &Schema) -> Self {
404 let cols = schema
405 .iter()
406 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
407 .collect();
408 unsafe { DataFrame::new_no_checks(0, cols) }
409 }
410
411 /// Create an empty `DataFrame` with empty columns as per the `schema`.
412 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
413 let cols = schema
414 .iter_values()
415 .map(|fld| {
416 Column::from(Series::new_empty(
417 fld.name.clone(),
418 &(DataType::from_arrow_field(fld)),
419 ))
420 })
421 .collect();
422 unsafe { DataFrame::new_no_checks(0, cols) }
423 }
424
425 /// Create a new `DataFrame` with the given schema, only containing nulls.
426 pub fn full_null(schema: &Schema, height: usize) -> Self {
427 let columns = schema
428 .iter_fields()
429 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
430 .collect();
431 unsafe { DataFrame::new_no_checks(height, columns) }
432 }
433
434 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
435 ///
436 /// # Example
437 ///
438 /// ```rust
439 /// # use polars_core::prelude::*;
440 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
441 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
442 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
443 ///
444 /// assert_eq!(df.pop(), Some(s2));
445 /// assert_eq!(df.pop(), Some(s1));
446 /// assert_eq!(df.pop(), None);
447 /// assert!(df.is_empty());
448 /// # Ok::<(), PolarsError>(())
449 /// ```
450 pub fn pop(&mut self) -> Option<Column> {
451 self.clear_schema();
452
453 self.columns.pop()
454 }
455
456 /// Add a new column at index 0 that counts the rows.
457 ///
458 /// # Example
459 ///
460 /// ```
461 /// # use polars_core::prelude::*;
462 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
463 /// assert_eq!(df1.shape(), (4, 1));
464 ///
465 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
466 /// assert_eq!(df2.shape(), (4, 2));
467 /// println!("{}", df2);
468 ///
469 /// # Ok::<(), PolarsError>(())
470 /// ```
471 ///
472 /// Output:
473 ///
474 /// ```text
475 /// shape: (4, 2)
476 /// +-----+----------+
477 /// | Id | Name |
478 /// | --- | --- |
479 /// | u32 | str |
480 /// +=====+==========+
481 /// | 0 | James |
482 /// +-----+----------+
483 /// | 1 | Mary |
484 /// +-----+----------+
485 /// | 2 | John |
486 /// +-----+----------+
487 /// | 3 | Patricia |
488 /// +-----+----------+
489 /// ```
490 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
491 let mut columns = Vec::with_capacity(self.columns.len() + 1);
492 let offset = offset.unwrap_or(0);
493
494 let col = Column::new_row_index(name, offset, self.height())?;
495 columns.push(col);
496 columns.extend_from_slice(&self.columns);
497 DataFrame::new(columns)
498 }
499
/// Add a row index column in place.
///
/// The new column is inserted at position 0 and the cached schema is cleared. A missing
/// `offset` is treated as 0.
///
/// # Safety
/// The caller should ensure the DataFrame does not already contain a column with the given name.
///
/// # Panics
/// Panics if the resulting column would reach or overflow IdxSize::MAX.
pub unsafe fn with_row_index_mut(
    &mut self,
    name: PlSmallStr,
    offset: Option<IdxSize>,
) -> &mut Self {
    // Name uniqueness is the caller's responsibility; it is only verified in debug builds.
    debug_assert!(
        self.columns.iter().all(|c| c.name() != &name),
        "with_row_index_mut(): column with name {} already exists",
        &name
    );

    let offset = offset.unwrap_or(0);
    // A failure to build the index column is turned into a panic here.
    let col = Column::new_row_index(name, offset, self.height()).unwrap();

    self.clear_schema();
    self.columns.insert(0, col);
    self
}
526
527 /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
528 /// `Series`.
529 ///
530 /// Calculates the height from the first column or `0` if no columns are given.
531 ///
532 /// # Safety
533 ///
534 /// It is the callers responsibility to uphold the contract of all `Series`
535 /// having an equal length and a unique name, if not this may panic down the line.
536 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
537 let height = columns.first().map_or(0, Column::len);
538 unsafe { Self::new_no_checks(height, columns) }
539 }
540
541 /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
542 /// `Series`.
543 ///
544 /// It is advised to use [DataFrame::new] in favor of this method.
545 ///
546 /// # Safety
547 ///
548 /// It is the callers responsibility to uphold the contract of all `Series`
549 /// having an equal length and a unique name, if not this may panic down the line.
550 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
551 if cfg!(debug_assertions) {
552 DataFrame::validate_columns_slice(&columns).unwrap();
553 }
554
555 unsafe { Self::_new_no_checks_impl(height, columns) }
556 }
557
558 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
559 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
560 /// constructed with this method is generally highly unsafe and should not be long-lived.
561 #[allow(clippy::missing_safety_doc)]
562 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
563 DataFrame {
564 height,
565 columns,
566 cached_schema: OnceLock::new(),
567 }
568 }
569
570 /// Shrink the capacity of this DataFrame to fit its length.
571 pub fn shrink_to_fit(&mut self) {
572 // Don't parallelize this. Memory overhead
573 for s in &mut self.columns {
574 s.shrink_to_fit();
575 }
576 }
577
578 /// Aggregate all the chunks in the DataFrame to a single chunk.
579 pub fn as_single_chunk(&mut self) -> &mut Self {
580 // Don't parallelize this. Memory overhead
581 for s in &mut self.columns {
582 *s = s.rechunk();
583 }
584 self
585 }
586
587 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
588 /// This may lead to more peak memory consumption.
589 pub fn as_single_chunk_par(&mut self) -> &mut Self {
590 if self.columns.iter().any(|c| c.n_chunks() > 1) {
591 self.columns = self._apply_columns_par(&|s| s.rechunk());
592 }
593 self
594 }
595
596 /// Rechunks all columns to only have a single chunk.
597 pub fn rechunk_mut(&mut self) {
598 // SAFETY: We never adjust the length or names of the columns.
599 let columns = unsafe { self.get_columns_mut() };
600
601 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
602 *col = col.rechunk();
603 }
604 }
605
/// Replaces, in place, every `Column::Series` column that can be accessed as a binary or
/// string chunked array with a version whose array chunks went through `deshare()`.
///
/// Columns of other variants, and series for which both `binary()` and `str()` fail, are
/// left untouched.
// NOTE(review): the `gc_ca` naming suggests `deshare` garbage-collects shared view buffers;
// confirm against the array implementation.
pub fn _deshare_views_mut(&mut self) {
    // SAFETY: We never adjust the length or names of the columns.
    unsafe {
        let columns = self.get_columns_mut();
        for col in columns {
            // Only series-backed columns are considered.
            let Column::Series(s) = col else { continue };

            if let Ok(ca) = s.binary() {
                let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                *col = Column::from(gc_ca.into_series());
            } else if let Ok(ca) = s.str() {
                let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                *col = Column::from(gc_ca.into_series());
            }
        }
    }
}
623
624 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
625 pub fn rechunk_to_record_batch(
626 self,
627 compat_level: CompatLevel,
628 ) -> RecordBatchT<Box<dyn Array>> {
629 let height = self.height();
630
631 let (schema, arrays) = self
632 .columns
633 .into_iter()
634 .map(|col| {
635 let mut series = col.take_materialized_series();
636 // Rechunk to one chunk if necessary
637 if series.n_chunks() > 1 {
638 series = series.rechunk();
639 }
640 (
641 series.field().to_arrow(compat_level),
642 series.to_arrow(0, compat_level),
643 )
644 })
645 .collect();
646
647 RecordBatchT::new(height, Arc::new(schema), arrays)
648 }
649
/// Returns true if the chunks of the columns do not align and re-chunking should be done.
///
/// The check proceeds in stages: the chunk counts of the series-backed columns must all be
/// equal, then the chunk lengths of every (materialized) column are compared against those
/// of the first column. A `DataFrame` without columns never needs rechunking.
pub fn should_rechunk(&self) -> bool {
    // Fast check. It is also needed for correctness, as code below doesn't check if the number
    // of chunks is equal.
    if !self
        .get_columns()
        .iter()
        .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
        .all_equal()
    {
        return true;
    }

    // From here we check chunk lengths.
    let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
    match chunk_lengths.next() {
        // No columns at all.
        None => false,
        Some(first_column_chunk_lengths) => {
            // Fast Path for single Chunk Series
            // (the lower bound of the size hint is used as the number of chunks).
            if first_column_chunk_lengths.size_hint().0 == 1 {
                return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
            }
            // Always rechunk if we have more chunks than rows.
            // except when we have an empty df containing a single chunk
            let height = self.height();
            let n_chunks = first_column_chunk_lengths.size_hint().0;
            if n_chunks > height && !(height == 0 && n_chunks == 1) {
                return true;
            }
            // Slow Path for multi Chunk series
            // Every chunk length of every other column must equal the first column's chunk
            // length at the same position.
            let v: Vec<_> = first_column_chunk_lengths.collect();
            for cl in chunk_lengths {
                if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                    return true;
                }
            }
            false
        },
    }
}
690
691 /// Ensure all the chunks in the [`DataFrame`] are aligned.
692 pub fn align_chunks_par(&mut self) -> &mut Self {
693 if self.should_rechunk() {
694 self.as_single_chunk_par()
695 } else {
696 self
697 }
698 }
699
700 pub fn align_chunks(&mut self) -> &mut Self {
701 if self.should_rechunk() {
702 self.as_single_chunk()
703 } else {
704 self
705 }
706 }
707
708 /// Get the [`DataFrame`] schema.
709 ///
710 /// # Example
711 ///
712 /// ```rust
713 /// # use polars_core::prelude::*;
714 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
715 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
716 ///
717 /// let f1: Field = Field::new("Thing".into(), DataType::String);
718 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
719 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
720 ///
721 /// assert_eq!(&**df.schema(), &sc);
722 /// # Ok::<(), PolarsError>(())
723 /// ```
724 pub fn schema(&self) -> &SchemaRef {
725 let out = self.cached_schema.get_or_init(|| {
726 Arc::new(
727 self.columns
728 .iter()
729 .map(|x| (x.name().clone(), x.dtype().clone()))
730 .collect(),
731 )
732 });
733
734 debug_assert_eq!(out.len(), self.width());
735
736 out
737 }
738
739 /// Get a reference to the [`DataFrame`] columns.
740 ///
741 /// # Example
742 ///
743 /// ```rust
744 /// # use polars_core::prelude::*;
745 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
746 /// "Symbol" => ["A", "C", "G", "T"])?;
747 /// let columns: &[Column] = df.get_columns();
748 ///
749 /// assert_eq!(columns[0].name(), "Name");
750 /// assert_eq!(columns[1].name(), "Symbol");
751 /// # Ok::<(), PolarsError>(())
752 /// ```
753 #[inline]
754 pub fn get_columns(&self) -> &[Column] {
755 &self.columns
756 }
757
#[inline]
/// Get mutable access to the underlying vector of columns.
///
/// Nothing is validated or invalidated by this call; the cached schema stays as it is.
///
/// # Safety
///
/// The caller must ensure the length of all [`Series`] remains equal to `height` or
/// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
/// The caller must ensure that the cached schema is cleared if it modifies the schema by
/// calling [`DataFrame::clear_schema`].
pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
    &mut self.columns
}
770
771 #[inline]
772 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
773 pub fn clear_columns(&mut self) {
774 unsafe { self.get_columns_mut() }.clear();
775 self.clear_schema();
776 }
777
778 #[inline]
779 /// Extend the columns without checking for name collisions or height.
780 ///
781 /// # Safety
782 ///
783 /// The caller needs to ensure that:
784 /// - Column names are unique within the resulting [`DataFrame`].
785 /// - The length of each appended column matches the height of the [`DataFrame`]. For
786 /// `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
787 /// with [`DataFrame::set_height`].
788 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
789 unsafe { self.get_columns_mut() }.extend(iter);
790 self.clear_schema();
791 }
792
793 /// Take ownership of the underlying columns vec.
794 pub fn take_columns(self) -> Vec<Column> {
795 self.columns
796 }
797
798 /// Iterator over the columns as [`Series`].
799 ///
800 /// # Example
801 ///
802 /// ```rust
803 /// # use polars_core::prelude::*;
804 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
805 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
806 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
807 ///
808 /// let mut iterator = df.iter();
809 ///
810 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
811 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
812 /// assert_eq!(iterator.next(), None);
813 /// # Ok::<(), PolarsError>(())
814 /// ```
815 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
816 self.materialized_column_iter()
817 }
818
819 /// # Example
820 ///
821 /// ```rust
822 /// # use polars_core::prelude::*;
823 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
824 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
825 ///
826 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
827 /// # Ok::<(), PolarsError>(())
828 /// ```
829 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
830 self.columns.iter().map(|s| s.name()).collect()
831 }
832
833 /// Get the [`Vec<PlSmallStr>`] representing the column names.
834 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
835 self.columns.iter().map(|s| s.name().clone()).collect()
836 }
837
838 pub fn get_column_names_str(&self) -> Vec<&str> {
839 self.columns.iter().map(|s| s.name().as_str()).collect()
840 }
841
842 /// Set the column names.
843 /// # Example
844 ///
845 /// ```rust
846 /// # use polars_core::prelude::*;
847 /// let mut df: DataFrame = df!("Mathematical set" => ["ā", "ā¤", "š»", "ā", "ā", "ā"])?;
848 /// df.set_column_names(["Set"])?;
849 ///
850 /// assert_eq!(df.get_column_names(), &["Set"]);
851 /// # Ok::<(), PolarsError>(())
852 /// ```
853 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
854 where
855 I: IntoIterator<Item = S>,
856 S: Into<PlSmallStr>,
857 {
858 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
859 self._set_column_names_impl(names.as_slice())
860 }
861
862 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
863 polars_ensure!(
864 names.len() == self.width(),
865 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
866 names.len(), self.width()
867 );
868 ensure_names_unique(names, |s| s.as_str())?;
869
870 let columns = mem::take(&mut self.columns);
871 self.columns = columns
872 .into_iter()
873 .zip(names)
874 .map(|(s, name)| {
875 let mut s = s;
876 s.rename(name.clone());
877 s
878 })
879 .collect();
880 self.clear_schema();
881 Ok(())
882 }
883
884 /// Get the data types of the columns in the [`DataFrame`].
885 ///
886 /// # Example
887 ///
888 /// ```rust
889 /// # use polars_core::prelude::*;
890 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
891 /// "Fraction" => [0.965, 0.035])?;
892 ///
893 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
894 /// # Ok::<(), PolarsError>(())
895 /// ```
896 pub fn dtypes(&self) -> Vec<DataType> {
897 self.columns.iter().map(|s| s.dtype().clone()).collect()
898 }
899
900 pub(crate) fn first_series_column(&self) -> Option<&Series> {
901 self.columns.iter().find_map(|col| col.as_series())
902 }
903
904 /// The number of chunks for the first column.
905 pub fn first_col_n_chunks(&self) -> usize {
906 match self.first_series_column() {
907 None if self.columns.is_empty() => 0,
908 None => 1,
909 Some(s) => s.n_chunks(),
910 }
911 }
912
913 /// The highest number of chunks for any column.
914 pub fn max_n_chunks(&self) -> usize {
915 self.columns
916 .iter()
917 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
918 .max()
919 .unwrap_or(0)
920 }
921
922 /// Get a reference to the schema fields of the [`DataFrame`].
923 ///
924 /// # Example
925 ///
926 /// ```rust
927 /// # use polars_core::prelude::*;
928 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
929 /// "Fraction" => [0.708, 0.292])?;
930 ///
931 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
932 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
933 ///
934 /// assert_eq!(earth.fields(), &[f1, f2]);
935 /// # Ok::<(), PolarsError>(())
936 /// ```
937 pub fn fields(&self) -> Vec<Field> {
938 self.columns
939 .iter()
940 .map(|s| s.field().into_owned())
941 .collect()
942 }
943
944 /// Get (height, width) of the [`DataFrame`].
945 ///
946 /// # Example
947 ///
948 /// ```rust
949 /// # use polars_core::prelude::*;
950 /// let df0: DataFrame = DataFrame::default();
951 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
952 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
953 /// "2" => [1, 2, 3, 4, 5])?;
954 ///
955 /// assert_eq!(df0.shape(), (0 ,0));
956 /// assert_eq!(df1.shape(), (5, 1));
957 /// assert_eq!(df2.shape(), (5, 2));
958 /// # Ok::<(), PolarsError>(())
959 /// ```
960 pub fn shape(&self) -> (usize, usize) {
961 (self.height, self.columns.len())
962 }
963
964 /// Get the width of the [`DataFrame`] which is the number of columns.
965 ///
966 /// # Example
967 ///
968 /// ```rust
969 /// # use polars_core::prelude::*;
970 /// let df0: DataFrame = DataFrame::default();
971 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
972 /// let df2: DataFrame = df!("Series 1" => [0; 0],
973 /// "Series 2" => [0; 0])?;
974 ///
975 /// assert_eq!(df0.width(), 0);
976 /// assert_eq!(df1.width(), 1);
977 /// assert_eq!(df2.width(), 2);
978 /// # Ok::<(), PolarsError>(())
979 /// ```
980 pub fn width(&self) -> usize {
981 self.columns.len()
982 }
983
/// Get the height of the [`DataFrame`] which is the number of rows.
///
/// # Example
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df0: DataFrame = DataFrame::default();
/// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
/// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
///
/// assert_eq!(df0.height(), 0);
/// assert_eq!(df1.height(), 2);
/// assert_eq!(df2.height(), 5);
/// # Ok::<(), PolarsError>(())
/// ```
pub fn height(&self) -> usize {
    self.height
}
1002
1003 /// Returns the size as number of rows * number of columns
1004 pub fn size(&self) -> usize {
1005 let s = self.shape();
1006 s.0 * s.1
1007 }
1008
1009 /// Returns `true` if the [`DataFrame`] contains no rows.
1010 ///
1011 /// # Example
1012 ///
1013 /// ```rust
1014 /// # use polars_core::prelude::*;
1015 /// let df1: DataFrame = DataFrame::default();
1016 /// assert!(df1.is_empty());
1017 ///
1018 /// let df2: DataFrame = df!("First name" => ["Forever"],
1019 /// "Last name" => ["Alone"])?;
1020 /// assert!(!df2.is_empty());
1021 /// # Ok::<(), PolarsError>(())
1022 /// ```
1023 pub fn is_empty(&self) -> bool {
1024 matches!(self.shape(), (0, _) | (_, 0))
1025 }
1026
1027 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1028 ///
1029 /// # Safety
1030 ///
1031 /// This needs to be equal to the length of all the columns.
1032 pub unsafe fn set_height(&mut self, height: usize) {
1033 self.height = height;
1034 }
1035
1036 /// Add multiple [`Series`] to a [`DataFrame`].
1037 /// The added `Series` are required to have the same length.
1038 ///
1039 /// # Example
1040 ///
1041 /// ```rust
1042 /// # use polars_core::prelude::*;
1043 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1044 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1045 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1046 ///
1047 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1048 /// assert_eq!(df2.shape(), (3, 3));
1049 /// println!("{}", df2);
1050 /// # Ok::<(), PolarsError>(())
1051 /// ```
1052 ///
1053 /// Output:
1054 ///
1055 /// ```text
1056 /// shape: (3, 3)
1057 /// +---------+--------+----------+
1058 /// | Element | Proton | Electron |
1059 /// | --- | --- | --- |
1060 /// | str | i32 | i32 |
1061 /// +=========+========+==========+
1062 /// | Copper | 29 | 29 |
1063 /// +---------+--------+----------+
1064 /// | Silver | 47 | 47 |
1065 /// +---------+--------+----------+
1066 /// | Gold | 79 | 79 |
1067 /// +---------+--------+----------+
1068 /// ```
1069 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1070 let mut new_cols = self.columns.clone();
1071 new_cols.extend_from_slice(columns);
1072 DataFrame::new(new_cols)
1073 }
1074
1075 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1076 ///
1077 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1078 ///
1079 /// # Example
1080 ///
1081 /// ```rust
1082 /// # use polars_core::prelude::*;
1083 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1084 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1085 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1086 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1087 ///
1088 /// let df3: DataFrame = df1.vstack(&df2)?;
1089 ///
1090 /// assert_eq!(df3.shape(), (5, 2));
1091 /// println!("{}", df3);
1092 /// # Ok::<(), PolarsError>(())
1093 /// ```
1094 ///
1095 /// Output:
1096 ///
1097 /// ```text
1098 /// shape: (5, 2)
1099 /// +-----------+-------------------+
1100 /// | Element | Melting Point (K) |
1101 /// | --- | --- |
1102 /// | str | f64 |
1103 /// +===========+===================+
1104 /// | Copper | 1357.77 |
1105 /// +-----------+-------------------+
1106 /// | Silver | 1234.93 |
1107 /// +-----------+-------------------+
1108 /// | Gold | 1337.33 |
1109 /// +-----------+-------------------+
1110 /// | Platinum | 2041.4 |
1111 /// +-----------+-------------------+
1112 /// | Palladium | 1828.05 |
1113 /// +-----------+-------------------+
1114 /// ```
1115 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1116 let mut df = self.clone();
1117 df.vstack_mut(other)?;
1118 Ok(df)
1119 }
1120
1121 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1122 ///
1123 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1124 ///
1125 /// # Example
1126 ///
1127 /// ```rust
1128 /// # use polars_core::prelude::*;
1129 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1130 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1131 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1132 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1133 ///
1134 /// df1.vstack_mut(&df2)?;
1135 ///
1136 /// assert_eq!(df1.shape(), (5, 2));
1137 /// println!("{}", df1);
1138 /// # Ok::<(), PolarsError>(())
1139 /// ```
1140 ///
1141 /// Output:
1142 ///
1143 /// ```text
1144 /// shape: (5, 2)
1145 /// +-----------+-------------------+
1146 /// | Element | Melting Point (K) |
1147 /// | --- | --- |
1148 /// | str | f64 |
1149 /// +===========+===================+
1150 /// | Copper | 1357.77 |
1151 /// +-----------+-------------------+
1152 /// | Silver | 1234.93 |
1153 /// +-----------+-------------------+
1154 /// | Gold | 1337.33 |
1155 /// +-----------+-------------------+
1156 /// | Platinum | 2041.4 |
1157 /// +-----------+-------------------+
1158 /// | Palladium | 1828.05 |
1159 /// +-----------+-------------------+
1160 /// ```
1161 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1162 if self.width() != other.width() {
1163 polars_ensure!(
1164 self.width() == 0,
1165 ShapeMismatch:
1166 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1167 self.width(), other.width(),
1168 );
1169 self.columns.clone_from(&other.columns);
1170 self.height = other.height;
1171 return Ok(self);
1172 }
1173
1174 self.columns
1175 .iter_mut()
1176 .zip(other.columns.iter())
1177 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1178 ensure_can_extend(&*left, right)?;
1179 left.append(right).map_err(|e| {
1180 e.context(format!("failed to vstack column '{}'", right.name()).into())
1181 })?;
1182 Ok(())
1183 })?;
1184 self.height += other.height;
1185 Ok(self)
1186 }
1187
1188 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1189 if self.width() != other.width() {
1190 polars_ensure!(
1191 self.width() == 0,
1192 ShapeMismatch:
1193 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1194 self.width(), other.width(),
1195 );
1196 self.columns = other.columns;
1197 self.height = other.height;
1198 return Ok(self);
1199 }
1200
1201 self.columns
1202 .iter_mut()
1203 .zip(other.columns.into_iter())
1204 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1205 ensure_can_extend(&*left, &right)?;
1206 let right_name = right.name().clone();
1207 left.append_owned(right).map_err(|e| {
1208 e.context(format!("failed to vstack column '{right_name}'").into())
1209 })?;
1210 Ok(())
1211 })?;
1212 self.height += other.height;
1213 Ok(self)
1214 }
1215
1216 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1217 ///
1218 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1219 ///
1220 /// # Panics
1221 /// Panics if the schema's don't match.
1222 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1223 self.columns
1224 .iter_mut()
1225 .zip(other.columns.iter())
1226 .for_each(|(left, right)| {
1227 left.append(right)
1228 .map_err(|e| {
1229 e.context(format!("failed to vstack column '{}'", right.name()).into())
1230 })
1231 .expect("should not fail");
1232 });
1233 self.height += other.height;
1234 }
1235
1236 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1237 ///
1238 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1239 ///
1240 /// # Panics
1241 /// Panics if the schema's don't match.
1242 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1243 self.columns
1244 .iter_mut()
1245 .zip(other.columns)
1246 .for_each(|(left, right)| {
1247 left.append_owned(right).expect("should not fail");
1248 });
1249 self.height += other.height;
1250 }
1251
1252 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1253 ///
1254 /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1255 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1256 ///
1257 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1258 /// and thus will yield faster queries.
1259 ///
1260 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1261 /// online operations where you add `n` rows and rerun a query.
1262 ///
1263 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1264 /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
1265 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1266 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1267 polars_ensure!(
1268 self.width() == other.width(),
1269 ShapeMismatch:
1270 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1271 self.width(), other.width(),
1272 );
1273
1274 self.columns
1275 .iter_mut()
1276 .zip(other.columns.iter())
1277 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1278 ensure_can_extend(&*left, right)?;
1279 left.extend(right).map_err(|e| {
1280 e.context(format!("failed to extend column '{}'", right.name()).into())
1281 })?;
1282 Ok(())
1283 })?;
1284 self.height += other.height;
1285 self.clear_schema();
1286 Ok(())
1287 }
1288
1289 /// Remove a column by name and return the column removed.
1290 ///
1291 /// # Example
1292 ///
1293 /// ```rust
1294 /// # use polars_core::prelude::*;
1295 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1296 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1297 ///
1298 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1299 /// assert!(s1.is_err());
1300 ///
1301 /// let s2: Column = df.drop_in_place("Animal")?;
1302 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1303 /// # Ok::<(), PolarsError>(())
1304 /// ```
1305 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1306 let idx = self.check_name_to_idx(name)?;
1307 self.clear_schema();
1308 Ok(self.columns.remove(idx))
1309 }
1310
1311 /// Return a new [`DataFrame`] where all null values are dropped.
1312 ///
1313 /// # Example
1314 ///
1315 /// ```no_run
1316 /// # use polars_core::prelude::*;
1317 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1318 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1319 /// assert_eq!(df1.shape(), (3, 2));
1320 ///
1321 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1322 /// assert_eq!(df2.shape(), (1, 2));
1323 /// println!("{}", df2);
1324 /// # Ok::<(), PolarsError>(())
1325 /// ```
1326 ///
1327 /// Output:
1328 ///
1329 /// ```text
1330 /// shape: (1, 2)
1331 /// +---------+---------------------+
1332 /// | Country | Tax revenue (% GDP) |
1333 /// | --- | --- |
1334 /// | str | f64 |
1335 /// +=========+=====================+
1336 /// | Malta | 32.7 |
1337 /// +---------+---------------------+
1338 /// ```
1339 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1340 where
1341 for<'a> &'a S: Into<PlSmallStr>,
1342 {
1343 if let Some(v) = subset {
1344 let v = self.select_columns(v)?;
1345 self._drop_nulls_impl(v.as_slice())
1346 } else {
1347 self._drop_nulls_impl(self.columns.as_slice())
1348 }
1349 }
1350
1351 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1352 // fast path for no nulls in df
1353 if subset.iter().all(|s| !s.has_nulls()) {
1354 return Ok(self.clone());
1355 }
1356
1357 let mut iter = subset.iter();
1358
1359 let mask = iter
1360 .next()
1361 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1362 let mut mask = mask.is_not_null();
1363
1364 for c in iter {
1365 mask = mask & c.is_not_null();
1366 }
1367 self.filter(&mask)
1368 }
1369
1370 /// Drop a column by name.
1371 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1372 /// the current one in place.
1373 ///
1374 /// # Example
1375 ///
1376 /// ```rust
1377 /// # use polars_core::prelude::*;
1378 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1379 /// let df2: DataFrame = df1.drop("Ray type")?;
1380 ///
1381 /// assert!(df2.is_empty());
1382 /// # Ok::<(), PolarsError>(())
1383 /// ```
1384 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1385 let idx = self.check_name_to_idx(name)?;
1386 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1387
1388 self.columns.iter().enumerate().for_each(|(i, s)| {
1389 if i != idx {
1390 new_cols.push(s.clone())
1391 }
1392 });
1393
1394 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1395 }
1396
1397 /// Drop columns that are in `names`.
1398 pub fn drop_many<I, S>(&self, names: I) -> Self
1399 where
1400 I: IntoIterator<Item = S>,
1401 S: Into<PlSmallStr>,
1402 {
1403 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1404 self.drop_many_amortized(&names)
1405 }
1406
1407 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1408 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1409 if names.is_empty() {
1410 return self.clone();
1411 }
1412 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1413 self.columns.iter().for_each(|s| {
1414 if !names.contains(s.name()) {
1415 new_cols.push(s.clone())
1416 }
1417 });
1418
1419 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1420 }
1421
1422 /// Insert a new column at a given index without checking for duplicates.
1423 /// This can leave the [`DataFrame`] at an invalid state
1424 fn insert_column_no_name_check(
1425 &mut self,
1426 index: usize,
1427 column: Column,
1428 ) -> PolarsResult<&mut Self> {
1429 polars_ensure!(
1430 self.width() == 0 || column.len() == self.height(),
1431 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1432 column.len(), self.height(),
1433 );
1434
1435 if self.width() == 0 {
1436 self.height = column.len();
1437 }
1438
1439 self.columns.insert(index, column);
1440 self.clear_schema();
1441 Ok(self)
1442 }
1443
1444 /// Insert a new column at a given index.
1445 pub fn insert_column<S: IntoColumn>(
1446 &mut self,
1447 index: usize,
1448 column: S,
1449 ) -> PolarsResult<&mut Self> {
1450 let column = column.into_column();
1451 self.check_already_present(column.name().as_str())?;
1452 self.insert_column_no_name_check(index, column)
1453 }
1454
1455 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1456 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1457 self.replace_column(idx, column)?;
1458 } else {
1459 if self.width() == 0 {
1460 self.height = column.len();
1461 }
1462
1463 self.columns.push(column);
1464 self.clear_schema();
1465 }
1466 Ok(())
1467 }
1468
1469 /// Add a new column to this [`DataFrame`] or replace an existing one.
1470 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1471 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1472 let height = df.height();
1473 if column.len() == 1 && height > 1 {
1474 column = column.new_from_index(0, height);
1475 }
1476
1477 if column.len() == height || df.get_columns().is_empty() {
1478 df.add_column_by_search(column)?;
1479 Ok(df)
1480 }
1481 // special case for literals
1482 else if height == 0 && column.len() == 1 {
1483 let s = column.clear();
1484 df.add_column_by_search(s)?;
1485 Ok(df)
1486 } else {
1487 polars_bail!(
1488 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1489 column.len(), height,
1490 );
1491 }
1492 }
1493 let column = column.into_column();
1494 inner(self, column)
1495 }
1496
1497 /// Adds a column to the [`DataFrame`] without doing any checks
1498 /// on length or duplicates.
1499 ///
1500 /// # Safety
1501 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1502 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1503 debug_assert!(self.width() == 0 || self.height() == column.len());
1504 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1505
1506 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1507 // properly for `width` == 0.
1508 if self.width() == 0 {
1509 unsafe { self.set_height(column.len()) };
1510 }
1511 unsafe { self.get_columns_mut() }.push(column);
1512 self.clear_schema();
1513
1514 self
1515 }
1516
1517 // Note: Schema can be both input or output_schema
1518 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1519 let name = c.name();
1520 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1521 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1522 // Given schema is output_schema and we can push.
1523 if idx == self.columns.len() {
1524 if self.width() == 0 {
1525 self.height = c.len();
1526 }
1527
1528 self.columns.push(c);
1529 self.clear_schema();
1530 }
1531 // Schema is incorrect fallback to search
1532 else {
1533 debug_assert!(false);
1534 self.add_column_by_search(c)?;
1535 }
1536 } else {
1537 self.replace_column(idx, c)?;
1538 }
1539 } else {
1540 if self.width() == 0 {
1541 self.height = c.len();
1542 }
1543
1544 self.columns.push(c);
1545 self.clear_schema();
1546 }
1547
1548 Ok(())
1549 }
1550
1551 // Note: Schema can be both input or output_schema
1552 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1553 for (i, s) in series.into_iter().enumerate() {
1554 // we need to branch here
1555 // because users can add multiple columns with the same name
1556 if i == 0 || schema.get(s.name().as_str()).is_some() {
1557 self.with_column_and_schema(s.into_column(), schema)?;
1558 } else {
1559 self.with_column(s.clone().into_column())?;
1560 }
1561 }
1562 Ok(())
1563 }
1564
1565 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1566 for (i, s) in columns.into_iter().enumerate() {
1567 // we need to branch here
1568 // because users can add multiple columns with the same name
1569 if i == 0 || schema.get(s.name().as_str()).is_some() {
1570 self.with_column_and_schema(s, schema)?;
1571 } else {
1572 self.with_column(s.clone())?;
1573 }
1574 }
1575
1576 Ok(())
1577 }
1578
1579 /// Add a new column to this [`DataFrame`] or replace an existing one.
1580 /// Uses an existing schema to amortize lookups.
1581 /// If the schema is incorrect, we will fallback to linear search.
1582 ///
1583 /// Note: Schema can be both input or output_schema
1584 pub fn with_column_and_schema<C: IntoColumn>(
1585 &mut self,
1586 column: C,
1587 schema: &Schema,
1588 ) -> PolarsResult<&mut Self> {
1589 let mut column = column.into_column();
1590
1591 let height = self.height();
1592 if column.len() == 1 && height > 1 {
1593 column = column.new_from_index(0, height);
1594 }
1595
1596 if column.len() == height || self.columns.is_empty() {
1597 self.add_column_by_schema(column, schema)?;
1598 Ok(self)
1599 }
1600 // special case for literals
1601 else if height == 0 && column.len() == 1 {
1602 let s = column.clear();
1603 self.add_column_by_schema(s, schema)?;
1604 Ok(self)
1605 } else {
1606 polars_bail!(
1607 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1608 column.len(), height,
1609 );
1610 }
1611 }
1612
1613 /// Get a row in the [`DataFrame`]. Beware this is slow.
1614 ///
1615 /// # Example
1616 ///
1617 /// ```
1618 /// # use polars_core::prelude::*;
1619 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1620 /// df.get(idx)
1621 /// }
1622 /// ```
1623 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1624 match self.columns.first() {
1625 Some(s) => {
1626 if s.len() <= idx {
1627 return None;
1628 }
1629 },
1630 None => return None,
1631 }
1632 // SAFETY: we just checked bounds
1633 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1634 }
1635
1636 /// Select a [`Series`] by index.
1637 ///
1638 /// # Example
1639 ///
1640 /// ```rust
1641 /// # use polars_core::prelude::*;
1642 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1643 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1644 ///
1645 /// let s1: Option<&Column> = df.select_at_idx(0);
1646 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1647 ///
1648 /// assert_eq!(s1, Some(&s2));
1649 /// # Ok::<(), PolarsError>(())
1650 /// ```
1651 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1652 self.columns.get(idx)
1653 }
1654
1655 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1656 ///
1657 /// # Examples
1658 ///
1659 /// ```rust
1660 /// # use polars_core::prelude::*;
1661 /// let df = df! {
1662 /// "0" => [0, 0, 0],
1663 /// "1" => [1, 1, 1],
1664 /// "2" => [2, 2, 2]
1665 /// }?;
1666 ///
1667 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1668 /// assert!(df.equals(&df.select_by_range(..)?));
1669 /// # Ok::<(), PolarsError>(())
1670 /// ```
1671 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1672 where
1673 R: ops::RangeBounds<usize>,
1674 {
1675 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1676 // because it is the nightly feature. We should change here if this function were stable.
1677 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1678 where
1679 R: ops::RangeBounds<usize>,
1680 {
1681 let len = bounds.end;
1682
1683 let start: ops::Bound<&usize> = range.start_bound();
1684 let start = match start {
1685 ops::Bound::Included(&start) => start,
1686 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1687 panic!("attempted to index slice from after maximum usize");
1688 }),
1689 ops::Bound::Unbounded => 0,
1690 };
1691
1692 let end: ops::Bound<&usize> = range.end_bound();
1693 let end = match end {
1694 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1695 panic!("attempted to index slice up to maximum usize");
1696 }),
1697 ops::Bound::Excluded(&end) => end,
1698 ops::Bound::Unbounded => len,
1699 };
1700
1701 if start > end {
1702 panic!("slice index starts at {start} but ends at {end}");
1703 }
1704 if end > len {
1705 panic!("range end index {end} out of range for slice of length {len}",);
1706 }
1707
1708 ops::Range { start, end }
1709 }
1710
1711 let colnames = self.get_column_names_owned();
1712 let range = get_range(range, ..colnames.len());
1713
1714 self._select_impl(&colnames[range])
1715 }
1716
1717 /// Get column index of a [`Series`] by name.
1718 /// # Example
1719 ///
1720 /// ```rust
1721 /// # use polars_core::prelude::*;
1722 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1723 /// "Health" => [100, 200, 500],
1724 /// "Mana" => [250, 100, 0],
1725 /// "Strength" => [30, 150, 300])?;
1726 ///
1727 /// assert_eq!(df.get_column_index("Name"), Some(0));
1728 /// assert_eq!(df.get_column_index("Health"), Some(1));
1729 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1730 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1731 /// assert_eq!(df.get_column_index("Haste"), None);
1732 /// # Ok::<(), PolarsError>(())
1733 /// ```
1734 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1735 let schema = self.schema();
1736 if let Some(idx) = schema.index_of(name) {
1737 if self
1738 .get_columns()
1739 .get(idx)
1740 .is_some_and(|c| c.name() == name)
1741 {
1742 return Some(idx);
1743 }
1744 }
1745
1746 self.columns.iter().position(|s| s.name().as_str() == name)
1747 }
1748
1749 /// Get column index of a [`Series`] by name.
1750 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1751 self.get_column_index(name)
1752 .ok_or_else(|| polars_err!(col_not_found = name))
1753 }
1754
1755 /// Select a single column by name.
1756 ///
1757 /// # Example
1758 ///
1759 /// ```rust
1760 /// # use polars_core::prelude::*;
1761 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1762 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1763 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1764 ///
1765 /// assert_eq!(df.column("Password")?, &s1);
1766 /// # Ok::<(), PolarsError>(())
1767 /// ```
1768 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1769 let idx = self.try_get_column_index(name)?;
1770 Ok(self.select_at_idx(idx).unwrap())
1771 }
1772
1773 /// Selected multiple columns by name.
1774 ///
1775 /// # Example
1776 ///
1777 /// ```rust
1778 /// # use polars_core::prelude::*;
1779 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1780 /// "Max weight (kg)" => [16.0, 35.89])?;
1781 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1782 ///
1783 /// assert_eq!(&df[0], sv[0]);
1784 /// assert_eq!(&df[1], sv[1]);
1785 /// # Ok::<(), PolarsError>(())
1786 /// ```
1787 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1788 where
1789 I: IntoIterator<Item = S>,
1790 S: AsRef<str>,
1791 {
1792 names
1793 .into_iter()
1794 .map(|name| self.column(name.as_ref()))
1795 .collect()
1796 }
1797
1798 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1799 ///
1800 /// # Examples
1801 ///
1802 /// ```
1803 /// # use polars_core::prelude::*;
1804 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1805 /// df.select(["foo", "bar"])
1806 /// }
1807 /// ```
1808 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1809 where
1810 I: IntoIterator<Item = S>,
1811 S: Into<PlSmallStr>,
1812 {
1813 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1814 self._select_impl(cols.as_slice())
1815 }
1816
1817 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1818 ensure_names_unique(cols, |s| s.as_str())?;
1819 self._select_impl_unchecked(cols)
1820 }
1821
1822 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1823 let selected = self.select_columns_impl(cols)?;
1824 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1825 }
1826
1827 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1828 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1829 where
1830 I: IntoIterator<Item = S>,
1831 S: Into<PlSmallStr>,
1832 {
1833 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1834 self._select_with_schema_impl(&cols, schema, true)
1835 }
1836
1837 /// Select with a known schema without checking for duplicates in `selection`.
1838 /// The schema names must match the column names of this DataFrame.
1839 pub fn select_with_schema_unchecked<I, S>(
1840 &self,
1841 selection: I,
1842 schema: &Schema,
1843 ) -> PolarsResult<Self>
1844 where
1845 I: IntoIterator<Item = S>,
1846 S: Into<PlSmallStr>,
1847 {
1848 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1849 self._select_with_schema_impl(&cols, schema, false)
1850 }
1851
1852 /// * The schema names must match the column names of this DataFrame.
1853 pub fn _select_with_schema_impl(
1854 &self,
1855 cols: &[PlSmallStr],
1856 schema: &Schema,
1857 check_duplicates: bool,
1858 ) -> PolarsResult<Self> {
1859 if check_duplicates {
1860 ensure_names_unique(cols, |s| s.as_str())?;
1861 }
1862
1863 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1864 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1865 }
1866
1867 /// A non generic implementation to reduce compiler bloat.
1868 fn select_columns_impl_with_schema(
1869 &self,
1870 cols: &[PlSmallStr],
1871 schema: &Schema,
1872 ) -> PolarsResult<Vec<Column>> {
1873 if cfg!(debug_assertions) {
1874 ensure_matching_schema_names(schema, self.schema())?;
1875 }
1876
1877 cols.iter()
1878 .map(|name| {
1879 let index = schema.try_get_full(name.as_str())?.0;
1880 Ok(self.columns[index].clone())
1881 })
1882 .collect()
1883 }
1884
1885 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1886 where
1887 I: IntoIterator<Item = S>,
1888 S: Into<PlSmallStr>,
1889 {
1890 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1891 self.select_physical_impl(&cols)
1892 }
1893
1894 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1895 ensure_names_unique(cols, |s| s.as_str())?;
1896 let selected = self.select_columns_physical_impl(cols)?;
1897 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1898 }
1899
1900 pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1901 let from = self.schema();
1902 let columns = to
1903 .iter_names()
1904 .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1905 .collect::<PolarsResult<Vec<_>>>()?;
1906 let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1907 df.cached_schema = to.into();
1908 Ok(df)
1909 }
1910
1911 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1912 ///
1913 /// # Example
1914 ///
1915 /// ```rust
1916 /// # use polars_core::prelude::*;
1917 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1918 /// "Carbon" => [1, 2, 3],
1919 /// "Hydrogen" => [4, 6, 8])?;
1920 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1921 ///
1922 /// assert_eq!(df["Carbon"], sv[0]);
1923 /// assert_eq!(df["Hydrogen"], sv[1]);
1924 /// # Ok::<(), PolarsError>(())
1925 /// ```
1926 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1927 let cols = selection.into_vec();
1928 self.select_columns_impl(&cols)
1929 }
1930
1931 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1932 self.columns
1933 .iter()
1934 .enumerate()
1935 .map(|(i, s)| (s.name().as_str(), i))
1936 .collect()
1937 }
1938
1939 /// A non generic implementation to reduce compiler bloat.
1940 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1941 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1942 let name_to_idx = self._names_to_idx_map();
1943 cols.iter()
1944 .map(|name| {
1945 let idx = *name_to_idx
1946 .get(name.as_str())
1947 .ok_or_else(|| polars_err!(col_not_found = name))?;
1948 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1949 })
1950 .collect::<PolarsResult<Vec<_>>>()?
1951 } else {
1952 cols.iter()
1953 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1954 .collect::<PolarsResult<Vec<_>>>()?
1955 };
1956
1957 Ok(selected)
1958 }
1959
1960 /// A non generic implementation to reduce compiler bloat.
1961 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1962 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1963 // we hash, because there are user that having millions of columns.
1964 // # https://github.com/pola-rs/polars/issues/1023
1965 let name_to_idx = self._names_to_idx_map();
1966
1967 cols.iter()
1968 .map(|name| {
1969 let idx = *name_to_idx
1970 .get(name.as_str())
1971 .ok_or_else(|| polars_err!(col_not_found = name))?;
1972 Ok(self.select_at_idx(idx).unwrap().clone())
1973 })
1974 .collect::<PolarsResult<Vec<_>>>()?
1975 } else {
1976 cols.iter()
1977 .map(|c| self.column(c.as_str()).cloned())
1978 .collect::<PolarsResult<Vec<_>>>()?
1979 };
1980
1981 Ok(selected)
1982 }
1983
1984 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1985 // If there is a filtered column just see how many columns there are left.
1986 if let Some(fst) = filtered.first() {
1987 return fst.len();
1988 }
1989
1990 // Otherwise, count the number of values that would be filtered and return that height.
1991 let num_trues = mask.num_trues();
1992 if mask.len() == self.height() {
1993 num_trues
1994 } else {
1995 // This is for broadcasting masks
1996 debug_assert!(num_trues == 0 || num_trues == 1);
1997 self.height() * num_trues
1998 }
1999 }
2000
2001 /// Take the [`DataFrame`] rows by a boolean mask.
2002 ///
2003 /// # Example
2004 ///
2005 /// ```
2006 /// # use polars_core::prelude::*;
2007 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2008 /// let mask = df.column("sepal_width")?.is_not_null();
2009 /// df.filter(&mask)
2010 /// }
2011 /// ```
2012 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2013 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2014 let height = self.filter_height(&new_col, mask);
2015
2016 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2017 }
2018
2019 /// Same as `filter` but does not parallelize.
2020 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2021 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2022 let height = self.filter_height(&new_col, mask);
2023
2024 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2025 }
2026
2027 /// Take [`DataFrame`] rows by index values.
2028 ///
2029 /// # Example
2030 ///
2031 /// ```
2032 /// # use polars_core::prelude::*;
2033 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2034 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2035 /// df.take(&idx)
2036 /// }
2037 /// ```
2038 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2039 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2040
2041 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2042 }
2043
2044 /// # Safety
2045 /// The indices must be in-bounds.
2046 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2047 self.take_unchecked_impl(idx, true)
2048 }
2049
    /// Gather rows by index without bounds checks, optionally using the thread pool.
    ///
    /// With `allow_threads` set and more than one pool thread available, the columns
    /// are gathered in parallel. When the pool additionally has more threads than the
    /// frame has columns and the frame is long enough, the gather of every single
    /// column is split into index strides that are processed in parallel and appended
    /// back together. In all other cases the columns are gathered sequentially.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // Only split within a column when parallelizing over columns alone
                // would leave threads idle.
                if POOL.current_num_threads() > self.width() {
                    // One stride per thread, but never fewer than 256 indices per stride.
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    // NOTE(review): this compares `self.len()` (not `idx.len()`) against
                    // the stride — confirm that is the intended size heuristic.
                    if self.len() / stride >= 2 {
                        self._apply_columns_par(&|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            // Gather every stride of `idx` separately and append the
                            // partial results, starting from an empty column.
                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        self._apply_columns_par(&|c| c.take_unchecked(idx))
                    }
                } else {
                    self._apply_columns_par(&|c| c.take_unchecked(idx))
                }
            })
        } else {
            // Threads not allowed or not available: gather column by column.
            self._apply_columns(&|s| s.take_unchecked(idx))
        };
        // The result has one row per index.
        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
    }
2090
2091 /// # Safety
2092 /// The indices must be in-bounds.
2093 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2094 self.take_slice_unchecked_impl(idx, true)
2095 }
2096
    /// Gather rows by a slice of indices without bounds checks, optionally using the
    /// thread pool.
    ///
    /// With `allow_threads` set and more than one pool thread available, the columns
    /// are gathered in parallel. When the pool additionally has more threads than the
    /// frame has columns and the frame is long enough, the gather of every single
    /// column is split into index strides that are processed in parallel and appended
    /// back together. In all other cases the columns are gathered sequentially.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        let cols = if allow_threads && POOL.current_num_threads() > 1 {
            POOL.install(|| {
                // Only split within a column when parallelizing over columns alone
                // would leave threads idle.
                if POOL.current_num_threads() > self.width() {
                    // One stride per thread, but never fewer than 256 indices per stride.
                    let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
                    // NOTE(review): this compares `self.len()` (not `idx.len()`) against
                    // the stride — confirm that is the intended size heuristic.
                    if self.len() / stride >= 2 {
                        self._apply_columns_par(&|c| {
                            // Nested types initiate a rechunk in their take_unchecked implementation.
                            // If we do not rechunk, it will result in rechunk storms downstream.
                            let c = if c.dtype().is_nested() {
                                &c.rechunk()
                            } else {
                                c
                            };

                            (0..idx.len().div_ceil(stride))
                                .into_par_iter()
                                .map(|i| {
                                    // Sub-slice for stride `i`; the last one may be shorter.
                                    let idx = &idx[i * stride..];
                                    let idx = &idx[..idx.len().min(stride)];
                                    c.take_slice_unchecked(idx)
                                })
                                // Append the partial results, starting from an empty column.
                                .reduce(
                                    || Column::new_empty(c.name().clone(), c.dtype()),
                                    |mut a, b| {
                                        a.append_owned(b).unwrap();
                                        a
                                    },
                                )
                        })
                    } else {
                        self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
                    }
                } else {
                    self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
                }
            })
        } else {
            // Threads not allowed or not available: gather column by column.
            self._apply_columns(&|s| s.take_slice_unchecked(idx))
        };
        // The result has one row per index.
        unsafe { DataFrame::new_no_checks(idx.len(), cols) }
    }
2141
2142 /// Rename a column in the [`DataFrame`].
2143 ///
2144 /// Should not be called in a loop as that can lead to quadratic behavior.
2145 ///
2146 /// # Example
2147 ///
2148 /// ```
2149 /// # use polars_core::prelude::*;
2150 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2151 /// let original_name = "foo";
2152 /// let new_name = "bar";
2153 /// df.rename(original_name, new_name.into())
2154 /// }
2155 /// ```
2156 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2157 if column == name.as_str() {
2158 return Ok(self);
2159 }
2160 polars_ensure!(
2161 !self.schema().contains(&name),
2162 Duplicate: "column rename attempted with already existing name \"{name}\""
2163 );
2164
2165 self.get_column_index(column)
2166 .and_then(|idx| self.columns.get_mut(idx))
2167 .ok_or_else(|| polars_err!(col_not_found = column))
2168 .map(|c| c.rename(name))?;
2169 self.clear_schema();
2170
2171 Ok(self)
2172 }
2173
2174 pub fn rename_many<'a>(
2175 &mut self,
2176 renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
2177 ) -> PolarsResult<&mut Self> {
2178 let mut schema = self.schema().as_ref().clone();
2179 self.clear_schema();
2180
2181 for (from, to) in renames {
2182 if from == to.as_str() {
2183 continue;
2184 }
2185
2186 polars_ensure!(
2187 !schema.contains(&to),
2188 Duplicate: "column rename attempted with already existing name \"{to}\""
2189 );
2190
2191 match schema.get_full(from) {
2192 None => polars_bail!(col_not_found = from),
2193 Some((idx, _, _)) => {
2194 let (n, _) = schema.get_at_index_mut(idx).unwrap();
2195 *n = to.clone();
2196 self.columns.get_mut(idx).unwrap().rename(to);
2197 },
2198 }
2199 }
2200
2201 self.cached_schema = OnceLock::from(Arc::new(schema));
2202 Ok(self)
2203 }
2204
2205 /// Sort [`DataFrame`] in place.
2206 ///
2207 /// See [`DataFrame::sort`] for more instruction.
2208 pub fn sort_in_place(
2209 &mut self,
2210 by: impl IntoVec<PlSmallStr>,
2211 sort_options: SortMultipleOptions,
2212 ) -> PolarsResult<&mut Self> {
2213 let by_column = self.select_columns(by)?;
2214 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2215 Ok(self)
2216 }
2217
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// * `by_column` - the already evaluated columns to sort by.
    /// * `sort_options` - per-column `descending` / `nulls_last` flags plus the
    ///   `multithreaded`, `maintain_order` and `limit` settings.
    /// * `slice` - optional `(offset, length)` window applied to the sorted result.
    ///
    /// Returns the sorted (and possibly sliced) frame, or the error of the underlying
    /// sort.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // Note that the by_column argument also contains evaluated expressions from
        // polars-lazy that may not even be present in this dataframe. Therefore,
        // when we try to set the first column as sorted, we ignore the error as
        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };
        // An empty frame only needs its sorted flag set.
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A slice that starts at row 0 and is shorter than the frame is dispatched to
        // `bottom_k_impl` instead of sorting everything.
        if let Some((0, k)) = slice {
            if k < self.len() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early
        // We can do so when there is only one column to sort by, for multiple columns
        // it will be complicated to do so
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If null count is 0 then nulls_last doesn't matter
            // Safe to get value at last position since the dataframe is not empty (taken care above)
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
        let allow_threads = sort_options.multithreaded;

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.as_single_chunk_par();
        // Compute the row indices that put the frame into sorted order.
        let mut take = match (by_column.len(), has_nested) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // fast path for a frame with a single series
                // no need to compute the sort indices and then take by these indices
                // simply sort and return as frame
                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            // Multiple sort columns, or a nested sort column.
            _ => arg_sort(&by_column, sort_options)?,
        };

        // Slicing the indices first means only the requested rows are gathered.
        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
        set_sorted(&mut df);
        Ok(df)
    }
2341
2342 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2343 ///
2344 /// This dataframe does not necessarily have a specified schema and may be changed at any
2345 /// point. It is primarily used for debugging.
2346 pub fn _to_metadata(&self) -> DataFrame {
2347 let num_columns = self.columns.len();
2348
2349 let mut column_names =
2350 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2351 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2352 let mut sorted_asc_ca =
2353 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2354 let mut sorted_dsc_ca =
2355 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2356 let mut fast_explode_list_ca =
2357 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2358 let mut materialized_at_ca =
2359 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2360
2361 for col in &self.columns {
2362 let flags = col.get_flags();
2363
2364 let (repr, materialized_at) = match col {
2365 Column::Series(s) => ("series", s.materialized_at()),
2366 Column::Scalar(_) => ("scalar", None),
2367 };
2368 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2369 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2370 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2371
2372 column_names.append_value(col.name().clone());
2373 repr_ca.append_value(repr);
2374 sorted_asc_ca.append_value(sorted_asc);
2375 sorted_dsc_ca.append_value(sorted_dsc);
2376 fast_explode_list_ca.append_value(fast_explode_list);
2377 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2378 }
2379
2380 unsafe {
2381 DataFrame::new_no_checks(
2382 self.width(),
2383 vec![
2384 column_names.finish().into_column(),
2385 repr_ca.finish().into_column(),
2386 sorted_asc_ca.finish().into_column(),
2387 sorted_dsc_ca.finish().into_column(),
2388 fast_explode_list_ca.finish().into_column(),
2389 materialized_at_ca.finish().into_column(),
2390 ],
2391 )
2392 }
2393 }
2394
2395 /// Return a sorted clone of this [`DataFrame`].
2396 ///
2397 /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2398 /// # Example
2399 ///
2400 /// Sort by a single column with default options:
2401 /// ```
2402 /// # use polars_core::prelude::*;
2403 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2404 /// df.sort(["sepal_width"], Default::default())
2405 /// }
2406 /// ```
2407 /// Sort by a single column with specific order:
2408 /// ```
2409 /// # use polars_core::prelude::*;
2410 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2411 /// df.sort(
2412 /// ["sepal_width"],
2413 /// SortMultipleOptions::new()
2414 /// .with_order_descending(descending)
2415 /// )
2416 /// }
2417 /// ```
2418 /// Sort by multiple columns with specifying order for each column:
2419 /// ```
2420 /// # use polars_core::prelude::*;
2421 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2422 /// df.sort(
2423 /// ["sepal_width", "sepal_length"],
2424 /// SortMultipleOptions::new()
2425 /// .with_order_descending_multi([false, true])
2426 /// )
2427 /// }
2428 /// ```
2429 /// See [`SortMultipleOptions`] for more options.
2430 ///
2431 /// Also see [`DataFrame::sort_in_place`].
2432 pub fn sort(
2433 &self,
2434 by: impl IntoVec<PlSmallStr>,
2435 sort_options: SortMultipleOptions,
2436 ) -> PolarsResult<Self> {
2437 let mut df = self.clone();
2438 df.sort_in_place(by, sort_options)?;
2439 Ok(df)
2440 }
2441
2442 /// Replace a column with a [`Series`].
2443 ///
2444 /// # Example
2445 ///
2446 /// ```rust
2447 /// # use polars_core::prelude::*;
2448 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2449 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2450 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2451 ///
2452 /// assert!(df.replace("Nation", s.clone()).is_err());
2453 /// assert!(df.replace("Country", s).is_ok());
2454 /// # Ok::<(), PolarsError>(())
2455 /// ```
2456 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2457 self.apply(column, |_| new_col.into_series())
2458 }
2459
2460 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2461 /// is that now the value of `column: &str` determines the name of the column and not the name
2462 /// of the `Series` passed to this method.
2463 pub fn replace_or_add<S: IntoSeries>(
2464 &mut self,
2465 column: PlSmallStr,
2466 new_col: S,
2467 ) -> PolarsResult<&mut Self> {
2468 let mut new_col = new_col.into_series();
2469 new_col.rename(column);
2470 self.with_column(new_col)
2471 }
2472
2473 /// Replace column at index `idx` with a [`Series`].
2474 ///
2475 /// # Example
2476 ///
2477 /// ```ignored
2478 /// # use polars_core::prelude::*;
2479 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2480 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2481 /// let mut df = DataFrame::new(vec![s0, s1])?;
2482 ///
2483 /// // Add 32 to get lowercase ascii values
2484 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2485 /// # Ok::<(), PolarsError>(())
2486 /// ```
2487 pub fn replace_column<C: IntoColumn>(
2488 &mut self,
2489 index: usize,
2490 new_column: C,
2491 ) -> PolarsResult<&mut Self> {
2492 polars_ensure!(
2493 index < self.width(),
2494 ShapeMismatch:
2495 "unable to replace at index {}, the DataFrame has only {} columns",
2496 index, self.width(),
2497 );
2498 let mut new_column = new_column.into_column();
2499 polars_ensure!(
2500 new_column.len() == self.height(),
2501 ShapeMismatch:
2502 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2503 new_column.len(), self.height(),
2504 );
2505 let old_col = &mut self.columns[index];
2506 mem::swap(old_col, &mut new_column);
2507 self.clear_schema();
2508 Ok(self)
2509 }
2510
2511 /// Apply a closure to a column. This is the recommended way to do in place modification.
2512 ///
2513 /// # Example
2514 ///
2515 /// ```rust
2516 /// # use polars_core::prelude::*;
2517 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2518 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2519 /// let mut df = DataFrame::new(vec![s0, s1])?;
2520 ///
2521 /// fn str_to_len(str_val: &Column) -> Column {
2522 /// str_val.str()
2523 /// .unwrap()
2524 /// .into_iter()
2525 /// .map(|opt_name: Option<&str>| {
2526 /// opt_name.map(|name: &str| name.len() as u32)
2527 /// })
2528 /// .collect::<UInt32Chunked>()
2529 /// .into_column()
2530 /// }
2531 ///
2532 /// // Replace the names column by the length of the names.
2533 /// df.apply("names", str_to_len);
2534 /// # Ok::<(), PolarsError>(())
2535 /// ```
2536 /// Results in:
2537 ///
2538 /// ```text
2539 /// +--------+-------+
2540 /// | foo | |
2541 /// | --- | names |
2542 /// | str | u32 |
2543 /// +========+=======+
2544 /// | "ham" | 4 |
2545 /// +--------+-------+
2546 /// | "spam" | 6 |
2547 /// +--------+-------+
2548 /// | "egg" | 3 |
2549 /// +--------+-------+
2550 /// ```
2551 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2552 where
2553 F: FnOnce(&Column) -> C,
2554 C: IntoColumn,
2555 {
2556 let idx = self.check_name_to_idx(name)?;
2557 self.apply_at_idx(idx, f)?;
2558 Ok(self)
2559 }
2560
2561 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2562 /// modification.
2563 ///
2564 /// # Example
2565 ///
2566 /// ```rust
2567 /// # use polars_core::prelude::*;
2568 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2569 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2570 /// let mut df = DataFrame::new(vec![s0, s1])?;
2571 ///
2572 /// // Add 32 to get lowercase ascii values
2573 /// df.apply_at_idx(1, |s| s + 32);
2574 /// # Ok::<(), PolarsError>(())
2575 /// ```
2576 /// Results in:
2577 ///
2578 /// ```text
2579 /// +--------+-------+
2580 /// | foo | ascii |
2581 /// | --- | --- |
2582 /// | str | i32 |
2583 /// +========+=======+
2584 /// | "ham" | 102 |
2585 /// +--------+-------+
2586 /// | "spam" | 111 |
2587 /// +--------+-------+
2588 /// | "egg" | 111 |
2589 /// +--------+-------+
2590 /// ```
2591 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2592 where
2593 F: FnOnce(&Column) -> C,
2594 C: IntoColumn,
2595 {
2596 let df_height = self.height();
2597 let width = self.width();
2598 let col = self.columns.get_mut(idx).ok_or_else(|| {
2599 polars_err!(
2600 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2601 idx, width
2602 )
2603 })?;
2604 let name = col.name().clone();
2605 let dtype_before = col.dtype().clone();
2606 let new_col = f(col).into_column();
2607 match new_col.len() {
2608 1 => {
2609 let new_col = new_col.new_from_index(0, df_height);
2610 let _ = mem::replace(col, new_col);
2611 },
2612 len if (len == df_height) => {
2613 let _ = mem::replace(col, new_col);
2614 },
2615 len => polars_bail!(
2616 ShapeMismatch:
2617 "resulting Series has length {} while the DataFrame has height {}",
2618 len, df_height
2619 ),
2620 }
2621
2622 // make sure the name remains the same after applying the closure
2623 unsafe {
2624 let col = self.columns.get_unchecked_mut(idx);
2625 col.rename(name);
2626
2627 if col.dtype() != &dtype_before {
2628 self.clear_schema();
2629 }
2630 }
2631 Ok(self)
2632 }
2633
2634 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2635 /// modification.
2636 ///
2637 /// # Example
2638 ///
2639 /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
2640 ///
2641 /// ```rust
2642 /// # use polars_core::prelude::*;
2643 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2644 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2645 /// let mut df = DataFrame::new(vec![s0, s1])?;
2646 ///
2647 /// let idx = vec![0, 1, 4];
2648 ///
2649 /// df.try_apply("foo", |c| {
2650 /// c.str()?
2651 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2652 /// });
2653 /// # Ok::<(), PolarsError>(())
2654 /// ```
2655 /// Results in:
2656 ///
2657 /// ```text
2658 /// +---------------------+--------+
2659 /// | foo | values |
2660 /// | --- | --- |
2661 /// | str | i32 |
2662 /// +=====================+========+
2663 /// | "ham-is-modified" | 1 |
2664 /// +---------------------+--------+
2665 /// | "spam-is-modified" | 2 |
2666 /// +---------------------+--------+
2667 /// | "egg" | 3 |
2668 /// +---------------------+--------+
2669 /// | "bacon" | 4 |
2670 /// +---------------------+--------+
2671 /// | "quack-is-modified" | 5 |
2672 /// +---------------------+--------+
2673 /// ```
2674 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2675 where
2676 F: FnOnce(&Column) -> PolarsResult<C>,
2677 C: IntoColumn,
2678 {
2679 let width = self.width();
2680 let col = self.columns.get_mut(idx).ok_or_else(|| {
2681 polars_err!(
2682 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2683 idx, width
2684 )
2685 })?;
2686 let name = col.name().clone();
2687
2688 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2689
2690 // make sure the name remains the same after applying the closure
2691 unsafe {
2692 let col = self.columns.get_unchecked_mut(idx);
2693 col.rename(name);
2694 }
2695 Ok(self)
2696 }
2697
2698 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2699 /// modification.
2700 ///
2701 /// # Example
2702 ///
2703 /// This is the idiomatic way to replace some values a column of a `DataFrame` given a boolean mask.
2704 ///
2705 /// ```rust
2706 /// # use polars_core::prelude::*;
2707 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2708 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2709 /// let mut df = DataFrame::new(vec![s0, s1])?;
2710 ///
2711 /// // create a mask
2712 /// let values = df.column("values")?.as_materialized_series();
2713 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2714 ///
2715 /// df.try_apply("foo", |c| {
2716 /// c.str()?
2717 /// .set(&mask, Some("not_within_bounds"))
2718 /// });
2719 /// # Ok::<(), PolarsError>(())
2720 /// ```
2721 /// Results in:
2722 ///
2723 /// ```text
2724 /// +---------------------+--------+
2725 /// | foo | values |
2726 /// | --- | --- |
2727 /// | str | i32 |
2728 /// +=====================+========+
2729 /// | "not_within_bounds" | 1 |
2730 /// +---------------------+--------+
2731 /// | "spam" | 2 |
2732 /// +---------------------+--------+
2733 /// | "egg" | 3 |
2734 /// +---------------------+--------+
2735 /// | "bacon" | 4 |
2736 /// +---------------------+--------+
2737 /// | "not_within_bounds" | 5 |
2738 /// +---------------------+--------+
2739 /// ```
2740 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2741 where
2742 F: FnOnce(&Series) -> PolarsResult<C>,
2743 C: IntoColumn,
2744 {
2745 let idx = self.try_get_column_index(column)?;
2746 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2747 }
2748
2749 /// Slice the [`DataFrame`] along the rows.
2750 ///
2751 /// # Example
2752 ///
2753 /// ```rust
2754 /// # use polars_core::prelude::*;
2755 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2756 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2757 /// let sl: DataFrame = df.slice(2, 3);
2758 ///
2759 /// assert_eq!(sl.shape(), (3, 2));
2760 /// println!("{}", sl);
2761 /// # Ok::<(), PolarsError>(())
2762 /// ```
2763 /// Output:
2764 /// ```text
2765 /// shape: (3, 2)
2766 /// +-------+-------+
2767 /// | Fruit | Color |
2768 /// | --- | --- |
2769 /// | str | str |
2770 /// +=======+=======+
2771 /// | Grape | White |
2772 /// +-------+-------+
2773 /// | Fig | White |
2774 /// +-------+-------+
2775 /// | Fig | Red |
2776 /// +-------+-------+
2777 /// ```
2778 #[must_use]
2779 pub fn slice(&self, offset: i64, length: usize) -> Self {
2780 if offset == 0 && length == self.height() {
2781 return self.clone();
2782 }
2783 if length == 0 {
2784 return self.clear();
2785 }
2786 let col = self
2787 .columns
2788 .iter()
2789 .map(|s| s.slice(offset, length))
2790 .collect::<Vec<_>>();
2791
2792 let height = if let Some(fst) = col.first() {
2793 fst.len()
2794 } else {
2795 let (_, length) = slice_offsets(offset, length, self.height());
2796 length
2797 };
2798
2799 unsafe { DataFrame::new_no_checks(height, col) }
2800 }
2801
2802 /// Split [`DataFrame`] at the given `offset`.
2803 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2804 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2805
2806 let (idx, _) = slice_offsets(offset, 0, self.height());
2807
2808 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2809 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2810 (a, b)
2811 }
2812
2813 #[must_use]
2814 pub fn clear(&self) -> Self {
2815 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2816 unsafe { DataFrame::new_no_checks(0, col) }
2817 }
2818
2819 #[must_use]
2820 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2821 if offset == 0 && length == self.height() {
2822 return self.clone();
2823 }
2824 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2825 unsafe { DataFrame::new_no_checks(length, columns) }
2826 }
2827
2828 #[must_use]
2829 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2830 if offset == 0 && length == self.height() {
2831 return self.clone();
2832 }
2833 // @scalar-opt
2834 let columns = self._apply_columns(&|s| {
2835 let mut out = s.slice(offset, length);
2836 out.shrink_to_fit();
2837 out
2838 });
2839 unsafe { DataFrame::new_no_checks(length, columns) }
2840 }
2841
2842 /// Get the head of the [`DataFrame`].
2843 ///
2844 /// # Example
2845 ///
2846 /// ```rust
2847 /// # use polars_core::prelude::*;
2848 /// let countries: DataFrame =
2849 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2850 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2851 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2852 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2853 /// assert_eq!(countries.shape(), (5, 4));
2854 ///
2855 /// println!("{}", countries.head(Some(3)));
2856 /// # Ok::<(), PolarsError>(())
2857 /// ```
2858 ///
2859 /// Output:
2860 ///
2861 /// ```text
2862 /// shape: (3, 4)
2863 /// +--------------------+---------------+---------------+------------+
2864 /// | Rank by GDP (2021) | Continent | Country | Capital |
2865 /// | --- | --- | --- | --- |
2866 /// | i32 | str | str | str |
2867 /// +====================+===============+===============+============+
2868 /// | 1 | North America | United States | Washington |
2869 /// +--------------------+---------------+---------------+------------+
2870 /// | 2 | Asia | China | Beijing |
2871 /// +--------------------+---------------+---------------+------------+
2872 /// | 3 | Asia | Japan | Tokyo |
2873 /// +--------------------+---------------+---------------+------------+
2874 /// ```
2875 #[must_use]
2876 pub fn head(&self, length: Option<usize>) -> Self {
2877 let col = self
2878 .columns
2879 .iter()
2880 .map(|c| c.head(length))
2881 .collect::<Vec<_>>();
2882
2883 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2884 let height = usize::min(height, self.height());
2885 unsafe { DataFrame::new_no_checks(height, col) }
2886 }
2887
2888 /// Get the tail of the [`DataFrame`].
2889 ///
2890 /// # Example
2891 ///
2892 /// ```rust
2893 /// # use polars_core::prelude::*;
2894 /// let countries: DataFrame =
2895 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2896 /// "Apple Price (ā¬/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2897 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2898 /// assert_eq!(countries.shape(), (5, 3));
2899 ///
2900 /// println!("{}", countries.tail(Some(2)));
2901 /// # Ok::<(), PolarsError>(())
2902 /// ```
2903 ///
2904 /// Output:
2905 ///
2906 /// ```text
2907 /// shape: (2, 3)
2908 /// +-------------+--------------------+---------+
2909 /// | Rank (2021) | Apple Price (ā¬/kg) | Country |
2910 /// | --- | --- | --- |
2911 /// | i32 | f64 | str |
2912 /// +=============+====================+=========+
2913 /// | 108 | 0.63 | Syria |
2914 /// +-------------+--------------------+---------+
2915 /// | 109 | 0.63 | Turkey |
2916 /// +-------------+--------------------+---------+
2917 /// ```
2918 #[must_use]
2919 pub fn tail(&self, length: Option<usize>) -> Self {
2920 let col = self
2921 .columns
2922 .iter()
2923 .map(|c| c.tail(length))
2924 .collect::<Vec<_>>();
2925
2926 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2927 let height = usize::min(height, self.height());
2928 unsafe { DataFrame::new_no_checks(height, col) }
2929 }
2930
2931 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2932 ///
2933 /// # Panics
2934 ///
2935 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2936 ///
2937 /// This responsibility is left to the caller as we don't want to take mutable references here,
2938 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2939 /// as well.
2940 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2941 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2942 // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
2943 // as we must allocate arrow strings/binaries.
2944 let must_convert = compat_level.0 == 0;
2945 let parallel = parallel
2946 && must_convert
2947 && self.columns.len() > 1
2948 && self
2949 .columns
2950 .iter()
2951 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2952
2953 RecordBatchIter {
2954 columns: &self.columns,
2955 schema: Arc::new(
2956 self.columns
2957 .iter()
2958 .map(|c| c.field().to_arrow(compat_level))
2959 .collect(),
2960 ),
2961 idx: 0,
2962 n_chunks: self.first_col_n_chunks(),
2963 compat_level,
2964 parallel,
2965 }
2966 }
2967
2968 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2969 ///
2970 /// # Panics
2971 ///
2972 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2973 ///
2974 /// This responsibility is left to the caller as we don't want to take mutable references here,
2975 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2976 /// as well.
2977 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2978 debug_assert!(!self.should_rechunk());
2979 PhysRecordBatchIter {
2980 schema: Arc::new(
2981 self.get_columns()
2982 .iter()
2983 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2984 .collect(),
2985 ),
2986 arr_iters: self
2987 .materialized_column_iter()
2988 .map(|s| s.chunks().iter())
2989 .collect(),
2990 }
2991 }
2992
2993 /// Get a [`DataFrame`] with all the columns in reversed order.
2994 #[must_use]
2995 pub fn reverse(&self) -> Self {
2996 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2997 unsafe { DataFrame::new_no_checks(self.height(), col) }
2998 }
2999
3000 /// Shift the values by a given period and fill the parts that will be empty due to this operation
3001 /// with `Nones`.
3002 ///
3003 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
3004 #[must_use]
3005 pub fn shift(&self, periods: i64) -> Self {
3006 let col = self._apply_columns_par(&|s| s.shift(periods));
3007 unsafe { DataFrame::new_no_checks(self.height(), col) }
3008 }
3009
3010 /// Replace None values with one of the following strategies:
3011 /// * Forward fill (replace None with the previous value)
3012 /// * Backward fill (replace None with the next value)
3013 /// * Mean fill (replace None with the mean of the whole array)
3014 /// * Min fill (replace None with the minimum of the whole array)
3015 /// * Max fill (replace None with the maximum of the whole array)
3016 ///
3017 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
3018 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3019 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3020
3021 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3022 }
3023
3024 /// Pipe different functions/ closure operations that work on a DataFrame together.
3025 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3026 where
3027 F: Fn(DataFrame) -> PolarsResult<B>,
3028 {
3029 f(self)
3030 }
3031
3032 /// Pipe different functions/ closure operations that work on a DataFrame together.
3033 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3034 where
3035 F: Fn(&mut DataFrame) -> PolarsResult<B>,
3036 {
3037 f(self)
3038 }
3039
3040 /// Pipe different functions/ closure operations that work on a DataFrame together.
3041 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3042 where
3043 F: Fn(DataFrame, Args) -> PolarsResult<B>,
3044 {
3045 f(self, args)
3046 }
3047
3048 /// Drop duplicate rows from a [`DataFrame`].
3049 /// *This fails when there is a column of type List in DataFrame*
3050 ///
3051 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3052 ///
3053 /// # Example
3054 ///
3055 /// ```no_run
3056 /// # use polars_core::prelude::*;
3057 /// let df = df! {
3058 /// "flt" => [1., 1., 2., 2., 3., 3.],
3059 /// "int" => [1, 1, 2, 2, 3, 3, ],
3060 /// "str" => ["a", "a", "b", "b", "c", "c"]
3061 /// }?;
3062 ///
3063 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3064 /// # Ok::<(), PolarsError>(())
3065 /// ```
3066 /// Returns
3067 ///
3068 /// ```text
3069 /// +-----+-----+-----+
3070 /// | flt | int | str |
3071 /// | --- | --- | --- |
3072 /// | f64 | i32 | str |
3073 /// +=====+=====+=====+
3074 /// | 1 | 1 | "a" |
3075 /// +-----+-----+-----+
3076 /// | 2 | 2 | "b" |
3077 /// +-----+-----+-----+
3078 /// | 3 | 3 | "c" |
3079 /// +-----+-----+-----+
3080 /// ```
3081 #[cfg(feature = "algorithm_group_by")]
3082 pub fn unique_stable(
3083 &self,
3084 subset: Option<&[String]>,
3085 keep: UniqueKeepStrategy,
3086 slice: Option<(i64, usize)>,
3087 ) -> PolarsResult<DataFrame> {
3088 self.unique_impl(
3089 true,
3090 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3091 keep,
3092 slice,
3093 )
3094 }
3095
3096 /// Unstable distinct. See [`DataFrame::unique_stable`].
3097 #[cfg(feature = "algorithm_group_by")]
3098 pub fn unique<I, S>(
3099 &self,
3100 subset: Option<&[String]>,
3101 keep: UniqueKeepStrategy,
3102 slice: Option<(i64, usize)>,
3103 ) -> PolarsResult<DataFrame> {
3104 self.unique_impl(
3105 false,
3106 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3107 keep,
3108 slice,
3109 )
3110 }
3111
3112 #[cfg(feature = "algorithm_group_by")]
3113 pub fn unique_impl(
3114 &self,
3115 maintain_order: bool,
3116 subset: Option<Vec<PlSmallStr>>,
3117 keep: UniqueKeepStrategy,
3118 slice: Option<(i64, usize)>,
3119 ) -> PolarsResult<Self> {
3120 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3121 let mut df = self.clone();
3122 // take on multiple chunks is terrible
3123 df.as_single_chunk_par();
3124
3125 let columns = match (keep, maintain_order) {
3126 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3127 let gb = df.group_by_stable(names)?;
3128 let groups = gb.get_groups();
3129 let (offset, len) = slice.unwrap_or((0, groups.len()));
3130 let groups = groups.slice(offset, len);
3131 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3132 },
3133 (UniqueKeepStrategy::Last, true) => {
3134 // maintain order by last values, so the sorted groups are not correct as they
3135 // are sorted by the first value
3136 let gb = df.group_by_stable(names)?;
3137 let groups = gb.get_groups();
3138
3139 let last_idx: NoNull<IdxCa> = groups
3140 .iter()
3141 .map(|g| match g {
3142 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3143 GroupsIndicator::Slice([first, len]) => first + len - 1,
3144 })
3145 .collect();
3146
3147 let mut last_idx = last_idx.into_inner().sort(false);
3148
3149 if let Some((offset, len)) = slice {
3150 last_idx = last_idx.slice(offset, len);
3151 }
3152
3153 let last_idx = NoNull::new(last_idx);
3154 let out = unsafe { df.take_unchecked(&last_idx) };
3155 return Ok(out);
3156 },
3157 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3158 let gb = df.group_by(names)?;
3159 let groups = gb.get_groups();
3160 let (offset, len) = slice.unwrap_or((0, groups.len()));
3161 let groups = groups.slice(offset, len);
3162 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3163 },
3164 (UniqueKeepStrategy::Last, false) => {
3165 let gb = df.group_by(names)?;
3166 let groups = gb.get_groups();
3167 let (offset, len) = slice.unwrap_or((0, groups.len()));
3168 let groups = groups.slice(offset, len);
3169 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3170 },
3171 (UniqueKeepStrategy::None, _) => {
3172 let df_part = df.select(names)?;
3173 let mask = df_part.is_unique()?;
3174 let mut filtered = df.filter(&mask)?;
3175
3176 if let Some((offset, len)) = slice {
3177 filtered = filtered.slice(offset, len);
3178 }
3179 return Ok(filtered);
3180 },
3181 };
3182 let height = Self::infer_height(&columns);
3183 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3184 }
3185
3186 /// Get a mask of all the unique rows in the [`DataFrame`].
3187 ///
3188 /// # Example
3189 ///
3190 /// ```no_run
3191 /// # use polars_core::prelude::*;
3192 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3193 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3194 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3195 ///
3196 /// assert!(ca.all());
3197 /// # Ok::<(), PolarsError>(())
3198 /// ```
3199 #[cfg(feature = "algorithm_group_by")]
3200 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3201 let gb = self.group_by(self.get_column_names_owned())?;
3202 let groups = gb.get_groups();
3203 Ok(is_unique_helper(
3204 groups,
3205 self.height() as IdxSize,
3206 true,
3207 false,
3208 ))
3209 }
3210
3211 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3212 ///
3213 /// # Example
3214 ///
3215 /// ```no_run
3216 /// # use polars_core::prelude::*;
3217 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3218 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3219 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3220 ///
3221 /// assert!(!ca.all());
3222 /// # Ok::<(), PolarsError>(())
3223 /// ```
3224 #[cfg(feature = "algorithm_group_by")]
3225 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3226 let gb = self.group_by(self.get_column_names_owned())?;
3227 let groups = gb.get_groups();
3228 Ok(is_unique_helper(
3229 groups,
3230 self.height() as IdxSize,
3231 false,
3232 true,
3233 ))
3234 }
3235
3236 /// Create a new [`DataFrame`] that shows the null counts per column.
3237 #[must_use]
3238 pub fn null_count(&self) -> Self {
3239 let cols = self
3240 .columns
3241 .iter()
3242 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3243 .collect();
3244 unsafe { Self::new_no_checks(1, cols) }
3245 }
3246
3247 /// Hash and combine the row values
3248 #[cfg(feature = "row_hash")]
3249 pub fn hash_rows(
3250 &mut self,
3251 hasher_builder: Option<PlSeedableRandomStateQuality>,
3252 ) -> PolarsResult<UInt64Chunked> {
3253 let dfs = split_df(self, POOL.current_num_threads(), false);
3254 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3255
3256 let mut iter = cas.into_iter();
3257 let mut acc_ca = iter.next().unwrap();
3258 for ca in iter {
3259 acc_ca.append(&ca)?;
3260 }
3261 Ok(acc_ca.rechunk().into_owned())
3262 }
3263
3264 /// Get the supertype of the columns in this DataFrame
3265 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3266 self.columns
3267 .iter()
3268 .map(|s| Ok(s.dtype().clone()))
3269 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3270 }
3271
3272 /// Take by index values given by the slice `idx`.
3273 /// # Warning
3274 /// Be careful with allowing threads when calling this in a large hot loop
3275 /// every thread split may be on rayon stack and lead to SO
3276 #[doc(hidden)]
3277 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3278 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3279 }
3280
3281 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3282 /// if the index value in `idx` are sorted. This will maintain sorted flags.
3283 ///
3284 /// # Warning
3285 /// Be careful with allowing threads when calling this in a large hot loop
3286 /// every thread split may be on rayon stack and lead to SO
3287 #[doc(hidden)]
3288 pub unsafe fn _take_unchecked_slice_sorted(
3289 &self,
3290 idx: &[IdxSize],
3291 allow_threads: bool,
3292 sorted: IsSorted,
3293 ) -> Self {
3294 #[cfg(debug_assertions)]
3295 {
3296 if idx.len() > 2 {
3297 match sorted {
3298 IsSorted::Ascending => {
3299 assert!(idx[0] <= idx[idx.len() - 1]);
3300 },
3301 IsSorted::Descending => {
3302 assert!(idx[0] >= idx[idx.len() - 1]);
3303 },
3304 _ => {},
3305 }
3306 }
3307 }
3308 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3309 ca.set_sorted_flag(sorted);
3310 self.take_unchecked_impl(&ca, allow_threads)
3311 }
3312
3313 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3314 #[doc(hidden)]
3315 pub fn _partition_by_impl(
3316 &self,
3317 cols: &[PlSmallStr],
3318 stable: bool,
3319 include_key: bool,
3320 parallel: bool,
3321 ) -> PolarsResult<Vec<DataFrame>> {
3322 let selected_keys = self.select_columns(cols.iter().cloned())?;
3323 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3324 let groups = groups.take_groups();
3325
3326 // drop key columns prior to calculation if requested
3327 let df = if include_key {
3328 self.clone()
3329 } else {
3330 self.drop_many(cols.iter().cloned())
3331 };
3332
3333 if parallel {
3334 // don't parallelize this
3335 // there is a lot of parallelization in take and this may easily SO
3336 POOL.install(|| {
3337 match groups.as_ref() {
3338 GroupsType::Idx(idx) => {
3339 // Rechunk as the gather may rechunk for every group #17562.
3340 let mut df = df.clone();
3341 df.as_single_chunk_par();
3342 Ok(idx
3343 .into_par_iter()
3344 .map(|(_, group)| {
3345 // groups are in bounds
3346 unsafe {
3347 df._take_unchecked_slice_sorted(
3348 group,
3349 false,
3350 IsSorted::Ascending,
3351 )
3352 }
3353 })
3354 .collect())
3355 },
3356 GroupsType::Slice { groups, .. } => Ok(groups
3357 .into_par_iter()
3358 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3359 .collect()),
3360 }
3361 })
3362 } else {
3363 match groups.as_ref() {
3364 GroupsType::Idx(idx) => {
3365 // Rechunk as the gather may rechunk for every group #17562.
3366 let mut df = df;
3367 df.as_single_chunk();
3368 Ok(idx
3369 .into_iter()
3370 .map(|(_, group)| {
3371 // groups are in bounds
3372 unsafe {
3373 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3374 }
3375 })
3376 .collect())
3377 },
3378 GroupsType::Slice { groups, .. } => Ok(groups
3379 .iter()
3380 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3381 .collect()),
3382 }
3383 }
3384 }
3385
3386 /// Split into multiple DataFrames partitioned by groups
3387 #[cfg(feature = "partition_by")]
3388 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3389 where
3390 I: IntoIterator<Item = S>,
3391 S: Into<PlSmallStr>,
3392 {
3393 let cols = cols
3394 .into_iter()
3395 .map(Into::into)
3396 .collect::<Vec<PlSmallStr>>();
3397 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3398 }
3399
3400 /// Split into multiple DataFrames partitioned by groups
3401 /// Order of the groups are maintained.
3402 #[cfg(feature = "partition_by")]
3403 pub fn partition_by_stable<I, S>(
3404 &self,
3405 cols: I,
3406 include_key: bool,
3407 ) -> PolarsResult<Vec<DataFrame>>
3408 where
3409 I: IntoIterator<Item = S>,
3410 S: Into<PlSmallStr>,
3411 {
3412 let cols = cols
3413 .into_iter()
3414 .map(Into::into)
3415 .collect::<Vec<PlSmallStr>>();
3416 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3417 }
3418
3419 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3420 /// inserted as columns.
3421 #[cfg(feature = "dtype-struct")]
3422 pub fn unnest<I: IntoVec<PlSmallStr>>(
3423 &self,
3424 cols: I,
3425 separator: Option<&str>,
3426 ) -> PolarsResult<DataFrame> {
3427 let cols = cols.into_vec();
3428 self.unnest_impl(cols.into_iter().collect(), separator)
3429 }
3430
3431 #[cfg(feature = "dtype-struct")]
3432 fn unnest_impl(
3433 &self,
3434 cols: PlHashSet<PlSmallStr>,
3435 separator: Option<&str>,
3436 ) -> PolarsResult<DataFrame> {
3437 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3438 let mut count = 0;
3439 for s in &self.columns {
3440 if cols.contains(s.name()) {
3441 let ca = s.struct_()?.clone();
3442 new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3443 if let Some(separator) = &separator {
3444 f.rename(polars_utils::format_pl_smallstr!(
3445 "{}{}{}",
3446 s.name(),
3447 separator,
3448 f.name()
3449 ));
3450 }
3451 Column::from(f)
3452 }));
3453 count += 1;
3454 } else {
3455 new_cols.push(s.clone())
3456 }
3457 }
3458 if count != cols.len() {
3459 // one or more columns not found
3460 // the code below will return an error with the missing name
3461 let schema = self.schema();
3462 for col in cols {
3463 let _ = schema
3464 .get(col.as_str())
3465 .ok_or_else(|| polars_err!(col_not_found = col))?;
3466 }
3467 }
3468 DataFrame::new(new_cols)
3469 }
3470
3471 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3472 cols.first().map_or(0, Column::len)
3473 }
3474
3475 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3476 // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3477 // append_chunk or something like this. It is just quite difficult to make that safe.
3478 let df = DataFrame::from(rb);
3479 polars_ensure!(
3480 self.schema() == df.schema(),
3481 SchemaMismatch: "cannot append record batch with different schema\n\n
3482 Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3483 );
3484 self.vstack_mut_owned_unchecked(df);
3485 Ok(())
3486 }
3487
3488 pub fn into_columns(self) -> Vec<Column> {
3489 self.columns
3490 }
3491}
3492
3493pub struct RecordBatchIter<'a> {
3494 columns: &'a Vec<Column>,
3495 schema: ArrowSchemaRef,
3496 idx: usize,
3497 n_chunks: usize,
3498 compat_level: CompatLevel,
3499 parallel: bool,
3500}
3501
3502impl Iterator for RecordBatchIter<'_> {
3503 type Item = RecordBatch;
3504
3505 fn next(&mut self) -> Option<Self::Item> {
3506 if self.idx >= self.n_chunks {
3507 return None;
3508 }
3509
3510 // Create a batch of the columns with the same chunk no.
3511 let batch_cols: Vec<ArrayRef> = if self.parallel {
3512 let iter = self
3513 .columns
3514 .par_iter()
3515 .map(Column::as_materialized_series)
3516 .map(|s| s.to_arrow(self.idx, self.compat_level));
3517 POOL.install(|| iter.collect())
3518 } else {
3519 self.columns
3520 .iter()
3521 .map(Column::as_materialized_series)
3522 .map(|s| s.to_arrow(self.idx, self.compat_level))
3523 .collect()
3524 };
3525 self.idx += 1;
3526
3527 let length = batch_cols.first().map_or(0, |arr| arr.len());
3528 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3529 }
3530
3531 fn size_hint(&self) -> (usize, Option<usize>) {
3532 let n = self.n_chunks - self.idx;
3533 (n, Some(n))
3534 }
3535}
3536
3537pub struct PhysRecordBatchIter<'a> {
3538 schema: ArrowSchemaRef,
3539 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3540}
3541
3542impl Iterator for PhysRecordBatchIter<'_> {
3543 type Item = RecordBatch;
3544
3545 fn next(&mut self) -> Option<Self::Item> {
3546 let arrs = self
3547 .arr_iters
3548 .iter_mut()
3549 .map(|phys_iter| phys_iter.next().cloned())
3550 .collect::<Option<Vec<_>>>()?;
3551
3552 let length = arrs.first().map_or(0, |arr| arr.len());
3553 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3554 }
3555
3556 fn size_hint(&self) -> (usize, Option<usize>) {
3557 if let Some(iter) = self.arr_iters.first() {
3558 iter.size_hint()
3559 } else {
3560 (0, None)
3561 }
3562 }
3563}
3564
3565impl Default for DataFrame {
3566 fn default() -> Self {
3567 DataFrame::empty()
3568 }
3569}
3570
3571impl From<DataFrame> for Vec<Column> {
3572 fn from(df: DataFrame) -> Self {
3573 df.columns
3574 }
3575}
3576
3577// utility to test if we can vstack/extend the columns
3578fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3579 polars_ensure!(
3580 left.name() == right.name(),
3581 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3582 left.name(), right.name(),
3583 );
3584 Ok(())
3585}
3586
3587#[cfg(test)]
3588mod test {
3589 use super::*;
3590
3591 fn create_frame() -> DataFrame {
3592 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3593 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3594 DataFrame::new(vec![s0, s1]).unwrap()
3595 }
3596
3597 #[test]
3598 #[cfg_attr(miri, ignore)]
3599 fn test_recordbatch_iterator() {
3600 let df = df!(
3601 "foo" => [1, 2, 3, 4, 5]
3602 )
3603 .unwrap();
3604 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3605 assert_eq!(5, iter.next().unwrap().len());
3606 assert!(iter.next().is_none());
3607 }
3608
3609 #[test]
3610 #[cfg_attr(miri, ignore)]
3611 fn test_select() {
3612 let df = create_frame();
3613 assert_eq!(
3614 df.column("days")
3615 .unwrap()
3616 .as_series()
3617 .unwrap()
3618 .equal(1)
3619 .unwrap()
3620 .sum(),
3621 Some(1)
3622 );
3623 }
3624
3625 #[test]
3626 #[cfg_attr(miri, ignore)]
3627 fn test_filter_broadcast_on_string_col() {
3628 let col_name = "some_col";
3629 let v = vec!["test".to_string()];
3630 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3631 let mut df = DataFrame::new(vec![s0]).unwrap();
3632
3633 df = df
3634 .filter(
3635 &df.column(col_name)
3636 .unwrap()
3637 .as_materialized_series()
3638 .equal("")
3639 .unwrap(),
3640 )
3641 .unwrap();
3642 assert_eq!(
3643 df.column(col_name)
3644 .unwrap()
3645 .as_materialized_series()
3646 .n_chunks(),
3647 1
3648 );
3649 }
3650
3651 #[test]
3652 #[cfg_attr(miri, ignore)]
3653 fn test_filter_broadcast_on_list_col() {
3654 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3655 let ll: ListChunked = [&s1].iter().copied().collect();
3656
3657 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3658 let new = ll.filter(&mask).unwrap();
3659
3660 assert_eq!(new.chunks.len(), 1);
3661 assert_eq!(new.len(), 0);
3662 }
3663
3664 #[test]
3665 fn slice() {
3666 let df = create_frame();
3667 let sliced_df = df.slice(0, 2);
3668 assert_eq!(sliced_df.shape(), (2, 2));
3669 }
3670
3671 #[test]
3672 fn rechunk_false() {
3673 let df = create_frame();
3674 assert!(!df.should_rechunk())
3675 }
3676
3677 #[test]
3678 fn rechunk_true() -> PolarsResult<()> {
3679 let mut base = df!(
3680 "a" => [1, 2, 3],
3681 "b" => [1, 2, 3]
3682 )?;
3683
3684 // Create a series with multiple chunks
3685 let mut s = Series::new("foo".into(), 0..2);
3686 let s2 = Series::new("bar".into(), 0..1);
3687 s.append(&s2)?;
3688
3689 // Append series to frame
3690 let out = base.with_column(s)?;
3691
3692 // Now we should rechunk
3693 assert!(out.should_rechunk());
3694 Ok(())
3695 }
3696
3697 #[test]
3698 fn test_duplicate_column() {
3699 let mut df = df! {
3700 "foo" => [1, 2, 3]
3701 }
3702 .unwrap();
3703 // check if column is replaced
3704 assert!(
3705 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3706 .is_ok()
3707 );
3708 assert!(
3709 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3710 .is_ok()
3711 );
3712 assert!(df.column("bar").is_ok())
3713 }
3714
3715 #[test]
3716 #[cfg_attr(miri, ignore)]
3717 fn distinct() {
3718 let df = df! {
3719 "flt" => [1., 1., 2., 2., 3., 3.],
3720 "int" => [1, 1, 2, 2, 3, 3, ],
3721 "str" => ["a", "a", "b", "b", "c", "c"]
3722 }
3723 .unwrap();
3724 let df = df
3725 .unique_stable(None, UniqueKeepStrategy::First, None)
3726 .unwrap()
3727 .sort(["flt"], SortMultipleOptions::default())
3728 .unwrap();
3729 let valid = df! {
3730 "flt" => [1., 2., 3.],
3731 "int" => [1, 2, 3],
3732 "str" => ["a", "b", "c"]
3733 }
3734 .unwrap();
3735 assert!(df.equals(&valid));
3736 }
3737
3738 #[test]
3739 fn test_vstack() {
3740 // check that it does not accidentally rechunks
3741 let mut df = df! {
3742 "flt" => [1., 1., 2., 2., 3., 3.],
3743 "int" => [1, 1, 2, 2, 3, 3, ],
3744 "str" => ["a", "a", "b", "b", "c", "c"]
3745 }
3746 .unwrap();
3747
3748 df.vstack_mut(&df.slice(0, 3)).unwrap();
3749 assert_eq!(df.first_col_n_chunks(), 2)
3750 }
3751
3752 #[test]
3753 fn test_vstack_on_empty_dataframe() {
3754 let mut df = DataFrame::empty();
3755
3756 let df_data = df! {
3757 "flt" => [1., 1., 2., 2., 3., 3.],
3758 "int" => [1, 1, 2, 2, 3, 3, ],
3759 "str" => ["a", "a", "b", "b", "c", "c"]
3760 }
3761 .unwrap();
3762
3763 df.vstack_mut(&df_data).unwrap();
3764 assert_eq!(df.height, 6)
3765 }
3766
3767 #[test]
3768 fn test_replace_or_add() -> PolarsResult<()> {
3769 let mut df = df!(
3770 "a" => [1, 2, 3],
3771 "b" => [1, 2, 3]
3772 )?;
3773
3774 // check that the new column is "c" and not "bar".
3775 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3776
3777 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3778 Ok(())
3779 }
3780
3781 #[test]
3782 fn test_unique_keep_none_with_slice() {
3783 let df = df! {
3784 "x" => [1, 2, 3, 2, 1]
3785 }
3786 .unwrap();
3787 let out = df
3788 .unique_stable(
3789 Some(&["x".to_string()][..]),
3790 UniqueKeepStrategy::None,
3791 Some((0, 2)),
3792 )
3793 .unwrap();
3794 let expected = df! {
3795 "x" => [3]
3796 }
3797 .unwrap();
3798 assert!(out.equals(&expected));
3799 }
3800
3801 #[test]
3802 #[cfg(feature = "dtype-i8")]
3803 fn test_apply_result_schema() {
3804 let mut df = df! {
3805 "x" => [1, 2, 3, 2, 1]
3806 }
3807 .unwrap();
3808
3809 let schema_before = df.schema().clone();
3810 df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
3811 assert_ne!(&schema_before, df.schema());
3812 }
3813}