polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[strum(serialize_all = "snake_case")]
53pub enum UniqueKeepStrategy {
54 /// Keep the first unique row.
55 First,
56 /// Keep the last unique row.
57 Last,
58 /// Keep none of the unique rows.
59 None,
60 /// Keep any of the unique rows.
61 /// This allows for more optimizations.
62 #[default]
63 Any,
64}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68 F: for<'a> FnMut(&'a T) -> &'a str,
69{
70 // Always unique.
71 if items.len() <= 1 {
72 return Ok(());
73 }
74
75 if items.len() <= 4 {
76 // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
77 for i in 0..items.len() - 1 {
78 let name = get_name(&items[i]);
79 for other in items.iter().skip(i + 1) {
80 if name == get_name(other) {
81 polars_bail!(duplicate = name);
82 }
83 }
84 }
85 } else {
86 let mut names = PlHashSet::with_capacity(items.len());
87 for item in items {
88 let name = get_name(item);
89 if !names.insert(name) {
90 polars_bail!(duplicate = name);
91 }
92 }
93 }
94 Ok(())
95}
96
97/// A contiguous growable collection of `Series` that have the same length.
98///
99/// ## Use declarations
100///
101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
102///
103/// ```rust
104/// use polars_core::prelude::*; // if the crate polars-core is used directly
105/// // use polars::prelude::*; if the crate polars is used
106/// ```
107///
108/// # Initialization
109/// ## Default
110///
111/// A `DataFrame` can be initialized empty:
112///
113/// ```rust
114/// # use polars_core::prelude::*;
115/// let df = DataFrame::default();
116/// assert!(df.is_empty());
117/// ```
118///
119 /// ## Wrapping a `Vec<Column>`
120 ///
121 /// A `DataFrame` is built upon a `Vec<Column>` where the `Column`s have the same length.
122///
123/// ```rust
124/// # use polars_core::prelude::*;
125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
127///
128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
129/// ```
130///
131/// ## Using a macro
132///
133/// The [`df!`] macro is a convenient method:
134///
135/// ```rust
136/// # use polars_core::prelude::*;
137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
138/// "Color" => ["Red", "Yellow", "Green"]);
139/// ```
140///
141/// ## Using a CSV file
142///
143/// See the `polars_io::csv::CsvReader`.
144///
145/// # Indexing
146/// ## By a number
147///
148 /// `Index<usize>` is implemented for `DataFrame`.
149///
150/// ```rust
151/// # use polars_core::prelude::*;
152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
153/// "Color" => ["Red", "Yellow", "Green"])?;
154///
155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
157/// # Ok::<(), PolarsError>(())
158/// ```
159///
160/// ## By a `Series` name
161///
162/// ```rust
163/// # use polars_core::prelude::*;
164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
165/// "Color" => ["Red", "Yellow", "Green"])?;
166///
167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
169/// # Ok::<(), PolarsError>(())
170/// ```
171#[derive(Clone)]
172pub struct DataFrame {
173 height: usize,
174 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
175 pub(crate) columns: Vec<Column>,
176
177 /// A cached schema. This might not give correct results if the DataFrame was modified in place
178 /// between computing the schema and reading it.
179 cached_schema: OnceLock<SchemaRef>,
180}
181
182impl DataFrame {
183 pub fn clear_schema(&mut self) {
184 self.cached_schema = OnceLock::new();
185 }
186
187 #[inline]
188 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
189 self.columns.iter()
190 }
191
192 #[inline]
193 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
194 self.columns.iter().map(Column::as_materialized_series)
195 }
196
197 #[inline]
198 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
199 self.columns.par_iter().map(Column::as_materialized_series)
200 }
201
202 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
203 ///
204 /// # Implementation
205 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
206 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not necessarily the
207 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
208 ///
209 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
210 /// However, this function will yield a smaller number. This is because this function returns
211 /// the visible size of the buffer, not its total capacity.
212 ///
213 /// FFI buffers are included in this estimation.
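///
/// # Example
///
/// A minimal sketch (the column names and values are illustrative); the exact number depends
/// on the backing buffers, so we only assert that it is non-zero.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1i64, 2, 3], "b" => ["x", "y", "z"])?;
/// assert!(df.estimated_size() > 0);
/// # Ok::<(), PolarsError>(())
/// ```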
214 pub fn estimated_size(&self) -> usize {
215 self.columns.iter().map(Column::estimated_size).sum()
216 }
217
218 // Reduce monomorphization.
219 fn try_apply_columns(
220 &self,
221 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
222 ) -> PolarsResult<Vec<Column>> {
223 self.columns.iter().map(func).collect()
224 }
225 // Reduce monomorphization.
226 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 fn try_apply_columns_par(
231 &self,
232 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
233 ) -> PolarsResult<Vec<Column>> {
234 POOL.install(|| self.columns.par_iter().map(func).collect())
235 }
236 // Reduce monomorphization.
237 pub fn _apply_columns_par(
238 &self,
239 func: &(dyn Fn(&Column) -> Column + Send + Sync),
240 ) -> Vec<Column> {
241 POOL.install(|| self.columns.par_iter().map(func).collect())
242 }
243
244 /// Get the index of the column.
245 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
246 self.get_column_index(name)
247 .ok_or_else(|| polars_err!(col_not_found = name))
248 }
249
250 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251 polars_ensure!(
252 self.columns.iter().all(|s| s.name().as_str() != name),
253 Duplicate: "column with name {:?} is already present in the DataFrame", name
254 );
255 Ok(())
256 }
257
258 /// Reserve capacity for `additional` chunks in each series' chunk vector.
259 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
260 for s in &mut self.columns {
261 if let Column::Series(s) = s {
262 // SAFETY:
263 // do not modify the data, simply resize.
264 unsafe { s.chunks_mut().reserve(additional) }
265 }
266 }
267 }
268
269 /// Create a DataFrame from a Vector of Series.
270 ///
271 /// Errors if the column names are not unique, or if the heights are not all equal.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// # use polars_core::prelude::*;
277 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279 ///
280 /// let df = DataFrame::new(vec![s0, s1])?;
281 /// # Ok::<(), PolarsError>(())
282 /// ```
283 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284 DataFrame::validate_columns_slice(&columns)
285 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287 }
288
289 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290 for col in &columns {
291 polars_ensure!(
292 col.len() == height,
293 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294 columns[0].name(), height, col.name(), col.len()
295 );
296 }
297
298 Ok(DataFrame {
299 height,
300 columns,
301 cached_schema: OnceLock::new(),
302 })
303 }
304
305 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306 /// columns to match the other columns.
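///
/// # Example
///
/// A minimal sketch (column names are illustrative): the length-1 column is broadcast to the
/// length of the longer column.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let a = Column::new("a".into(), [1, 2, 3]);
/// let b = Column::new("b".into(), [10]);
/// let df = DataFrame::new_with_broadcast(vec![a, b])?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```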
307 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308 // The length of the longest non-unit length column determines the
309 // broadcast length. If all columns are unit-length the broadcast length
310 // is one.
311 let broadcast_len = columns
312 .iter()
313 .map(|s| s.len())
314 .filter(|l| *l != 1)
315 .max()
316 .unwrap_or(1);
317 Self::new_with_broadcast_len(columns, broadcast_len)
318 }
319
320 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
321 /// columns to broadcast_len.
322 pub fn new_with_broadcast_len(
323 columns: Vec<Column>,
324 broadcast_len: usize,
325 ) -> PolarsResult<Self> {
326 ensure_names_unique(&columns, |s| s.name().as_str())?;
327 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
328 }
329
330 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331 /// columns to match the other columns.
332 ///
333 /// # Safety
334 /// Does not check that the column names are unique (which they must be).
335 pub unsafe fn new_with_broadcast_no_namecheck(
336 mut columns: Vec<Column>,
337 broadcast_len: usize,
338 ) -> PolarsResult<Self> {
339 for col in &mut columns {
340 // Length not equal to the broadcast len, needs broadcast or is an error.
341 let len = col.len();
342 if len != broadcast_len {
343 if len != 1 {
344 let name = col.name().to_owned();
345 let extra_info =
346 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347 format!(" (matching column '{}')", c.name())
348 } else {
349 String::new()
350 };
351 polars_bail!(
352 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353 );
354 }
355 *col = col.new_from_index(0, broadcast_len);
356 }
357 }
358
359 let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362 }
363
364 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
365 ///
366 /// # Example
367 ///
368 /// ```rust
369 /// use polars_core::prelude::DataFrame;
370 /// static EMPTY: DataFrame = DataFrame::empty();
371 /// ```
372 pub const fn empty() -> Self {
373 Self::empty_with_height(0)
374 }
375
376 /// Creates an empty `DataFrame` with a specific `height`.
377 pub const fn empty_with_height(height: usize) -> Self {
378 DataFrame {
379 height,
380 columns: vec![],
381 cached_schema: OnceLock::new(),
382 }
383 }
384
385 /// Create an empty `DataFrame` with empty columns as per the `schema`.
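///
/// # Example
///
/// A minimal sketch with an illustrative one-column schema:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
/// let df = DataFrame::empty_with_schema(&schema);
/// assert_eq!(df.shape(), (0, 1));
/// assert_eq!(df.dtypes(), &[DataType::Int32]);
/// ```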
386 pub fn empty_with_schema(schema: &Schema) -> Self {
387 let cols = schema
388 .iter()
389 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390 .collect();
391 unsafe { DataFrame::new_no_checks(0, cols) }
392 }
393
394 /// Create an empty `DataFrame` with empty columns as per the `schema`.
395 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396 let cols = schema
397 .iter_values()
398 .map(|fld| {
399 Column::from(Series::new_empty(
400 fld.name.clone(),
401 &(DataType::from_arrow_field(fld)),
402 ))
403 })
404 .collect();
405 unsafe { DataFrame::new_no_checks(0, cols) }
406 }
407
408 /// Create a new `DataFrame` with the given schema, only containing nulls.
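///
/// # Example
///
/// A minimal sketch with an illustrative schema:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let schema = Schema::from_iter(vec![Field::new("a".into(), DataType::Int32)]);
/// let df = DataFrame::full_null(&schema, 3);
/// assert_eq!(df.shape(), (3, 1));
/// assert_eq!(df.column("a")?.null_count(), 3);
/// # Ok::<(), PolarsError>(())
/// ```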
409 pub fn full_null(schema: &Schema, height: usize) -> Self {
410 let columns = schema
411 .iter_fields()
412 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413 .collect();
414 unsafe { DataFrame::new_no_checks(height, columns) }
415 }
416
417 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
418 ///
419 /// # Example
420 ///
421 /// ```rust
422 /// # use polars_core::prelude::*;
423 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
424 /// let s2 = Column::new("Area (kmĀ²)".into(), [106_460_000, 70_560_000]);
425 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
426 ///
427 /// assert_eq!(df.pop(), Some(s2));
428 /// assert_eq!(df.pop(), Some(s1));
429 /// assert_eq!(df.pop(), None);
430 /// assert!(df.is_empty());
431 /// # Ok::<(), PolarsError>(())
432 /// ```
433 pub fn pop(&mut self) -> Option<Column> {
434 self.clear_schema();
435
436 self.columns.pop()
437 }
438
439 /// Add a new column at index 0 that counts the rows.
440 ///
441 /// # Example
442 ///
443 /// ```
444 /// # use polars_core::prelude::*;
445 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446 /// assert_eq!(df1.shape(), (4, 1));
447 ///
448 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449 /// assert_eq!(df2.shape(), (4, 2));
450 /// println!("{}", df2);
451 ///
452 /// # Ok::<(), PolarsError>(())
453 /// ```
454 ///
455 /// Output:
456 ///
457 /// ```text
458 /// shape: (4, 2)
459 /// +-----+----------+
460 /// | Id | Name |
461 /// | --- | --- |
462 /// | u32 | str |
463 /// +=====+==========+
464 /// | 0 | James |
465 /// +-----+----------+
466 /// | 1 | Mary |
467 /// +-----+----------+
468 /// | 2 | John |
469 /// +-----+----------+
470 /// | 3 | Patricia |
471 /// +-----+----------+
472 /// ```
473 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474 let mut columns = Vec::with_capacity(self.columns.len() + 1);
475 let offset = offset.unwrap_or(0);
476
477 let col = Column::new_row_index(name, offset, self.height())?;
478 columns.push(col);
479 columns.extend_from_slice(&self.columns);
480 DataFrame::new(columns)
481 }
482
483 /// Add a row index column in place.
484 ///
485 /// # Safety
486 /// The caller should ensure the DataFrame does not already contain a column with the given name.
487 ///
488 /// # Panics
489 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
490 pub unsafe fn with_row_index_mut(
491 &mut self,
492 name: PlSmallStr,
493 offset: Option<IdxSize>,
494 ) -> &mut Self {
495 // Note: this function is `unsafe`; the name-uniqueness contract is only checked via the debug assertion below.
496 debug_assert!(
497 self.columns.iter().all(|c| c.name() != &name),
498 "with_row_index_mut(): column with name {} already exists",
499 &name
500 );
501
502 let offset = offset.unwrap_or(0);
503 let col = Column::new_row_index(name, offset, self.height()).unwrap();
504
505 self.clear_schema();
506 self.columns.insert(0, col);
507 self
508 }
509
510 /// Create a new `DataFrame` without checking the length or duplicate occurrence of the
511 /// `Series`.
512 ///
513 /// Calculates the height from the first column or `0` if no columns are given.
514 ///
515 /// # Safety
516 ///
517 /// It is the caller's responsibility to uphold the contract that all `Series`
518 /// have equal lengths and unique names; if not, this may panic down the line.
519 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
520 let height = columns.first().map_or(0, Column::len);
521 unsafe { Self::new_no_checks(height, columns) }
522 }
523
524 /// Create a new `DataFrame` without checking the length or duplicate occurrence of the
525 /// `Series`.
526 ///
527 /// It is advised to use [DataFrame::new] in favor of this method.
528 ///
529 /// # Safety
530 ///
531 /// It is the caller's responsibility to uphold the contract that all `Series`
532 /// have equal lengths and unique names; if not, this may panic down the line.
533 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
534 if cfg!(debug_assertions) {
535 DataFrame::validate_columns_slice(&columns).unwrap();
536 }
537
538 unsafe { Self::_new_no_checks_impl(height, columns) }
539 }
540
541 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
542 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
543 /// constructed with this method is generally highly unsafe and should not be long-lived.
544 #[allow(clippy::missing_safety_doc)]
545 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
546 DataFrame {
547 height,
548 columns,
549 cached_schema: OnceLock::new(),
550 }
551 }
552
553 /// Shrink the capacity of this DataFrame to fit its length.
554 pub fn shrink_to_fit(&mut self) {
555 // Don't parallelize this. Memory overhead
556 for s in &mut self.columns {
557 s.shrink_to_fit();
558 }
559 }
560
561 /// Aggregate all the chunks in the DataFrame to a single chunk.
562 pub fn as_single_chunk(&mut self) -> &mut Self {
563 // Don't parallelize this. Memory overhead
564 for s in &mut self.columns {
565 if let Column::Series(s) = s {
566 *s = s.rechunk().into();
567 }
568 }
569 self
570 }
571
572 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
573 /// This may lead to more peak memory consumption.
574 pub fn as_single_chunk_par(&mut self) -> &mut Self {
575 if self.columns.iter().any(|c| c.n_chunks() > 1) {
576 self.columns = self._apply_columns_par(&|s| s.rechunk());
577 }
578 self
579 }
580
581 /// Rechunks all columns to only have a single chunk.
582 pub fn rechunk_mut(&mut self) {
583 // SAFETY: We never adjust the length or names of the columns.
584 let columns = unsafe { self.get_columns_mut() };
585
586 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
587 *col = col.rechunk();
588 }
589 }
590
591 pub fn _deshare_views_mut(&mut self) {
592 // SAFETY: We never adjust the length or names of the columns.
593 unsafe {
594 let columns = self.get_columns_mut();
595 for col in columns {
596 let Column::Series(s) = col else { continue };
597
598 if let Ok(ca) = s.binary() {
599 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
600 *col = Column::from(gc_ca.into_series());
601 } else if let Ok(ca) = s.str() {
602 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
603 *col = Column::from(gc_ca.into_series());
604 }
605 }
606 }
607 }
608
609 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
610 pub fn rechunk_to_record_batch(
611 self,
612 compat_level: CompatLevel,
613 ) -> RecordBatchT<Box<dyn Array>> {
614 let height = self.height();
615
616 let (schema, arrays) = self
617 .columns
618 .into_iter()
619 .map(|col| {
620 let mut series = col.take_materialized_series();
621 // Rechunk to one chunk if necessary
622 if series.n_chunks() > 1 {
623 series = series.rechunk();
624 }
625 (
626 series.field().to_arrow(compat_level),
627 series.to_arrow(0, compat_level),
628 )
629 })
630 .collect();
631
632 RecordBatchT::new(height, Arc::new(schema), arrays)
633 }
634
635 /// Returns true if the chunks of the columns do not align and re-chunking should be done.
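///
/// # Example
///
/// A minimal sketch (illustrative column names), assuming that appending to a single column
/// produces a second chunk so its chunks no longer line up with the other column:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut a = Column::new("a".into(), [1, 2]);
/// a.append(&Column::new("a".into(), [3, 4]))?; // "a" now has two chunks
/// let b = Column::new("b".into(), [10, 20, 30, 40]); // "b" has a single chunk
///
/// let mut df = DataFrame::new(vec![a, b])?;
/// assert!(df.should_rechunk());
/// df.align_chunks_par();
/// assert!(!df.should_rechunk());
/// # Ok::<(), PolarsError>(())
/// ```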
636 pub fn should_rechunk(&self) -> bool {
637 // Fast check. It is also needed for correctness, as code below doesn't check if the number
638 // of chunks is equal.
639 if !self
640 .get_columns()
641 .iter()
642 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
643 .all_equal()
644 {
645 return true;
646 }
647
648 // From here we check chunk lengths.
649 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
650 match chunk_lengths.next() {
651 None => false,
652 Some(first_column_chunk_lengths) => {
653 // Fast Path for single Chunk Series
654 if first_column_chunk_lengths.size_hint().0 == 1 {
655 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
656 }
657 // Always rechunk if we have more chunks than rows.
658 // except when we have an empty df containing a single chunk
659 let height = self.height();
660 let n_chunks = first_column_chunk_lengths.size_hint().0;
661 if n_chunks > height && !(height == 0 && n_chunks == 1) {
662 return true;
663 }
664 // Slow Path for multi Chunk series
665 let v: Vec<_> = first_column_chunk_lengths.collect();
666 for cl in chunk_lengths {
667 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
668 return true;
669 }
670 }
671 false
672 },
673 }
674 }
675
676 /// Ensure all the chunks in the [`DataFrame`] are aligned.
677 pub fn align_chunks_par(&mut self) -> &mut Self {
678 if self.should_rechunk() {
679 self.as_single_chunk_par()
680 } else {
681 self
682 }
683 }
684
685 pub fn align_chunks(&mut self) -> &mut Self {
686 if self.should_rechunk() {
687 self.as_single_chunk()
688 } else {
689 self
690 }
691 }
692
693 /// Get the [`DataFrame`] schema.
694 ///
695 /// # Example
696 ///
697 /// ```rust
698 /// # use polars_core::prelude::*;
699 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
700 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
701 ///
702 /// let f1: Field = Field::new("Thing".into(), DataType::String);
703 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
704 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
705 ///
706 /// assert_eq!(&**df.schema(), &sc);
707 /// # Ok::<(), PolarsError>(())
708 /// ```
709 pub fn schema(&self) -> &SchemaRef {
710 let out = self.cached_schema.get_or_init(|| {
711 Arc::new(
712 self.columns
713 .iter()
714 .map(|x| (x.name().clone(), x.dtype().clone()))
715 .collect(),
716 )
717 });
718
719 debug_assert_eq!(out.len(), self.width());
720
721 out
722 }
723
724 /// Get a reference to the [`DataFrame`] columns.
725 ///
726 /// # Example
727 ///
728 /// ```rust
729 /// # use polars_core::prelude::*;
730 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
731 /// "Symbol" => ["A", "C", "G", "T"])?;
732 /// let columns: &[Column] = df.get_columns();
733 ///
734 /// assert_eq!(columns[0].name(), "Name");
735 /// assert_eq!(columns[1].name(), "Symbol");
736 /// # Ok::<(), PolarsError>(())
737 /// ```
738 #[inline]
739 pub fn get_columns(&self) -> &[Column] {
740 &self.columns
741 }
742
743 #[inline]
744 /// Get mutable access to the underlying columns.
745 ///
746 /// # Safety
747 ///
748 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
749 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
750 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
751 /// calling [`DataFrame::clear_schema`].
752 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
753 &mut self.columns
754 }
755
756 #[inline]
757 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
758 pub fn clear_columns(&mut self) {
759 unsafe { self.get_columns_mut() }.clear();
760 self.clear_schema();
761 }
762
763 #[inline]
764 /// Extend the columns without checking for name collisions or height.
765 ///
766 /// # Safety
767 ///
768 /// The caller needs to ensure that:
769 /// - Column names are unique within the resulting [`DataFrame`].
770 /// - The length of each appended column matches the height of the [`DataFrame`]. For
771 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
772 /// with [`DataFrame::set_height`].
773 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
774 unsafe { self.get_columns_mut() }.extend(iter);
775 self.clear_schema();
776 }
777
778 /// Take ownership of the underlying columns vec.
779 pub fn take_columns(self) -> Vec<Column> {
780 self.columns
781 }
782
783 /// Iterator over the columns as [`Series`].
784 ///
785 /// # Example
786 ///
787 /// ```rust
788 /// # use polars_core::prelude::*;
789 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
790 /// let s2 = Column::new("Formula".into(), ["aĀ²+bĀ²=cĀ²", "H=-Ī£[P(x)log|P(x)|]"]);
791 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
792 ///
793 /// let mut iterator = df.iter();
794 ///
795 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
796 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
797 /// assert_eq!(iterator.next(), None);
798 /// # Ok::<(), PolarsError>(())
799 /// ```
800 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
801 self.materialized_column_iter()
802 }
803
804 /// # Example
805 ///
806 /// ```rust
807 /// # use polars_core::prelude::*;
808 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
809 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
810 ///
811 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
812 /// # Ok::<(), PolarsError>(())
813 /// ```
814 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
815 self.columns.iter().map(|s| s.name()).collect()
816 }
817
818 /// Get the [`Vec<PlSmallStr>`] representing the column names.
819 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
820 self.columns.iter().map(|s| s.name().clone()).collect()
821 }
822
823 pub fn get_column_names_str(&self) -> Vec<&str> {
824 self.columns.iter().map(|s| s.name().as_str()).collect()
825 }
826
827 /// Set the column names.
828 /// # Example
829 ///
830 /// ```rust
831 /// # use polars_core::prelude::*;
832 /// let mut df: DataFrame = df!("Mathematical set" => ["ā", "ā¤", "š»", "ā", "ā", "ā"])?;
833 /// df.set_column_names(["Set"])?;
834 ///
835 /// assert_eq!(df.get_column_names(), &["Set"]);
836 /// # Ok::<(), PolarsError>(())
837 /// ```
838 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
839 where
840 I: IntoIterator<Item = S>,
841 S: Into<PlSmallStr>,
842 {
843 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
844 self._set_column_names_impl(names.as_slice())
845 }
846
847 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
848 polars_ensure!(
849 names.len() == self.width(),
850 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
851 names.len(), self.width()
852 );
853 ensure_names_unique(names, |s| s.as_str())?;
854
855 let columns = mem::take(&mut self.columns);
856 self.columns = columns
857 .into_iter()
858 .zip(names)
859 .map(|(s, name)| {
860 let mut s = s;
861 s.rename(name.clone());
862 s
863 })
864 .collect();
865 self.clear_schema();
866 Ok(())
867 }
868
869 /// Get the data types of the columns in the [`DataFrame`].
870 ///
871 /// # Example
872 ///
873 /// ```rust
874 /// # use polars_core::prelude::*;
875 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
876 /// "Fraction" => [0.965, 0.035])?;
877 ///
878 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
879 /// # Ok::<(), PolarsError>(())
880 /// ```
881 pub fn dtypes(&self) -> Vec<DataType> {
882 self.columns.iter().map(|s| s.dtype().clone()).collect()
883 }
884
885 pub(crate) fn first_series_column(&self) -> Option<&Series> {
886 self.columns.iter().find_map(|col| col.as_series())
887 }
888
889 /// The number of chunks for the first column.
890 pub fn first_col_n_chunks(&self) -> usize {
891 match self.first_series_column() {
892 None if self.columns.is_empty() => 0,
893 None => 1,
894 Some(s) => s.n_chunks(),
895 }
896 }
897
898 /// The highest number of chunks for any column.
899 pub fn max_n_chunks(&self) -> usize {
900 self.columns
901 .iter()
902 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
903 .max()
904 .unwrap_or(0)
905 }
906
907 /// Get a reference to the schema fields of the [`DataFrame`].
908 ///
909 /// # Example
910 ///
911 /// ```rust
912 /// # use polars_core::prelude::*;
913 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
914 /// "Fraction" => [0.708, 0.292])?;
915 ///
916 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
917 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
918 ///
919 /// assert_eq!(earth.fields(), &[f1, f2]);
920 /// # Ok::<(), PolarsError>(())
921 /// ```
922 pub fn fields(&self) -> Vec<Field> {
923 self.columns
924 .iter()
925 .map(|s| s.field().into_owned())
926 .collect()
927 }
928
929 /// Get (height, width) of the [`DataFrame`].
930 ///
931 /// # Example
932 ///
933 /// ```rust
934 /// # use polars_core::prelude::*;
935 /// let df0: DataFrame = DataFrame::default();
936 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
937 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
938 /// "2" => [1, 2, 3, 4, 5])?;
939 ///
940 /// assert_eq!(df0.shape(), (0 ,0));
941 /// assert_eq!(df1.shape(), (5, 1));
942 /// assert_eq!(df2.shape(), (5, 2));
943 /// # Ok::<(), PolarsError>(())
944 /// ```
945 pub fn shape(&self) -> (usize, usize) {
946 (self.height, self.columns.len())
947 }
948
949 /// Get the width of the [`DataFrame`] which is the number of columns.
950 ///
951 /// # Example
952 ///
953 /// ```rust
954 /// # use polars_core::prelude::*;
955 /// let df0: DataFrame = DataFrame::default();
956 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
957 /// let df2: DataFrame = df!("Series 1" => [0; 0],
958 /// "Series 2" => [0; 0])?;
959 ///
960 /// assert_eq!(df0.width(), 0);
961 /// assert_eq!(df1.width(), 1);
962 /// assert_eq!(df2.width(), 2);
963 /// # Ok::<(), PolarsError>(())
964 /// ```
965 pub fn width(&self) -> usize {
966 self.columns.len()
967 }
968
969 /// Get the height of the [`DataFrame`] which is the number of rows.
970 ///
971 /// # Example
972 ///
973 /// ```rust
974 /// # use polars_core::prelude::*;
975 /// let df0: DataFrame = DataFrame::default();
976 /// let df1: DataFrame = df!("Currency" => ["ā¬", "$"])?;
977 /// let df2: DataFrame = df!("Currency" => ["ā¬", "$", "Ā„", "Ā£", "āæ"])?;
978 ///
979 /// assert_eq!(df0.height(), 0);
980 /// assert_eq!(df1.height(), 2);
981 /// assert_eq!(df2.height(), 5);
982 /// # Ok::<(), PolarsError>(())
983 /// ```
984 pub fn height(&self) -> usize {
985 self.height
986 }
987
988 /// Returns the size as number of rows * number of columns
989 pub fn size(&self) -> usize {
990 let s = self.shape();
991 s.0 * s.1
992 }
993
994 /// Returns `true` if the [`DataFrame`] contains no rows.
995 ///
996 /// # Example
997 ///
998 /// ```rust
999 /// # use polars_core::prelude::*;
1000 /// let df1: DataFrame = DataFrame::default();
1001 /// assert!(df1.is_empty());
1002 ///
1003 /// let df2: DataFrame = df!("First name" => ["Forever"],
1004 /// "Last name" => ["Alone"])?;
1005 /// assert!(!df2.is_empty());
1006 /// # Ok::<(), PolarsError>(())
1007 /// ```
1008 pub fn is_empty(&self) -> bool {
1009 matches!(self.shape(), (0, _) | (_, 0))
1010 }
1011
1012 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1013 ///
1014 /// # Safety
1015 ///
1016 /// This needs to be equal to the length of all the columns.
1017 pub unsafe fn set_height(&mut self, height: usize) {
1018 self.height = height;
1019 }
1020
1021 /// Add multiple [`Series`] to a [`DataFrame`].
1022 /// The added `Series` are required to have the same length as the existing columns.
1023 ///
1024 /// # Example
1025 ///
1026 /// ```rust
1027 /// # use polars_core::prelude::*;
1028 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1029 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1030 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1031 ///
1032 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1033 /// assert_eq!(df2.shape(), (3, 3));
1034 /// println!("{}", df2);
1035 /// # Ok::<(), PolarsError>(())
1036 /// ```
1037 ///
1038 /// Output:
1039 ///
1040 /// ```text
1041 /// shape: (3, 3)
1042 /// +---------+--------+----------+
1043 /// | Element | Proton | Electron |
1044 /// | --- | --- | --- |
1045 /// | str | i32 | i32 |
1046 /// +=========+========+==========+
1047 /// | Copper | 29 | 29 |
1048 /// +---------+--------+----------+
1049 /// | Silver | 47 | 47 |
1050 /// +---------+--------+----------+
1051 /// | Gold | 79 | 79 |
1052 /// +---------+--------+----------+
1053 /// ```
1054 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1055 let mut new_cols = self.columns.clone();
1056 new_cols.extend_from_slice(columns);
1057 DataFrame::new(new_cols)
1058 }
1059
1060 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1061 ///
1062 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1063 ///
1064 /// # Example
1065 ///
1066 /// ```rust
1067 /// # use polars_core::prelude::*;
1068 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1069 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1070 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1071 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1072 ///
1073 /// let df3: DataFrame = df1.vstack(&df2)?;
1074 ///
1075 /// assert_eq!(df3.shape(), (5, 2));
1076 /// println!("{}", df3);
1077 /// # Ok::<(), PolarsError>(())
1078 /// ```
1079 ///
1080 /// Output:
1081 ///
1082 /// ```text
1083 /// shape: (5, 2)
1084 /// +-----------+-------------------+
1085 /// | Element | Melting Point (K) |
1086 /// | --- | --- |
1087 /// | str | f64 |
1088 /// +===========+===================+
1089 /// | Copper | 1357.77 |
1090 /// +-----------+-------------------+
1091 /// | Silver | 1234.93 |
1092 /// +-----------+-------------------+
1093 /// | Gold | 1337.33 |
1094 /// +-----------+-------------------+
1095 /// | Platinum | 2041.4 |
1096 /// +-----------+-------------------+
1097 /// | Palladium | 1828.05 |
1098 /// +-----------+-------------------+
1099 /// ```
1100 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1101 let mut df = self.clone();
1102 df.vstack_mut(other)?;
1103 Ok(df)
1104 }
1105
1106 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1107 ///
1108 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1109 ///
1110 /// # Example
1111 ///
1112 /// ```rust
1113 /// # use polars_core::prelude::*;
1114 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1115 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1116 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1117 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1118 ///
1119 /// df1.vstack_mut(&df2)?;
1120 ///
1121 /// assert_eq!(df1.shape(), (5, 2));
1122 /// println!("{}", df1);
1123 /// # Ok::<(), PolarsError>(())
1124 /// ```
1125 ///
1126 /// Output:
1127 ///
1128 /// ```text
1129 /// shape: (5, 2)
1130 /// +-----------+-------------------+
1131 /// | Element | Melting Point (K) |
1132 /// | --- | --- |
1133 /// | str | f64 |
1134 /// +===========+===================+
1135 /// | Copper | 1357.77 |
1136 /// +-----------+-------------------+
1137 /// | Silver | 1234.93 |
1138 /// +-----------+-------------------+
1139 /// | Gold | 1337.33 |
1140 /// +-----------+-------------------+
1141 /// | Platinum | 2041.4 |
1142 /// +-----------+-------------------+
1143 /// | Palladium | 1828.05 |
1144 /// +-----------+-------------------+
1145 /// ```
1146 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1147 if self.width() != other.width() {
1148 polars_ensure!(
1149 self.width() == 0,
1150 ShapeMismatch:
1151 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1152 self.width(), other.width(),
1153 );
1154 self.columns.clone_from(&other.columns);
1155 self.height = other.height;
1156 return Ok(self);
1157 }
1158
1159 self.columns
1160 .iter_mut()
1161 .zip(other.columns.iter())
1162 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1163 ensure_can_extend(&*left, right)?;
1164 left.append(right).map_err(|e| {
1165 e.context(format!("failed to vstack column '{}'", right.name()).into())
1166 })?;
1167 Ok(())
1168 })?;
1169 self.height += other.height;
1170 Ok(self)
1171 }
1172
1173 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1174 ///
1175 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1176 ///
1177 /// # Panics
1178 /// Panics if the schemas don't match.
1179 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1180 self.columns
1181 .iter_mut()
1182 .zip(other.columns.iter())
1183 .for_each(|(left, right)| {
1184 left.append(right)
1185 .map_err(|e| {
1186 e.context(format!("failed to vstack column '{}'", right.name()).into())
1187 })
1188 .expect("should not fail");
1189 });
1190 self.height += other.height;
1191 }
1192
1193 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1194 ///
1195 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1196 ///
1197 /// # Panics
1198 /// Panics if the schemas don't match.
1199 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1200 self.columns
1201 .iter_mut()
1202 .zip(other.columns)
1203 .for_each(|(left, right)| {
1204 left.append_owned(right).expect("should not fail");
1205 });
1206 self.height += other.height;
1207 }
1208
1209 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1210 ///
1211 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1212 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1213 ///
1214 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1215 /// and thus will yield faster queries.
1216 ///
1217 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1218 /// online operations where you add `n` rows and rerun a query.
1219 ///
1220 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1221 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1222 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
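///
/// # Example
///
/// A minimal sketch (illustrative column names) contrasting `extend` with `vstack`:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df1 = df!("x" => [1, 2])?;
/// let df2 = df!("x" => [3, 4])?;
///
/// df1.extend(&df2)?;
/// assert_eq!(df1.shape(), (4, 1));
/// // The data is appended to the existing buffers, so no extra chunk is created.
/// assert_eq!(df1.first_col_n_chunks(), 1);
/// # Ok::<(), PolarsError>(())
/// ```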
1223 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1224 polars_ensure!(
1225 self.width() == other.width(),
1226 ShapeMismatch:
1227 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1228 self.width(), other.width(),
1229 );
1230
1231 self.columns
1232 .iter_mut()
1233 .zip(other.columns.iter())
1234 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1235 ensure_can_extend(&*left, right)?;
1236 left.extend(right).map_err(|e| {
1237 e.context(format!("failed to extend column '{}'", right.name()).into())
1238 })?;
1239 Ok(())
1240 })?;
1241 self.height += other.height;
1242 self.clear_schema();
1243 Ok(())
1244 }
1245
1246 /// Remove a column by name and return the column removed.
1247 ///
1248 /// # Example
1249 ///
1250 /// ```rust
1251 /// # use polars_core::prelude::*;
1252 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1253 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1254 ///
1255 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1256 /// assert!(s1.is_err());
1257 ///
1258 /// let s2: Column = df.drop_in_place("Animal")?;
1259 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1260 /// # Ok::<(), PolarsError>(())
1261 /// ```
1262 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1263 let idx = self.check_name_to_idx(name)?;
1264 self.clear_schema();
1265 Ok(self.columns.remove(idx))
1266 }
1267
1268 /// Return a new [`DataFrame`] where rows containing null values in the (optional) `subset` of columns are dropped.
1269 ///
1270 /// # Example
1271 ///
1272 /// ```no_run
1273 /// # use polars_core::prelude::*;
1274 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1275 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1276 /// assert_eq!(df1.shape(), (3, 2));
1277 ///
1278 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1279 /// assert_eq!(df2.shape(), (1, 2));
1280 /// println!("{}", df2);
1281 /// # Ok::<(), PolarsError>(())
1282 /// ```
1283 ///
1284 /// Output:
1285 ///
1286 /// ```text
1287 /// shape: (1, 2)
1288 /// +---------+---------------------+
1289 /// | Country | Tax revenue (% GDP) |
1290 /// | --- | --- |
1291 /// | str | f64 |
1292 /// +=========+=====================+
1293 /// | Malta | 32.7 |
1294 /// +---------+---------------------+
1295 /// ```
1296 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1297 where
1298 for<'a> &'a S: Into<PlSmallStr>,
1299 {
1300 if let Some(v) = subset {
1301 let v = self.select_columns(v)?;
1302 self._drop_nulls_impl(v.as_slice())
1303 } else {
1304 self._drop_nulls_impl(self.columns.as_slice())
1305 }
1306 }
1307
1308 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1309 // fast path for no nulls in df
1310 if subset.iter().all(|s| !s.has_nulls()) {
1311 return Ok(self.clone());
1312 }
1313
1314 let mut iter = subset.iter();
1315
1316 let mask = iter
1317 .next()
1318 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1319 let mut mask = mask.is_not_null();
1320
1321 for c in iter {
1322 mask = mask & c.is_not_null();
1323 }
1324 self.filter(&mask)
1325 }
1326
1327 /// Drop a column by name.
1328 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1329 /// the current one in place.
1330 ///
1331 /// # Example
1332 ///
1333 /// ```rust
1334 /// # use polars_core::prelude::*;
1335 /// let df1: DataFrame = df!("Ray type" => ["Ī±", "Ī²", "X", "Ī³"])?;
1336 /// let df2: DataFrame = df1.drop("Ray type")?;
1337 ///
1338 /// assert!(df2.is_empty());
1339 /// # Ok::<(), PolarsError>(())
1340 /// ```
1341 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1342 let idx = self.check_name_to_idx(name)?;
1343 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1344
1345 self.columns.iter().enumerate().for_each(|(i, s)| {
1346 if i != idx {
1347 new_cols.push(s.clone())
1348 }
1349 });
1350
1351 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1352 }
1353
1354 /// Drop columns that are in `names`.
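///
/// # Example
///
/// A minimal sketch with illustrative column names:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("a" => [1, 2], "b" => [3, 4], "c" => [5, 6])?;
/// let dropped = df.drop_many(["a", "c"]);
/// assert_eq!(dropped.get_column_names(), &["b"]);
/// # Ok::<(), PolarsError>(())
/// ```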
1355 pub fn drop_many<I, S>(&self, names: I) -> Self
1356 where
1357 I: IntoIterator<Item = S>,
1358 S: Into<PlSmallStr>,
1359 {
1360 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1361 self.drop_many_amortized(&names)
1362 }
1363
1364 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1365 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1366 if names.is_empty() {
1367 return self.clone();
1368 }
1369 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1370 self.columns.iter().for_each(|s| {
1371 if !names.contains(s.name()) {
1372 new_cols.push(s.clone())
1373 }
1374 });
1375
1376 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1377 }
1378
1379 /// Insert a new column at a given index without checking for duplicates.
1380 /// This can leave the [`DataFrame`] in an invalid state.
1381 fn insert_column_no_name_check(
1382 &mut self,
1383 index: usize,
1384 column: Column,
1385 ) -> PolarsResult<&mut Self> {
1386 polars_ensure!(
1387 self.width() == 0 || column.len() == self.height(),
1388 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1389 column.len(), self.height(),
1390 );
1391
1392 if self.width() == 0 {
1393 self.height = column.len();
1394 }
1395
1396 self.columns.insert(index, column);
1397 self.clear_schema();
1398 Ok(self)
1399 }
1400
1401 /// Insert a new column at a given index.
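///
/// # Example
///
/// A minimal sketch with illustrative column names:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
/// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
/// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
/// # Ok::<(), PolarsError>(())
/// ```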
1402 pub fn insert_column<S: IntoColumn>(
1403 &mut self,
1404 index: usize,
1405 column: S,
1406 ) -> PolarsResult<&mut Self> {
1407 let column = column.into_column();
1408 self.check_already_present(column.name().as_str())?;
1409 self.insert_column_no_name_check(index, column)
1410 }
1411
1412 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1413 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1414 self.replace_column(idx, column)?;
1415 } else {
1416 if self.width() == 0 {
1417 self.height = column.len();
1418 }
1419
1420 self.columns.push(column);
1421 self.clear_schema();
1422 }
1423 Ok(())
1424 }
1425
1426 /// Add a new column to this [`DataFrame`] or replace an existing one.
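///
/// # Example
///
/// A minimal sketch (illustrative names); a length-1 column is broadcast to the height of the
/// [`DataFrame`]:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
/// df.with_column(Column::new("b".into(), ["x"]))?;
/// assert_eq!(df.shape(), (3, 2));
/// # Ok::<(), PolarsError>(())
/// ```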
1427 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1428 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1429 let height = df.height();
1430 if column.len() == 1 && height > 1 {
1431 column = column.new_from_index(0, height);
1432 }
1433
1434 if column.len() == height || df.get_columns().is_empty() {
1435 df.add_column_by_search(column)?;
1436 Ok(df)
1437 }
1438 // special case for literals
1439 else if height == 0 && column.len() == 1 {
1440 let s = column.clear();
1441 df.add_column_by_search(s)?;
1442 Ok(df)
1443 } else {
1444 polars_bail!(
1445 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1446 column.len(), height,
1447 );
1448 }
1449 }
1450 let column = column.into_column();
1451 inner(self, column)
1452 }
1453
1454 /// Adds a column to the [`DataFrame`] without doing any checks
1455 /// on length or duplicates.
1456 ///
1457 /// # Safety
1458 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1459 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1460 debug_assert!(self.width() == 0 || self.height() == column.len());
1461 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1462
1463 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1464 // properly for `width` == 0.
1465 if self.width() == 0 {
1466 unsafe { self.set_height(column.len()) };
1467 }
1468 unsafe { self.get_columns_mut() }.push(column);
1469 self.clear_schema();
1470
1471 self
1472 }
1473
1474 // Note: the schema can be either the input or the output schema.
1475 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1476 let name = c.name();
1477 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1478 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1479 // Given schema is output_schema and we can push.
1480 if idx == self.columns.len() {
1481 if self.width() == 0 {
1482 self.height = c.len();
1483 }
1484
1485 self.columns.push(c);
1486 self.clear_schema();
1487 }
1488 // Schema is incorrect fallback to search
1489 else {
1490 debug_assert!(false);
1491 self.add_column_by_search(c)?;
1492 }
1493 } else {
1494 self.replace_column(idx, c)?;
1495 }
1496 } else {
1497 if self.width() == 0 {
1498 self.height = c.len();
1499 }
1500
1501 self.columns.push(c);
1502 self.clear_schema();
1503 }
1504
1505 Ok(())
1506 }
1507
1508 // Note: the schema can be either the input or the output schema.
1509 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1510 for (i, s) in series.into_iter().enumerate() {
1511 // we need to branch here
1512 // because users can add multiple columns with the same name
1513 if i == 0 || schema.get(s.name().as_str()).is_some() {
1514 self.with_column_and_schema(s.into_column(), schema)?;
1515 } else {
1516 self.with_column(s.clone().into_column())?;
1517 }
1518 }
1519 Ok(())
1520 }
1521
1522 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1523 for (i, s) in columns.into_iter().enumerate() {
1524 // we need to branch here
1525 // because users can add multiple columns with the same name
1526 if i == 0 || schema.get(s.name().as_str()).is_some() {
1527 self.with_column_and_schema(s, schema)?;
1528 } else {
1529 self.with_column(s.clone())?;
1530 }
1531 }
1532
1533 Ok(())
1534 }
1535
1536 /// Add a new column to this [`DataFrame`] or replace an existing one.
1537 /// Uses an existing schema to amortize lookups.
1538 /// If the schema is incorrect, we will fall back to a linear search.
1539 ///
1540 /// Note: the schema can be either the input or the output schema.
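///
/// # Example
///
/// A minimal sketch (illustrative names), using the DataFrame's own schema to amortize the lookup:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2])?;
/// let schema = df.schema().clone();
/// df.with_column_and_schema(Column::new("b".into(), [3, 4]), &schema)?;
/// assert_eq!(df.get_column_names(), &["a", "b"]);
/// # Ok::<(), PolarsError>(())
/// ```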
1541 pub fn with_column_and_schema<C: IntoColumn>(
1542 &mut self,
1543 column: C,
1544 schema: &Schema,
1545 ) -> PolarsResult<&mut Self> {
1546 let mut column = column.into_column();
1547
1548 let height = self.height();
1549 if column.len() == 1 && height > 1 {
1550 column = column.new_from_index(0, height);
1551 }
1552
1553 if column.len() == height || self.columns.is_empty() {
1554 self.add_column_by_schema(column, schema)?;
1555 Ok(self)
1556 }
1557 // special case for literals
1558 else if height == 0 && column.len() == 1 {
1559 let s = column.clear();
1560 self.add_column_by_schema(s, schema)?;
1561 Ok(self)
1562 } else {
1563 polars_bail!(
1564 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1565 column.len(), height,
1566 );
1567 }
1568 }
1569
1570 /// Get a row in the [`DataFrame`]. Beware this is slow.
1571 ///
1572 /// # Example
1573 ///
1574 /// ```
1575 /// # use polars_core::prelude::*;
1576 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1577 /// df.get(idx)
1578 /// }
1579 /// ```
1580 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1581 match self.columns.first() {
1582 Some(s) => {
1583 if s.len() <= idx {
1584 return None;
1585 }
1586 },
1587 None => return None,
1588 }
1589 // SAFETY: we just checked bounds
1590 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1591 }
1592
1593 /// Select a [`Series`] by index.
1594 ///
1595 /// # Example
1596 ///
1597 /// ```rust
1598 /// # use polars_core::prelude::*;
1599 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1600 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1601 ///
1602 /// let s1: Option<&Column> = df.select_at_idx(0);
1603 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1604 ///
1605 /// assert_eq!(s1, Some(&s2));
1606 /// # Ok::<(), PolarsError>(())
1607 /// ```
1608 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1609 self.columns.get(idx)
1610 }
1611
1612 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1613 ///
1614 /// # Examples
1615 ///
1616 /// ```rust
1617 /// # use polars_core::prelude::*;
1618 /// let df = df! {
1619 /// "0" => [0, 0, 0],
1620 /// "1" => [1, 1, 1],
1621 /// "2" => [2, 2, 2]
1622 /// }?;
1623 ///
1624 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1625 /// assert!(df.equals(&df.select_by_range(..)?));
1626 /// # Ok::<(), PolarsError>(())
1627 /// ```
1628 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1629 where
1630 R: ops::RangeBounds<usize>,
1631 {
1632 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1633 // because it is a nightly feature. We should switch to it once that function is stabilized.
1634 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1635 where
1636 R: ops::RangeBounds<usize>,
1637 {
1638 let len = bounds.end;
1639
1640 let start: ops::Bound<&usize> = range.start_bound();
1641 let start = match start {
1642 ops::Bound::Included(&start) => start,
1643 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1644 panic!("attempted to index slice from after maximum usize");
1645 }),
1646 ops::Bound::Unbounded => 0,
1647 };
1648
1649 let end: ops::Bound<&usize> = range.end_bound();
1650 let end = match end {
1651 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1652 panic!("attempted to index slice up to maximum usize");
1653 }),
1654 ops::Bound::Excluded(&end) => end,
1655 ops::Bound::Unbounded => len,
1656 };
1657
1658 if start > end {
1659 panic!("slice index starts at {start} but ends at {end}");
1660 }
1661 if end > len {
1662 panic!("range end index {end} out of range for slice of length {len}",);
1663 }
1664
1665 ops::Range { start, end }
1666 }
1667
1668 let colnames = self.get_column_names_owned();
1669 let range = get_range(range, ..colnames.len());
1670
1671 self._select_impl(&colnames[range])
1672 }
1673
1674 /// Get column index of a [`Series`] by name.
1675 /// # Example
1676 ///
1677 /// ```rust
1678 /// # use polars_core::prelude::*;
1679 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1680 /// "Health" => [100, 200, 500],
1681 /// "Mana" => [250, 100, 0],
1682 /// "Strength" => [30, 150, 300])?;
1683 ///
1684 /// assert_eq!(df.get_column_index("Name"), Some(0));
1685 /// assert_eq!(df.get_column_index("Health"), Some(1));
1686 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1687 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1688 /// assert_eq!(df.get_column_index("Haste"), None);
1689 /// # Ok::<(), PolarsError>(())
1690 /// ```
1691 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1692 let schema = self.schema();
1693 if let Some(idx) = schema.index_of(name) {
1694 if self
1695 .get_columns()
1696 .get(idx)
1697 .is_some_and(|c| c.name() == name)
1698 {
1699 return Some(idx);
1700 }
1701 }
1702
1703 self.columns.iter().position(|s| s.name().as_str() == name)
1704 }
1705
1706 /// Get column index of a [`Series`] by name.
1707 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1708 self.get_column_index(name)
1709 .ok_or_else(|| polars_err!(col_not_found = name))
1710 }
1711
1712 /// Select a single column by name.
1713 ///
1714 /// # Example
1715 ///
1716 /// ```rust
1717 /// # use polars_core::prelude::*;
1718 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1719 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1720 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1721 ///
1722 /// assert_eq!(df.column("Password")?, &s1);
1723 /// # Ok::<(), PolarsError>(())
1724 /// ```
1725 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1726 let idx = self.try_get_column_index(name)?;
1727 Ok(self.select_at_idx(idx).unwrap())
1728 }
1729
1730 /// Select multiple columns by name.
1731 ///
1732 /// # Example
1733 ///
1734 /// ```rust
1735 /// # use polars_core::prelude::*;
1736 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1737 /// "Max weight (kg)" => [16.0, 35.89])?;
1738 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1739 ///
1740 /// assert_eq!(&df[0], sv[0]);
1741 /// assert_eq!(&df[1], sv[1]);
1742 /// # Ok::<(), PolarsError>(())
1743 /// ```
1744 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1745 where
1746 I: IntoIterator<Item = S>,
1747 S: AsRef<str>,
1748 {
1749 names
1750 .into_iter()
1751 .map(|name| self.column(name.as_ref()))
1752 .collect()
1753 }
1754
1755 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1756 ///
1757 /// # Examples
1758 ///
1759 /// ```
1760 /// # use polars_core::prelude::*;
1761 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1762 /// df.select(["foo", "bar"])
1763 /// }
1764 /// ```
1765 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1766 where
1767 I: IntoIterator<Item = S>,
1768 S: Into<PlSmallStr>,
1769 {
1770 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1771 self._select_impl(cols.as_slice())
1772 }
1773
1774 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1775 ensure_names_unique(cols, |s| s.as_str())?;
1776 self._select_impl_unchecked(cols)
1777 }
1778
1779 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1780 let selected = self.select_columns_impl(cols)?;
1781 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1782 }
1783
1784 /// Select with a known schema. The schema names must match the column names of this DataFrame.
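    /// # Example
    ///
    /// A compile-only sketch; the column names are hypothetical and `schema` is
    /// assumed to describe exactly the columns of `df`:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame, schema: &SchemaRef) -> PolarsResult<DataFrame> {
    ///     df.select_with_schema(["foo", "bar"], schema)
    /// }
    /// ```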
1785 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1786 where
1787 I: IntoIterator<Item = S>,
1788 S: Into<PlSmallStr>,
1789 {
1790 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1791 self._select_with_schema_impl(&cols, schema, true)
1792 }
1793
1794 /// Select with a known schema without checking for duplicates in `selection`.
1795 /// The schema names must match the column names of this DataFrame.
1796 pub fn select_with_schema_unchecked<I, S>(
1797 &self,
1798 selection: I,
1799 schema: &Schema,
1800 ) -> PolarsResult<Self>
1801 where
1802 I: IntoIterator<Item = S>,
1803 S: Into<PlSmallStr>,
1804 {
1805 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1806 self._select_with_schema_impl(&cols, schema, false)
1807 }
1808
1809     /// The schema names must match the column names of this DataFrame.
1810 pub fn _select_with_schema_impl(
1811 &self,
1812 cols: &[PlSmallStr],
1813 schema: &Schema,
1814 check_duplicates: bool,
1815 ) -> PolarsResult<Self> {
1816 if check_duplicates {
1817 ensure_names_unique(cols, |s| s.as_str())?;
1818 }
1819
1820 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1821 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1822 }
1823
1824     /// A non-generic implementation to reduce compiler bloat.
1825 fn select_columns_impl_with_schema(
1826 &self,
1827 cols: &[PlSmallStr],
1828 schema: &Schema,
1829 ) -> PolarsResult<Vec<Column>> {
1830 if cfg!(debug_assertions) {
1831 ensure_matching_schema_names(schema, self.schema())?;
1832 }
1833
1834 cols.iter()
1835 .map(|name| {
1836 let index = schema.try_get_full(name.as_str())?.0;
1837 Ok(self.columns[index].clone())
1838 })
1839 .collect()
1840 }
1841
1842 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1843 where
1844 I: IntoIterator<Item = S>,
1845 S: Into<PlSmallStr>,
1846 {
1847 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1848 self.select_physical_impl(&cols)
1849 }
1850
1851 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1852 ensure_names_unique(cols, |s| s.as_str())?;
1853 let selected = self.select_columns_physical_impl(cols)?;
1854 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1855 }
1856
1857 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1858 ///
1859 /// # Example
1860 ///
1861 /// ```rust
1862 /// # use polars_core::prelude::*;
1863 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1864 /// "Carbon" => [1, 2, 3],
1865 /// "Hydrogen" => [4, 6, 8])?;
1866 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1867 ///
1868 /// assert_eq!(df["Carbon"], sv[0]);
1869 /// assert_eq!(df["Hydrogen"], sv[1]);
1870 /// # Ok::<(), PolarsError>(())
1871 /// ```
1872 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1873 let cols = selection.into_vec();
1874 self.select_columns_impl(&cols)
1875 }
1876
1877 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1878 self.columns
1879 .iter()
1880 .enumerate()
1881 .map(|(i, s)| (s.name().as_str(), i))
1882 .collect()
1883 }
1884
1885     /// A non-generic implementation to reduce compiler bloat.
1886 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1887 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1888 let name_to_idx = self._names_to_idx_map();
1889 cols.iter()
1890 .map(|name| {
1891 let idx = *name_to_idx
1892 .get(name.as_str())
1893 .ok_or_else(|| polars_err!(col_not_found = name))?;
1894 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1895 })
1896 .collect::<PolarsResult<Vec<_>>>()?
1897 } else {
1898 cols.iter()
1899 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1900 .collect::<PolarsResult<Vec<_>>>()?
1901 };
1902
1903 Ok(selected)
1904 }
1905
1906     /// A non-generic implementation to reduce compiler bloat.
1907 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1908 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1909             // We hash because there are users that have millions of columns.
1910             // See: https://github.com/pola-rs/polars/issues/1023
1911 let name_to_idx = self._names_to_idx_map();
1912
1913 cols.iter()
1914 .map(|name| {
1915 let idx = *name_to_idx
1916 .get(name.as_str())
1917 .ok_or_else(|| polars_err!(col_not_found = name))?;
1918 Ok(self.select_at_idx(idx).unwrap().clone())
1919 })
1920 .collect::<PolarsResult<Vec<_>>>()?
1921 } else {
1922 cols.iter()
1923 .map(|c| self.column(c.as_str()).cloned())
1924 .collect::<PolarsResult<Vec<_>>>()?
1925 };
1926
1927 Ok(selected)
1928 }
1929
1930 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1931         // If there is a filtered column, just take its length as the new height.
1932 if let Some(fst) = filtered.first() {
1933 return fst.len();
1934 }
1935
1936 // Otherwise, count the number of values that would be filtered and return that height.
1937 let num_trues = mask.num_trues();
1938 if mask.len() == self.height() {
1939 num_trues
1940 } else {
1941 // This is for broadcasting masks
1942 debug_assert!(num_trues == 0 || num_trues == 1);
1943 self.height() * num_trues
1944 }
1945 }
1946
1947 /// Take the [`DataFrame`] rows by a boolean mask.
1948 ///
1949 /// # Example
1950 ///
1951 /// ```
1952 /// # use polars_core::prelude::*;
1953 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1954 /// let mask = df.column("sepal_width")?.is_not_null();
1955 /// df.filter(&mask)
1956 /// }
1957 /// ```
1958 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1959 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1960 let height = self.filter_height(&new_col, mask);
1961
1962 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1963 }
1964
1965 /// Same as `filter` but does not parallelize.
1966 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1967 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1968 let height = self.filter_height(&new_col, mask);
1969
1970 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1971 }
1972
1973 /// Take [`DataFrame`] rows by index values.
1974 ///
1975 /// # Example
1976 ///
1977 /// ```
1978 /// # use polars_core::prelude::*;
1979 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1980 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1981 /// df.take(&idx)
1982 /// }
1983 /// ```
1984 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1985 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
1986
1987 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
1988 }
1989
1990 /// # Safety
1991 /// The indices must be in-bounds.
1992 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1993 self.take_unchecked_impl(idx, true)
1994 }
1995
1996 /// # Safety
1997 /// The indices must be in-bounds.
1998 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1999 let cols = if allow_threads {
2000 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
2001 } else {
2002 self._apply_columns(&|s| s.take_unchecked(idx))
2003 };
2004 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2005 }
2006
2007 /// # Safety
2008 /// The indices must be in-bounds.
2009 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2010 self.take_slice_unchecked_impl(idx, true)
2011 }
2012
2013 /// # Safety
2014 /// The indices must be in-bounds.
2015 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2016 let cols = if allow_threads {
2017 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2018 } else {
2019 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2020 };
2021 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2022 }
2023
2024 /// Rename a column in the [`DataFrame`].
2025 ///
2026 /// # Example
2027 ///
2028 /// ```
2029 /// # use polars_core::prelude::*;
2030 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2031 /// let original_name = "foo";
2032 /// let new_name = "bar";
2033 /// df.rename(original_name, new_name.into())
2034 /// }
2035 /// ```
2036 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2037 if column == name.as_str() {
2038 return Ok(self);
2039 }
2040 polars_ensure!(
2041 !self.schema().contains(&name),
2042 Duplicate: "column rename attempted with already existing name \"{name}\""
2043 );
2044
2045 self.get_column_index(column)
2046 .and_then(|idx| self.columns.get_mut(idx))
2047 .ok_or_else(|| polars_err!(col_not_found = column))
2048 .map(|c| c.rename(name))?;
2049 Ok(self)
2050 }
2051
2052 /// Sort [`DataFrame`] in place.
2053 ///
2054 /// See [`DataFrame::sort`] for more instruction.
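    ///
    /// # Example
    ///
    /// A compile-only sketch with a hypothetical column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &mut DataFrame) -> PolarsResult<()> {
    ///     df.sort_in_place(["a"], SortMultipleOptions::default())?;
    ///     Ok(())
    /// }
    /// ```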
2055 pub fn sort_in_place(
2056 &mut self,
2057 by: impl IntoVec<PlSmallStr>,
2058 sort_options: SortMultipleOptions,
2059 ) -> PolarsResult<&mut Self> {
2060 let by_column = self.select_columns(by)?;
2061 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2062 Ok(self)
2063 }
2064
2065 #[doc(hidden)]
2066     /// This is the dispatch of Self::sort; it exists to reduce the compile-time bloat caused by monomorphization.
2067 pub fn sort_impl(
2068 &self,
2069 by_column: Vec<Column>,
2070 mut sort_options: SortMultipleOptions,
2071 slice: Option<(i64, usize)>,
2072 ) -> PolarsResult<Self> {
2073 if by_column.is_empty() {
2074 // If no columns selected, any order (including original order) is correct.
2075 return if let Some((offset, len)) = slice {
2076 Ok(self.slice(offset, len))
2077 } else {
2078 Ok(self.clone())
2079 };
2080 }
2081
2082         // Note that the by_column argument may also contain evaluated expressions from
2083         // polars-lazy that are not even present in this dataframe. Therefore, when we
2084         // try to set the first column as sorted, we ignore the error, as the
2085         // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2086 let first_descending = sort_options.descending[0];
2087 let first_by_column = by_column[0].name().to_string();
2088
2089 let set_sorted = |df: &mut DataFrame| {
2090 // Mark the first sort column as sorted; if the column does not exist it
2091 // is ok, because we sorted by an expression not present in the dataframe
2092 let _ = df.apply(&first_by_column, |s| {
2093 let mut s = s.clone();
2094 if first_descending {
2095 s.set_sorted_flag(IsSorted::Descending)
2096 } else {
2097 s.set_sorted_flag(IsSorted::Ascending)
2098 }
2099 s
2100 });
2101 };
2102 if self.is_empty() {
2103 let mut out = self.clone();
2104 set_sorted(&mut out);
2105 return Ok(out);
2106 }
2107
2108 if let Some((0, k)) = slice {
2109 if k < self.len() {
2110 return self.bottom_k_impl(k, by_column, sort_options);
2111 }
2112 }
2113         // Check if the required column is already sorted; if so, we can exit early.
2114         // We only do this when there is a single sort column; for multiple columns
2115         // it would be complicated to verify.
2116 #[cfg(feature = "dtype-categorical")]
2117 let is_not_categorical_enum =
2118 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2119 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2120
2121 #[cfg(not(feature = "dtype-categorical"))]
2122 #[allow(non_upper_case_globals)]
2123 const is_not_categorical_enum: bool = true;
2124
2125 if by_column.len() == 1 && is_not_categorical_enum {
2126 let required_sorting = if sort_options.descending[0] {
2127 IsSorted::Descending
2128 } else {
2129 IsSorted::Ascending
2130 };
2131             // If the null count is 0, then nulls_last doesn't matter.
2132             // It is safe to get the value at the last position since the dataframe is not empty (handled above).
2133 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2134 && ((by_column[0].null_count() == 0)
2135 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2136 == sort_options.nulls_last[0]);
2137
2138 if no_sorting_required {
2139 return if let Some((offset, len)) = slice {
2140 Ok(self.slice(offset, len))
2141 } else {
2142 Ok(self.clone())
2143 };
2144 }
2145 }
2146
2147 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2148
2149 // a lot of indirection in both sorting and take
2150 let mut df = self.clone();
2151 let df = df.as_single_chunk_par();
2152 let mut take = match (by_column.len(), has_nested) {
2153 (1, false) => {
2154 let s = &by_column[0];
2155 let options = SortOptions {
2156 descending: sort_options.descending[0],
2157 nulls_last: sort_options.nulls_last[0],
2158 multithreaded: sort_options.multithreaded,
2159 maintain_order: sort_options.maintain_order,
2160 limit: sort_options.limit,
2161 };
2162 // fast path for a frame with a single series
2163 // no need to compute the sort indices and then take by these indices
2164 // simply sort and return as frame
2165 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2166 let mut out = s.sort_with(options)?;
2167 if let Some((offset, len)) = slice {
2168 out = out.slice(offset, len);
2169 }
2170 return Ok(out.into_frame());
2171 }
2172 s.arg_sort(options)
2173 },
2174 _ => {
2175 if sort_options.nulls_last.iter().all(|&x| x)
2176 || has_nested
2177 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2178 {
2179 argsort_multiple_row_fmt(
2180 &by_column,
2181 sort_options.descending,
2182 sort_options.nulls_last,
2183 sort_options.multithreaded,
2184 )?
2185 } else {
2186 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2187 first
2188 .as_materialized_series()
2189 .arg_sort_multiple(&other, &sort_options)?
2190 }
2191 },
2192 };
2193
2194 if let Some((offset, len)) = slice {
2195 take = take.slice(offset, len);
2196 }
2197
2198 // SAFETY:
2199 // the created indices are in bounds
2200 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2201 set_sorted(&mut df);
2202 Ok(df)
2203 }
2204
2205 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2206 ///
2207 /// This dataframe does not necessarily have a specified schema and may be changed at any
2208 /// point. It is primarily used for debugging.
2209 pub fn _to_metadata(&self) -> DataFrame {
2210 let num_columns = self.columns.len();
2211
2212 let mut column_names =
2213 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2214 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2215 let mut sorted_asc_ca =
2216 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2217 let mut sorted_dsc_ca =
2218 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2219 let mut fast_explode_list_ca =
2220 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2221 let mut materialized_at_ca =
2222 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2223
2224 for col in &self.columns {
2225 let flags = col.get_flags();
2226
2227 let (repr, materialized_at) = match col {
2228 Column::Series(s) => ("series", s.materialized_at()),
2229 Column::Partitioned(_) => ("partitioned", None),
2230 Column::Scalar(_) => ("scalar", None),
2231 };
2232 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2233 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2234 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2235
2236 column_names.append_value(col.name().clone());
2237 repr_ca.append_value(repr);
2238 sorted_asc_ca.append_value(sorted_asc);
2239 sorted_dsc_ca.append_value(sorted_dsc);
2240 fast_explode_list_ca.append_value(fast_explode_list);
2241 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2242 }
2243
2244 unsafe {
2245 DataFrame::new_no_checks(
2246 self.width(),
2247 vec![
2248 column_names.finish().into_column(),
2249 repr_ca.finish().into_column(),
2250 sorted_asc_ca.finish().into_column(),
2251 sorted_dsc_ca.finish().into_column(),
2252 fast_explode_list_ca.finish().into_column(),
2253 materialized_at_ca.finish().into_column(),
2254 ],
2255 )
2256 }
2257 }
2258
2259 /// Return a sorted clone of this [`DataFrame`].
2260 ///
2261     /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2262 /// # Example
2263 ///
2264 /// Sort by a single column with default options:
2265 /// ```
2266 /// # use polars_core::prelude::*;
2267 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2268 /// df.sort(["sepal_width"], Default::default())
2269 /// }
2270 /// ```
2271 /// Sort by a single column with specific order:
2272 /// ```
2273 /// # use polars_core::prelude::*;
2274 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2275 /// df.sort(
2276 /// ["sepal_width"],
2277 /// SortMultipleOptions::new()
2278 /// .with_order_descending(descending)
2279 /// )
2280 /// }
2281 /// ```
2282     /// Sort by multiple columns, specifying the order for each column:
2283 /// ```
2284 /// # use polars_core::prelude::*;
2285 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2286 /// df.sort(
2287 /// ["sepal_width", "sepal_length"],
2288 /// SortMultipleOptions::new()
2289 /// .with_order_descending_multi([false, true])
2290 /// )
2291 /// }
2292 /// ```
2293 /// See [`SortMultipleOptions`] for more options.
2294 ///
2295 /// Also see [`DataFrame::sort_in_place`].
2296 pub fn sort(
2297 &self,
2298 by: impl IntoVec<PlSmallStr>,
2299 sort_options: SortMultipleOptions,
2300 ) -> PolarsResult<Self> {
2301 let mut df = self.clone();
2302 df.sort_in_place(by, sort_options)?;
2303 Ok(df)
2304 }
2305
2306 /// Replace a column with a [`Series`].
2307 ///
2308 /// # Example
2309 ///
2310 /// ```rust
2311 /// # use polars_core::prelude::*;
2312 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2313     ///                              "Area (km²)" => [9_833_520, 9_596_961])?;
2314 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2315 ///
2316 /// assert!(df.replace("Nation", s.clone()).is_err());
2317 /// assert!(df.replace("Country", s).is_ok());
2318 /// # Ok::<(), PolarsError>(())
2319 /// ```
2320 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2321 self.apply(column, |_| new_col.into_series())
2322 }
2323
2324 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2325     /// is that here the `column` argument determines the name of the resulting column and not the name
2326 /// of the `Series` passed to this method.
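    ///
    /// # Example
    ///
    /// A minimal sketch with made-up column names, mirroring the unit test at the bottom of this module:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // The added column is named "b"; the name of the passed Series is ignored.
    /// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
    /// assert_eq!(df.get_column_names(), &["a", "b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```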
2327 pub fn replace_or_add<S: IntoSeries>(
2328 &mut self,
2329 column: PlSmallStr,
2330 new_col: S,
2331 ) -> PolarsResult<&mut Self> {
2332 let mut new_col = new_col.into_series();
2333 new_col.rename(column);
2334 self.with_column(new_col)
2335 }
2336
2337 /// Replace column at index `idx` with a [`Series`].
2338 ///
2339 /// # Example
2340 ///
2341     /// ```ignore
2342 /// # use polars_core::prelude::*;
2343 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2344 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2345 /// let mut df = DataFrame::new(vec![s0, s1])?;
2346 ///
2347 /// // Add 32 to get lowercase ascii values
2348 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2349 /// # Ok::<(), PolarsError>(())
2350 /// ```
2351 pub fn replace_column<C: IntoColumn>(
2352 &mut self,
2353 index: usize,
2354 new_column: C,
2355 ) -> PolarsResult<&mut Self> {
2356 polars_ensure!(
2357 index < self.width(),
2358 ShapeMismatch:
2359 "unable to replace at index {}, the DataFrame has only {} columns",
2360 index, self.width(),
2361 );
2362 let mut new_column = new_column.into_column();
2363 polars_ensure!(
2364 new_column.len() == self.height(),
2365 ShapeMismatch:
2366 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2367 new_column.len(), self.height(),
2368 );
2369 let old_col = &mut self.columns[index];
2370 mem::swap(old_col, &mut new_column);
2371 self.clear_schema();
2372 Ok(self)
2373 }
2374
2375     /// Apply a closure to a column. This is the recommended way to do in-place modification.
2376 ///
2377 /// # Example
2378 ///
2379 /// ```rust
2380 /// # use polars_core::prelude::*;
2381 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2382 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2383 /// let mut df = DataFrame::new(vec![s0, s1])?;
2384 ///
2385 /// fn str_to_len(str_val: &Column) -> Column {
2386 /// str_val.str()
2387 /// .unwrap()
2388 /// .into_iter()
2389 /// .map(|opt_name: Option<&str>| {
2390 /// opt_name.map(|name: &str| name.len() as u32)
2391 /// })
2392 /// .collect::<UInt32Chunked>()
2393 /// .into_column()
2394 /// }
2395 ///
2396 /// // Replace the names column by the length of the names.
2397 /// df.apply("names", str_to_len);
2398 /// # Ok::<(), PolarsError>(())
2399 /// ```
2400 /// Results in:
2401 ///
2402 /// ```text
2403 /// +--------+-------+
2404     /// | foo    | names |
2405     /// | ---    | ---   |
2406     /// | str    | u32   |
2407 /// +========+=======+
2408 /// | "ham" | 4 |
2409 /// +--------+-------+
2410 /// | "spam" | 6 |
2411 /// +--------+-------+
2412 /// | "egg" | 3 |
2413 /// +--------+-------+
2414 /// ```
2415 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2416 where
2417 F: FnOnce(&Column) -> C,
2418 C: IntoColumn,
2419 {
2420 let idx = self.check_name_to_idx(name)?;
2421 self.apply_at_idx(idx, f)
2422 }
2423
2424     /// Apply a closure to a column at index `idx`. This is the recommended way to do in-place
2425 /// modification.
2426 ///
2427 /// # Example
2428 ///
2429 /// ```rust
2430 /// # use polars_core::prelude::*;
2431 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2432 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2433 /// let mut df = DataFrame::new(vec![s0, s1])?;
2434 ///
2435 /// // Add 32 to get lowercase ascii values
2436 /// df.apply_at_idx(1, |s| s + 32);
2437 /// # Ok::<(), PolarsError>(())
2438 /// ```
2439 /// Results in:
2440 ///
2441 /// ```text
2442 /// +--------+-------+
2443 /// | foo | ascii |
2444 /// | --- | --- |
2445 /// | str | i32 |
2446 /// +========+=======+
2447 /// | "ham" | 102 |
2448 /// +--------+-------+
2449 /// | "spam" | 111 |
2450 /// +--------+-------+
2451 /// | "egg" | 111 |
2452 /// +--------+-------+
2453 /// ```
2454 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2455 where
2456 F: FnOnce(&Column) -> C,
2457 C: IntoColumn,
2458 {
2459 let df_height = self.height();
2460 let width = self.width();
2461 let col = self.columns.get_mut(idx).ok_or_else(|| {
2462 polars_err!(
2463 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2464 idx, width
2465 )
2466 })?;
2467 let name = col.name().clone();
2468 let new_col = f(col).into_column();
2469 match new_col.len() {
2470 1 => {
2471 let new_col = new_col.new_from_index(0, df_height);
2472 let _ = mem::replace(col, new_col);
2473 },
2474 len if (len == df_height) => {
2475 let _ = mem::replace(col, new_col);
2476 },
2477 len => polars_bail!(
2478 ShapeMismatch:
2479 "resulting Series has length {} while the DataFrame has height {}",
2480 len, df_height
2481 ),
2482 }
2483
2484 // make sure the name remains the same after applying the closure
2485 unsafe {
2486 let col = self.columns.get_unchecked_mut(idx);
2487 col.rename(name);
2488 }
2489 Ok(self)
2490 }
2491
2492     /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in-place
2493 /// modification.
2494 ///
2495 /// # Example
2496 ///
2497     /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2498 ///
2499 /// ```rust
2500 /// # use polars_core::prelude::*;
2501 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2502 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2503 /// let mut df = DataFrame::new(vec![s0, s1])?;
2504 ///
2505 /// let idx = vec![0, 1, 4];
2506 ///
2507 /// df.try_apply("foo", |c| {
2508 /// c.str()?
2509 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2510 /// });
2511 /// # Ok::<(), PolarsError>(())
2512 /// ```
2513 /// Results in:
2514 ///
2515 /// ```text
2516 /// +---------------------+--------+
2517 /// | foo | values |
2518 /// | --- | --- |
2519 /// | str | i32 |
2520 /// +=====================+========+
2521 /// | "ham-is-modified" | 1 |
2522 /// +---------------------+--------+
2523 /// | "spam-is-modified" | 2 |
2524 /// +---------------------+--------+
2525 /// | "egg" | 3 |
2526 /// +---------------------+--------+
2527 /// | "bacon" | 4 |
2528 /// +---------------------+--------+
2529 /// | "quack-is-modified" | 5 |
2530 /// +---------------------+--------+
2531 /// ```
2532 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2533 where
2534 F: FnOnce(&Column) -> PolarsResult<C>,
2535 C: IntoColumn,
2536 {
2537 let width = self.width();
2538 let col = self.columns.get_mut(idx).ok_or_else(|| {
2539 polars_err!(
2540 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2541 idx, width
2542 )
2543 })?;
2544 let name = col.name().clone();
2545
2546 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2547
2548 // make sure the name remains the same after applying the closure
2549 unsafe {
2550 let col = self.columns.get_unchecked_mut(idx);
2551 col.rename(name);
2552 }
2553 Ok(self)
2554 }
2555
2556     /// Apply a closure that may fail to a column. This is the recommended way to do in-place
2557 /// modification.
2558 ///
2559 /// # Example
2560 ///
2561     /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2562 ///
2563 /// ```rust
2564 /// # use polars_core::prelude::*;
2565 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2566 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2567 /// let mut df = DataFrame::new(vec![s0, s1])?;
2568 ///
2569 /// // create a mask
2570 /// let values = df.column("values")?.as_materialized_series();
2571 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2572 ///
2573 /// df.try_apply("foo", |c| {
2574 /// c.str()?
2575 /// .set(&mask, Some("not_within_bounds"))
2576 /// });
2577 /// # Ok::<(), PolarsError>(())
2578 /// ```
2579 /// Results in:
2580 ///
2581 /// ```text
2582 /// +---------------------+--------+
2583 /// | foo | values |
2584 /// | --- | --- |
2585 /// | str | i32 |
2586 /// +=====================+========+
2587 /// | "not_within_bounds" | 1 |
2588 /// +---------------------+--------+
2589 /// | "spam" | 2 |
2590 /// +---------------------+--------+
2591 /// | "egg" | 3 |
2592 /// +---------------------+--------+
2593 /// | "bacon" | 4 |
2594 /// +---------------------+--------+
2595 /// | "not_within_bounds" | 5 |
2596 /// +---------------------+--------+
2597 /// ```
2598 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2599 where
2600 F: FnOnce(&Series) -> PolarsResult<C>,
2601 C: IntoColumn,
2602 {
2603 let idx = self.try_get_column_index(column)?;
2604 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2605 }
2606
2607 /// Slice the [`DataFrame`] along the rows.
2608 ///
2609 /// # Example
2610 ///
2611 /// ```rust
2612 /// # use polars_core::prelude::*;
2613 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2614 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2615 /// let sl: DataFrame = df.slice(2, 3);
2616 ///
2617 /// assert_eq!(sl.shape(), (3, 2));
2618 /// println!("{}", sl);
2619 /// # Ok::<(), PolarsError>(())
2620 /// ```
2621 /// Output:
2622 /// ```text
2623 /// shape: (3, 2)
2624 /// +-------+-------+
2625 /// | Fruit | Color |
2626 /// | --- | --- |
2627 /// | str | str |
2628 /// +=======+=======+
2629 /// | Grape | White |
2630 /// +-------+-------+
2631 /// | Fig | White |
2632 /// +-------+-------+
2633 /// | Fig | Red |
2634 /// +-------+-------+
2635 /// ```
2636 #[must_use]
2637 pub fn slice(&self, offset: i64, length: usize) -> Self {
2638 if offset == 0 && length == self.height() {
2639 return self.clone();
2640 }
2641 if length == 0 {
2642 return self.clear();
2643 }
2644 let col = self
2645 .columns
2646 .iter()
2647 .map(|s| s.slice(offset, length))
2648 .collect::<Vec<_>>();
2649
2650 let height = if let Some(fst) = col.first() {
2651 fst.len()
2652 } else {
2653 let (_, length) = slice_offsets(offset, length, self.height());
2654 length
2655 };
2656
2657 unsafe { DataFrame::new_no_checks(height, col) }
2658 }
2659
2660 /// Split [`DataFrame`] at the given `offset`.
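    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
    /// let (left, right) = df.split_at(1);
    /// assert_eq!(left.height(), 1);
    /// assert_eq!(right.height(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```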
2661 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2662 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2663
2664 let (idx, _) = slice_offsets(offset, 0, self.height());
2665
2666 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2667 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2668 (a, b)
2669 }
2670
2671 pub fn clear(&self) -> Self {
2672 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2673 unsafe { DataFrame::new_no_checks(0, col) }
2674 }
2675
2676 #[must_use]
2677 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2678 if offset == 0 && length == self.height() {
2679 return self.clone();
2680 }
2681 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2682 unsafe { DataFrame::new_no_checks(length, columns) }
2683 }
2684
2685 #[must_use]
2686 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2687 if offset == 0 && length == self.height() {
2688 return self.clone();
2689 }
2690 // @scalar-opt
2691 let columns = self._apply_columns(&|s| {
2692 let mut out = s.slice(offset, length);
2693 out.shrink_to_fit();
2694 out
2695 });
2696 unsafe { DataFrame::new_no_checks(length, columns) }
2697 }
2698
2699 /// Get the head of the [`DataFrame`].
2700 ///
2701 /// # Example
2702 ///
2703 /// ```rust
2704 /// # use polars_core::prelude::*;
2705 /// let countries: DataFrame =
2706 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2707 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2708 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2709 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2710 /// assert_eq!(countries.shape(), (5, 4));
2711 ///
2712 /// println!("{}", countries.head(Some(3)));
2713 /// # Ok::<(), PolarsError>(())
2714 /// ```
2715 ///
2716 /// Output:
2717 ///
2718 /// ```text
2719 /// shape: (3, 4)
2720 /// +--------------------+---------------+---------------+------------+
2721 /// | Rank by GDP (2021) | Continent | Country | Capital |
2722 /// | --- | --- | --- | --- |
2723 /// | i32 | str | str | str |
2724 /// +====================+===============+===============+============+
2725 /// | 1 | North America | United States | Washington |
2726 /// +--------------------+---------------+---------------+------------+
2727 /// | 2 | Asia | China | Beijing |
2728 /// +--------------------+---------------+---------------+------------+
2729 /// | 3 | Asia | Japan | Tokyo |
2730 /// +--------------------+---------------+---------------+------------+
2731 /// ```
2732 #[must_use]
2733 pub fn head(&self, length: Option<usize>) -> Self {
2734 let col = self
2735 .columns
2736 .iter()
2737 .map(|c| c.head(length))
2738 .collect::<Vec<_>>();
2739
2740 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2741 let height = usize::min(height, self.height());
2742 unsafe { DataFrame::new_no_checks(height, col) }
2743 }
2744
2745 /// Get the tail of the [`DataFrame`].
2746 ///
2747 /// # Example
2748 ///
2749 /// ```rust
2750 /// # use polars_core::prelude::*;
2751 /// let countries: DataFrame =
2752 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2753     ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2754 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2755 /// assert_eq!(countries.shape(), (5, 3));
2756 ///
2757 /// println!("{}", countries.tail(Some(2)));
2758 /// # Ok::<(), PolarsError>(())
2759 /// ```
2760 ///
2761 /// Output:
2762 ///
2763 /// ```text
2764 /// shape: (2, 3)
2765 /// +-------------+--------------------+---------+
2766     /// | Rank (2021) | Apple Price (€/kg) | Country |
2767 /// | --- | --- | --- |
2768 /// | i32 | f64 | str |
2769 /// +=============+====================+=========+
2770     /// | 108         | 0.65               | Syria   |
2771 /// +-------------+--------------------+---------+
2772     /// | 109         | 0.52               | Turkey  |
2773 /// +-------------+--------------------+---------+
2774 /// ```
2775 #[must_use]
2776 pub fn tail(&self, length: Option<usize>) -> Self {
2777 let col = self
2778 .columns
2779 .iter()
2780 .map(|c| c.tail(length))
2781 .collect::<Vec<_>>();
2782
2783 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2784 let height = usize::min(height, self.height());
2785 unsafe { DataFrame::new_no_checks(height, col) }
2786 }
2787
2788 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2789 ///
2790 /// # Panics
2791 ///
2792 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2793 ///
2794 /// This responsibility is left to the caller as we don't want to take mutable references here,
2795 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2796 /// as well.
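    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data; a freshly created [`DataFrame`] has a single
    /// chunk per column, so the iterator yields one batch here:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3])?;
    /// let total_rows: usize = df
    ///     .iter_chunks(CompatLevel::newest(), false)
    ///     .map(|batch| batch.len())
    ///     .sum();
    /// assert_eq!(total_rows, df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```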
2797 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2798 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2799         // We only allow parallelism if any of the columns is a binview and we must convert
2800         // to the lowest `compat_level`, as that requires allocating new arrow strings/binaries.
2801 let must_convert = compat_level.0 == 0;
2802 let parallel = parallel
2803 && must_convert
2804 && self.columns.len() > 1
2805 && self
2806 .columns
2807 .iter()
2808 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2809
2810 RecordBatchIter {
2811 columns: &self.columns,
2812 schema: Arc::new(
2813 self.columns
2814 .iter()
2815 .map(|c| c.field().to_arrow(compat_level))
2816 .collect(),
2817 ),
2818 idx: 0,
2819 n_chunks: self.first_col_n_chunks(),
2820 compat_level,
2821 parallel,
2822 }
2823 }
2824
2825 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2826 ///
2827 /// # Panics
2828 ///
2829 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2830 ///
2831 /// This responsibility is left to the caller as we don't want to take mutable references here,
2832 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2833 /// as well.
2834 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2835 PhysRecordBatchIter {
2836 schema: Arc::new(
2837 self.get_columns()
2838 .iter()
2839 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2840 .collect(),
2841 ),
2842 arr_iters: self
2843 .materialized_column_iter()
2844 .map(|s| s.chunks().iter())
2845 .collect(),
2846 }
2847 }
2848
2849     /// Get a [`DataFrame`] with the rows in reversed order.
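    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3])?;
    /// let expected = df!("x" => [3, 2, 1])?;
    /// assert!(df.reverse().equals(&expected));
    /// # Ok::<(), PolarsError>(())
    /// ```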
2850 #[must_use]
2851 pub fn reverse(&self) -> Self {
2852 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2853 unsafe { DataFrame::new_no_checks(self.height(), col) }
2854 }
2855
2856 /// Shift the values by a given period and fill the parts that will be empty due to this operation
2857     /// with nulls (`None`).
2858 ///
2859 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
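    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3])?;
    /// let shifted = df.shift(1);
    /// // The vacated first slot is filled with a null.
    /// assert_eq!(shifted.column("x")?.null_count(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```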
2860 #[must_use]
2861 pub fn shift(&self, periods: i64) -> Self {
2862 let col = self._apply_columns_par(&|s| s.shift(periods));
2863 unsafe { DataFrame::new_no_checks(self.height(), col) }
2864 }
2865
2866 /// Replace None values with one of the following strategies:
2867 /// * Forward fill (replace None with the previous value)
2868 /// * Backward fill (replace None with the next value)
2869 /// * Mean fill (replace None with the mean of the whole array)
2870 /// * Min fill (replace None with the minimum of the whole array)
2871 /// * Max fill (replace None with the maximum of the whole array)
2872 ///
2873 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
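    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data, using the mean strategy:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [Some(1.0), None, Some(3.0)])?;
    /// let filled = df.fill_null(FillNullStrategy::Mean)?;
    /// assert_eq!(filled.column("x")?.null_count(), 0);
    /// # Ok::<(), PolarsError>(())
    /// ```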
2874 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2875 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2876
2877 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2878 }
2879
2880     /// Pipe different functions/closures that work on a DataFrame together.
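    ///
    /// # Example
    ///
    /// A compile-only sketch; `reverse_rows` is just a stand-in for any fallible
    /// transformation:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn reverse_rows(df: DataFrame) -> PolarsResult<DataFrame> {
    ///     Ok(df.reverse())
    /// }
    ///
    /// fn example(df: DataFrame) -> PolarsResult<DataFrame> {
    ///     df.pipe(reverse_rows)
    /// }
    /// ```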
2881 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2882 where
2883 F: Fn(DataFrame) -> PolarsResult<B>,
2884 {
2885 f(self)
2886 }
2887
2888     /// Pipe different functions/closures that work on a DataFrame together.
2889 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2890 where
2891 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2892 {
2893 f(self)
2894 }
2895
2896     /// Pipe different functions/closures that work on a DataFrame together.
2897 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2898 where
2899 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2900 {
2901 f(self, args)
2902 }
2903
2904 /// Drop duplicate rows from a [`DataFrame`].
2905     /// *This fails when there is a column of type List in the DataFrame.*
2906 ///
2907 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2908 ///
2909 /// # Example
2910 ///
2911 /// ```no_run
2912 /// # use polars_core::prelude::*;
2913 /// let df = df! {
2914 /// "flt" => [1., 1., 2., 2., 3., 3.],
2915 /// "int" => [1, 1, 2, 2, 3, 3, ],
2916 /// "str" => ["a", "a", "b", "b", "c", "c"]
2917 /// }?;
2918 ///
2919 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2920 /// # Ok::<(), PolarsError>(())
2921 /// ```
2922 /// Returns
2923 ///
2924 /// ```text
2925 /// +-----+-----+-----+
2926 /// | flt | int | str |
2927 /// | --- | --- | --- |
2928 /// | f64 | i32 | str |
2929 /// +=====+=====+=====+
2930 /// | 1 | 1 | "a" |
2931 /// +-----+-----+-----+
2932 /// | 2 | 2 | "b" |
2933 /// +-----+-----+-----+
2934 /// | 3 | 3 | "c" |
2935 /// +-----+-----+-----+
2936 /// ```
2937 #[cfg(feature = "algorithm_group_by")]
2938 pub fn unique_stable(
2939 &self,
2940 subset: Option<&[String]>,
2941 keep: UniqueKeepStrategy,
2942 slice: Option<(i64, usize)>,
2943 ) -> PolarsResult<DataFrame> {
2944 self.unique_impl(
2945 true,
2946 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2947 keep,
2948 slice,
2949 )
2950 }
2951
2952 /// Unstable distinct. See [`DataFrame::unique_stable`].
2953 #[cfg(feature = "algorithm_group_by")]
2954 pub fn unique<I, S>(
2955 &self,
2956 subset: Option<&[String]>,
2957 keep: UniqueKeepStrategy,
2958 slice: Option<(i64, usize)>,
2959 ) -> PolarsResult<DataFrame> {
2960 self.unique_impl(
2961 false,
2962 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2963 keep,
2964 slice,
2965 )
2966 }
2967
2968 #[cfg(feature = "algorithm_group_by")]
2969 pub fn unique_impl(
2970 &self,
2971 maintain_order: bool,
2972 subset: Option<Vec<PlSmallStr>>,
2973 keep: UniqueKeepStrategy,
2974 slice: Option<(i64, usize)>,
2975 ) -> PolarsResult<Self> {
2976 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
2977 let mut df = self.clone();
2978 // take on multiple chunks is terrible
2979 df.as_single_chunk_par();
2980
2981 let columns = match (keep, maintain_order) {
2982 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
2983 let gb = df.group_by_stable(names)?;
2984 let groups = gb.get_groups();
2985 let (offset, len) = slice.unwrap_or((0, groups.len()));
2986 let groups = groups.slice(offset, len);
2987 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
2988 },
2989 (UniqueKeepStrategy::Last, true) => {
2990 // maintain order by last values, so the sorted groups are not correct as they
2991 // are sorted by the first value
2992 let gb = df.group_by(names)?;
2993 let groups = gb.get_groups();
2994
2995 let func = |g: GroupsIndicator| match g {
2996 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
2997 GroupsIndicator::Slice([first, len]) => first + len - 1,
2998 };
2999
3000 let last_idx: NoNull<IdxCa> = match slice {
3001 None => groups.iter().map(func).collect(),
3002 Some((offset, len)) => {
3003 let (offset, len) = slice_offsets(offset, len, groups.len());
3004 groups.iter().skip(offset).take(len).map(func).collect()
3005 },
3006 };
3007
3008 let last_idx = last_idx.sort(false);
3009 return Ok(unsafe { df.take_unchecked(&last_idx) });
3010 },
3011 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3012 let gb = df.group_by(names)?;
3013 let groups = gb.get_groups();
3014 let (offset, len) = slice.unwrap_or((0, groups.len()));
3015 let groups = groups.slice(offset, len);
3016 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3017 },
3018 (UniqueKeepStrategy::Last, false) => {
3019 let gb = df.group_by(names)?;
3020 let groups = gb.get_groups();
3021 let (offset, len) = slice.unwrap_or((0, groups.len()));
3022 let groups = groups.slice(offset, len);
3023 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3024 },
3025 (UniqueKeepStrategy::None, _) => {
3026 let df_part = df.select(names)?;
3027 let mask = df_part.is_unique()?;
3028 let mask = match slice {
3029 None => mask,
3030 Some((offset, len)) => mask.slice(offset, len),
3031 };
3032 return df.filter(&mask);
3033 },
3034 };
3035
3036 let height = Self::infer_height(&columns);
3037 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3038 }
3039
3040 /// Get a mask of all the unique rows in the [`DataFrame`].
3041 ///
3042 /// # Example
3043 ///
3044 /// ```no_run
3045 /// # use polars_core::prelude::*;
3046 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3047 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3048 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3049 ///
3050 /// assert!(ca.all());
3051 /// # Ok::<(), PolarsError>(())
3052 /// ```
3053 #[cfg(feature = "algorithm_group_by")]
3054 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3055 let gb = self.group_by(self.get_column_names_owned())?;
3056 let groups = gb.get_groups();
3057 Ok(is_unique_helper(
3058 groups,
3059 self.height() as IdxSize,
3060 true,
3061 false,
3062 ))
3063 }
3064
3065 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3066 ///
3067 /// # Example
3068 ///
3069 /// ```no_run
3070 /// # use polars_core::prelude::*;
3071 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3072 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3073 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3074 ///
3075 /// assert!(!ca.all());
3076 /// # Ok::<(), PolarsError>(())
3077 /// ```
3078 #[cfg(feature = "algorithm_group_by")]
3079 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3080 let gb = self.group_by(self.get_column_names_owned())?;
3081 let groups = gb.get_groups();
3082 Ok(is_unique_helper(
3083 groups,
3084 self.height() as IdxSize,
3085 false,
3086 true,
3087 ))
3088 }
3089
3090 /// Create a new [`DataFrame`] that shows the null counts per column.
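    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [Some(1), None, Some(3)])?;
    /// let counts = df.null_count();
    /// // One row, with one count column per input column.
    /// assert_eq!(counts.shape(), (1, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```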
3091 #[must_use]
3092 pub fn null_count(&self) -> Self {
3093 let cols = self
3094 .columns
3095 .iter()
3096 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3097 .collect();
3098 unsafe { Self::new_no_checks(1, cols) }
3099 }
3100
3101 /// Hash and combine the row values
3102 #[cfg(feature = "row_hash")]
3103 pub fn hash_rows(
3104 &mut self,
3105 hasher_builder: Option<PlSeedableRandomStateQuality>,
3106 ) -> PolarsResult<UInt64Chunked> {
3107 let dfs = split_df(self, POOL.current_num_threads(), false);
3108 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3109
3110 let mut iter = cas.into_iter();
3111 let mut acc_ca = iter.next().unwrap();
3112 for ca in iter {
3113 acc_ca.append(&ca)?;
3114 }
3115 Ok(acc_ca.rechunk().into_owned())
3116 }
3117
3118 /// Get the supertype of the columns in this DataFrame
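    ///
    /// # Example
    ///
    /// A minimal sketch with made-up data:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("ints" => [1, 2, 3], "floats" => [1.0, 2.0, 3.0])?;
    /// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```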
3119 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3120 self.columns
3121 .iter()
3122 .map(|s| Ok(s.dtype().clone()))
3123 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3124 }
3125
3126 /// Take by index values given by the slice `idx`.
3127 /// # Warning
3128     /// Be careful with allowing threads when calling this in a large hot loop:
3129     /// every thread split may end up on the rayon stack and lead to a stack overflow.
3130 #[doc(hidden)]
3131 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3132 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3133 }
3134
3135 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3136     /// if the index values in `idx` are sorted. This will maintain sorted flags.
3137 ///
3138 /// # Warning
3139     /// Be careful with allowing threads when calling this in a large hot loop:
3140     /// every thread split may end up on the rayon stack and lead to a stack overflow.
3141 #[doc(hidden)]
3142 pub unsafe fn _take_unchecked_slice_sorted(
3143 &self,
3144 idx: &[IdxSize],
3145 allow_threads: bool,
3146 sorted: IsSorted,
3147 ) -> Self {
3148 #[cfg(debug_assertions)]
3149 {
3150 if idx.len() > 2 {
3151 match sorted {
3152 IsSorted::Ascending => {
3153 assert!(idx[0] <= idx[idx.len() - 1]);
3154 },
3155 IsSorted::Descending => {
3156 assert!(idx[0] >= idx[idx.len() - 1]);
3157 },
3158 _ => {},
3159 }
3160 }
3161 }
3162 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3163 ca.set_sorted_flag(sorted);
3164 self.take_unchecked_impl(&ca, allow_threads)
3165 }
3166
3167 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3168 #[doc(hidden)]
3169 pub fn _partition_by_impl(
3170 &self,
3171 cols: &[PlSmallStr],
3172 stable: bool,
3173 include_key: bool,
3174 parallel: bool,
3175 ) -> PolarsResult<Vec<DataFrame>> {
3176 let selected_keys = self.select_columns(cols.iter().cloned())?;
3177 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3178 let groups = groups.take_groups();
3179
3180 // drop key columns prior to calculation if requested
3181 let df = if include_key {
3182 self.clone()
3183 } else {
3184 self.drop_many(cols.iter().cloned())
3185 };
3186
3187 if parallel {
3188             // Don't let the inner takes spawn threads (allow_threads = false);
3189             // there is already a lot of parallelization in take and nesting it may easily cause a stack overflow.
3190 POOL.install(|| {
3191 match groups.as_ref() {
3192 GroupsType::Idx(idx) => {
3193 // Rechunk as the gather may rechunk for every group #17562.
3194 let mut df = df.clone();
3195 df.as_single_chunk_par();
3196 Ok(idx
3197 .into_par_iter()
3198 .map(|(_, group)| {
3199 // groups are in bounds
3200 unsafe {
3201 df._take_unchecked_slice_sorted(
3202 group,
3203 false,
3204 IsSorted::Ascending,
3205 )
3206 }
3207 })
3208 .collect())
3209 },
3210 GroupsType::Slice { groups, .. } => Ok(groups
3211 .into_par_iter()
3212 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3213 .collect()),
3214 }
3215 })
3216 } else {
3217 match groups.as_ref() {
3218 GroupsType::Idx(idx) => {
3219 // Rechunk as the gather may rechunk for every group #17562.
3220 let mut df = df.clone();
3221 df.as_single_chunk();
3222 Ok(idx
3223 .into_iter()
3224 .map(|(_, group)| {
3225 // groups are in bounds
3226 unsafe {
3227 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3228 }
3229 })
3230 .collect())
3231 },
3232 GroupsType::Slice { groups, .. } => Ok(groups
3233 .iter()
3234 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3235 .collect()),
3236 }
3237 }
3238 }
3239
3240     /// Split into multiple DataFrames partitioned by groups.
3241 #[cfg(feature = "partition_by")]
3242 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3243 where
3244 I: IntoIterator<Item = S>,
3245 S: Into<PlSmallStr>,
3246 {
3247 let cols = cols
3248 .into_iter()
3249 .map(Into::into)
3250 .collect::<Vec<PlSmallStr>>();
3251 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3252 }
3253
3254     /// Split into multiple DataFrames partitioned by groups.
3255     /// The order of the groups is maintained.
3256 #[cfg(feature = "partition_by")]
3257 pub fn partition_by_stable<I, S>(
3258 &self,
3259 cols: I,
3260 include_key: bool,
3261 ) -> PolarsResult<Vec<DataFrame>>
3262 where
3263 I: IntoIterator<Item = S>,
3264 S: Into<PlSmallStr>,
3265 {
3266 let cols = cols
3267 .into_iter()
3268 .map(Into::into)
3269 .collect::<Vec<PlSmallStr>>();
3270 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3271 }
3272
3273 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3274 /// inserted as columns.
3275 #[cfg(feature = "dtype-struct")]
3276 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3277 let cols = cols.into_vec();
3278 self.unnest_impl(cols.into_iter().collect())
3279 }
3280
3281 #[cfg(feature = "dtype-struct")]
3282 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3283 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3284 let mut count = 0;
3285 for s in &self.columns {
3286 if cols.contains(s.name()) {
3287 let ca = s.struct_()?.clone();
3288 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3289 count += 1;
3290 } else {
3291 new_cols.push(s.clone())
3292 }
3293 }
3294 if count != cols.len() {
3295 // one or more columns not found
3296 // the code below will return an error with the missing name
3297 let schema = self.schema();
3298 for col in cols {
3299 let _ = schema
3300 .get(col.as_str())
3301 .ok_or_else(|| polars_err!(col_not_found = col))?;
3302 }
3303 }
3304 DataFrame::new(new_cols)
3305 }
3306
3307 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3308 cols.first().map_or(0, Column::len)
3309 }
3310
3311 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3312         // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3313         // `append_chunk` or something like it. It is just quite difficult to make that safe.
3314 let df = DataFrame::from(rb);
3315 polars_ensure!(
3316 self.schema() == df.schema(),
3317 SchemaMismatch: "cannot append record batch with different schema",
3318 );
3319 self.vstack_mut_owned_unchecked(df);
3320 Ok(())
3321 }
3322}
3323
3324pub struct RecordBatchIter<'a> {
3325 columns: &'a Vec<Column>,
3326 schema: ArrowSchemaRef,
3327 idx: usize,
3328 n_chunks: usize,
3329 compat_level: CompatLevel,
3330 parallel: bool,
3331}
3332
3333impl Iterator for RecordBatchIter<'_> {
3334 type Item = RecordBatch;
3335
3336 fn next(&mut self) -> Option<Self::Item> {
3337 if self.idx >= self.n_chunks {
3338 return None;
3339 }
3340
3341 // Create a batch of the columns with the same chunk no.
3342 let batch_cols: Vec<ArrayRef> = if self.parallel {
3343 let iter = self
3344 .columns
3345 .par_iter()
3346 .map(Column::as_materialized_series)
3347 .map(|s| s.to_arrow(self.idx, self.compat_level));
3348 POOL.install(|| iter.collect())
3349 } else {
3350 self.columns
3351 .iter()
3352 .map(Column::as_materialized_series)
3353 .map(|s| s.to_arrow(self.idx, self.compat_level))
3354 .collect()
3355 };
3356 self.idx += 1;
3357
3358 let length = batch_cols.first().map_or(0, |arr| arr.len());
3359 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3360 }
3361
3362 fn size_hint(&self) -> (usize, Option<usize>) {
3363 let n = self.n_chunks - self.idx;
3364 (n, Some(n))
3365 }
3366}
3367
3368pub struct PhysRecordBatchIter<'a> {
3369 schema: ArrowSchemaRef,
3370 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3371}
3372
3373impl Iterator for PhysRecordBatchIter<'_> {
3374 type Item = RecordBatch;
3375
3376 fn next(&mut self) -> Option<Self::Item> {
3377 let arrs = self
3378 .arr_iters
3379 .iter_mut()
3380 .map(|phys_iter| phys_iter.next().cloned())
3381 .collect::<Option<Vec<_>>>()?;
3382
3383 let length = arrs.first().map_or(0, |arr| arr.len());
3384 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3385 }
3386
3387 fn size_hint(&self) -> (usize, Option<usize>) {
3388 if let Some(iter) = self.arr_iters.first() {
3389 iter.size_hint()
3390 } else {
3391 (0, None)
3392 }
3393 }
3394}
3395
3396impl Default for DataFrame {
3397 fn default() -> Self {
3398 DataFrame::empty()
3399 }
3400}
3401
3402impl From<DataFrame> for Vec<Column> {
3403 fn from(df: DataFrame) -> Self {
3404 df.columns
3405 }
3406}
3407
3408// utility to test if we can vstack/extend the columns
3409fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3410 polars_ensure!(
3411 left.name() == right.name(),
3412 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3413 left.name(), right.name(),
3414 );
3415 Ok(())
3416}
3417
3418#[cfg(test)]
3419mod test {
3420 use super::*;
3421
3422 fn create_frame() -> DataFrame {
3423 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3424 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3425 DataFrame::new(vec![s0, s1]).unwrap()
3426 }
3427
3428 #[test]
3429 #[cfg_attr(miri, ignore)]
3430 fn test_recordbatch_iterator() {
3431 let df = df!(
3432 "foo" => [1, 2, 3, 4, 5]
3433 )
3434 .unwrap();
3435 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3436 assert_eq!(5, iter.next().unwrap().len());
3437 assert!(iter.next().is_none());
3438 }
3439
3440 #[test]
3441 #[cfg_attr(miri, ignore)]
3442 fn test_select() {
3443 let df = create_frame();
3444 assert_eq!(
3445 df.column("days")
3446 .unwrap()
3447 .as_series()
3448 .unwrap()
3449 .equal(1)
3450 .unwrap()
3451 .sum(),
3452 Some(1)
3453 );
3454 }
3455
3456 #[test]
3457 #[cfg_attr(miri, ignore)]
3458 fn test_filter_broadcast_on_string_col() {
3459 let col_name = "some_col";
3460 let v = vec!["test".to_string()];
3461 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3462 let mut df = DataFrame::new(vec![s0]).unwrap();
3463
3464 df = df
3465 .filter(
3466 &df.column(col_name)
3467 .unwrap()
3468 .as_materialized_series()
3469 .equal("")
3470 .unwrap(),
3471 )
3472 .unwrap();
3473 assert_eq!(
3474 df.column(col_name)
3475 .unwrap()
3476 .as_materialized_series()
3477 .n_chunks(),
3478 1
3479 );
3480 }
3481
3482 #[test]
3483 #[cfg_attr(miri, ignore)]
3484 fn test_filter_broadcast_on_list_col() {
3485 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3486 let ll: ListChunked = [&s1].iter().copied().collect();
3487
3488 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3489 let new = ll.filter(&mask).unwrap();
3490
3491 assert_eq!(new.chunks.len(), 1);
3492 assert_eq!(new.len(), 0);
3493 }
3494
3495 #[test]
3496 fn slice() {
3497 let df = create_frame();
3498 let sliced_df = df.slice(0, 2);
3499 assert_eq!(sliced_df.shape(), (2, 2));
3500 }
3501
3502 #[test]
3503 fn rechunk_false() {
3504 let df = create_frame();
3505 assert!(!df.should_rechunk())
3506 }
3507
3508 #[test]
3509 fn rechunk_true() -> PolarsResult<()> {
3510 let mut base = df!(
3511 "a" => [1, 2, 3],
3512 "b" => [1, 2, 3]
3513 )?;
3514
3515 // Create a series with multiple chunks
3516 let mut s = Series::new("foo".into(), 0..2);
3517 let s2 = Series::new("bar".into(), 0..1);
3518 s.append(&s2)?;
3519
3520 // Append series to frame
3521 let out = base.with_column(s)?;
3522
3523 // Now we should rechunk
3524 assert!(out.should_rechunk());
3525 Ok(())
3526 }
3527
3528 #[test]
3529 fn test_duplicate_column() {
3530 let mut df = df! {
3531 "foo" => [1, 2, 3]
3532 }
3533 .unwrap();
3534 // check if column is replaced
3535 assert!(
3536 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3537 .is_ok()
3538 );
3539 assert!(
3540 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3541 .is_ok()
3542 );
3543 assert!(df.column("bar").is_ok())
3544 }
3545
3546 #[test]
3547 #[cfg_attr(miri, ignore)]
3548 fn distinct() {
3549 let df = df! {
3550 "flt" => [1., 1., 2., 2., 3., 3.],
3551 "int" => [1, 1, 2, 2, 3, 3, ],
3552 "str" => ["a", "a", "b", "b", "c", "c"]
3553 }
3554 .unwrap();
3555 let df = df
3556 .unique_stable(None, UniqueKeepStrategy::First, None)
3557 .unwrap()
3558 .sort(["flt"], SortMultipleOptions::default())
3559 .unwrap();
3560 let valid = df! {
3561 "flt" => [1., 2., 3.],
3562 "int" => [1, 2, 3],
3563 "str" => ["a", "b", "c"]
3564 }
3565 .unwrap();
3566 assert!(df.equals(&valid));
3567 }
3568
3569 #[test]
3570 fn test_vstack() {
3571 // check that it does not accidentally rechunks
3572 let mut df = df! {
3573 "flt" => [1., 1., 2., 2., 3., 3.],
3574 "int" => [1, 1, 2, 2, 3, 3, ],
3575 "str" => ["a", "a", "b", "b", "c", "c"]
3576 }
3577 .unwrap();
3578
3579 df.vstack_mut(&df.slice(0, 3)).unwrap();
3580 assert_eq!(df.first_col_n_chunks(), 2)
3581 }
3582
3583 #[test]
3584 fn test_vstack_on_empty_dataframe() {
3585 let mut df = DataFrame::empty();
3586
3587 let df_data = df! {
3588 "flt" => [1., 1., 2., 2., 3., 3.],
3589 "int" => [1, 1, 2, 2, 3, 3, ],
3590 "str" => ["a", "a", "b", "b", "c", "c"]
3591 }
3592 .unwrap();
3593
3594 df.vstack_mut(&df_data).unwrap();
3595 assert_eq!(df.height, 6)
3596 }
3597
3598 #[test]
3599 fn test_replace_or_add() -> PolarsResult<()> {
3600 let mut df = df!(
3601 "a" => [1, 2, 3],
3602 "b" => [1, 2, 3]
3603 )?;
3604
3605 // check that the new column is "c" and not "bar".
3606 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3607
3608 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3609 Ok(())
3610 }
3611}