polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
/// Strategy that determines which row of a set of duplicates is kept
/// during a `unique` operation.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[strum(serialize_all = "snake_case")]
pub enum UniqueKeepStrategy {
    /// Keep the first unique row.
    First,
    /// Keep the last unique row.
    Last,
    /// Keep none of the duplicated rows.
    None,
    /// Keep any one of the unique rows.
    /// This allows more optimizations.
    #[default]
    Any,
}
65
66fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
67where
68 F: for<'a> FnMut(&'a T) -> &'a str,
69{
70 // Always unique.
71 if items.len() <= 1 {
72 return Ok(());
73 }
74
75 if items.len() <= 4 {
76 // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
77 for i in 0..items.len() - 1 {
78 let name = get_name(&items[i]);
79 for other in items.iter().skip(i + 1) {
80 if name == get_name(other) {
81 polars_bail!(duplicate = name);
82 }
83 }
84 }
85 } else {
86 let mut names = PlHashSet::with_capacity(items.len());
87 for item in items {
88 let name = get_name(item);
89 if !names.insert(name) {
90 polars_bail!(duplicate = name);
91 }
92 }
93 }
94 Ok(())
95}
96
/// A contiguous growable collection of `Series` that have the same length.
///
/// ## Use declarations
///
/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
///
/// ```rust
/// use polars_core::prelude::*; // if the crate polars-core is used directly
/// // use polars::prelude::*;      if the crate polars is used
/// ```
///
/// # Initialization
/// ## Default
///
/// A `DataFrame` can be initialized empty:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = DataFrame::default();
/// assert!(df.is_empty());
/// ```
///
/// ## Wrapping a `Vec<Series>`
///
/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
///
/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
/// ```
///
/// ## Using a macro
///
/// The [`df!`] macro is a convenient method:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
///                                       "Color" => ["Red", "Yellow", "Green"]);
/// ```
///
/// ## Using a CSV file
///
/// See the `polars_io::csv::CsvReader`.
///
/// # Indexing
/// ## By a number
///
/// The `Index<usize>` is implemented for the `DataFrame`.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
///
/// ## By a `Series` name
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
#[derive(Clone)]
pub struct DataFrame {
    // The number of rows; kept separately so a DataFrame with zero columns
    // can still have a height.
    height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
    pub(crate) columns: Vec<Column>,

    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between schema and reading.
    cached_schema: OnceLock<SchemaRef>,
}
181
182impl DataFrame {
    /// Invalidate the cached schema; it is lazily recomputed on the next call
    /// to [`DataFrame::schema`].
    pub fn clear_schema(&mut self) {
        self.cached_schema = OnceLock::new();
    }
186
    /// Iterator over the columns as [`Column`]s.
    #[inline]
    pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
        self.columns.iter()
    }
191
    /// Iterator over the columns as materialized [`Series`].
    #[inline]
    pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
        self.columns.iter().map(Column::as_materialized_series)
    }
196
    /// Parallel (rayon) iterator over the columns as materialized [`Series`].
    #[inline]
    pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
        self.columns.par_iter().map(Column::as_materialized_series)
    }
201
    /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
    ///
    /// # Implementation
    /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
    /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
    ///
    /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
    /// However, this function will yield a smaller number. This is because this function returns
    /// the visible size of the buffer, not its total capacity.
    ///
    /// FFI buffers are included in this estimation.
    pub fn estimated_size(&self) -> usize {
        self.columns.iter().map(Column::estimated_size).sum()
    }
217
    // Reduce monomorphization.
    /// Apply a fallible function to every column sequentially, collecting the
    /// results or short-circuiting on the first error.
    fn try_apply_columns(
        &self,
        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
    ) -> PolarsResult<Vec<Column>> {
        self.columns.iter().map(func).collect()
    }
    // Reduce monomorphization.
    /// Apply an infallible function to every column sequentially.
    pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
        self.columns.iter().map(func).collect()
    }
    // Reduce monomorphization.
    /// Apply a fallible function to every column in parallel on the global
    /// rayon pool, short-circuiting on error.
    fn try_apply_columns_par(
        &self,
        func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
    ) -> PolarsResult<Vec<Column>> {
        POOL.install(|| self.columns.par_iter().map(func).collect())
    }
    // Reduce monomorphization.
    /// Apply an infallible function to every column in parallel on the global
    /// rayon pool.
    pub fn _apply_columns_par(
        &self,
        func: &(dyn Fn(&Column) -> Column + Send + Sync),
    ) -> Vec<Column> {
        POOL.install(|| self.columns.par_iter().map(func).collect())
    }
243
    /// Get the index of the column, erroring with `ColumnNotFound` if no
    /// column carries `name`.
    fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
        self.get_column_index(name)
            .ok_or_else(|| polars_err!(col_not_found = name))
    }
249
250 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
251 polars_ensure!(
252 self.columns.iter().all(|s| s.name().as_str() != name),
253 Duplicate: "column with name {:?} is already present in the DataFrame", name
254 );
255 Ok(())
256 }
257
    /// Reserve additional slots into the chunks of the series.
    ///
    /// Only applies to eager `Column::Series` columns; other column variants
    /// are left untouched.
    pub(crate) fn reserve_chunks(&mut self, additional: usize) {
        for s in &mut self.columns {
            if let Column::Series(s) = s {
                // SAFETY:
                // do not modify the data, simply resize.
                unsafe { s.chunks_mut().reserve(additional) }
            }
        }
    }
268
269 /// Create a DataFrame from a Vector of Series.
270 ///
271 /// Errors if a column names are not unique, or if heights are not all equal.
272 ///
273 /// # Example
274 ///
275 /// ```
276 /// # use polars_core::prelude::*;
277 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
278 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
279 ///
280 /// let df = DataFrame::new(vec![s0, s1])?;
281 /// # Ok::<(), PolarsError>(())
282 /// ```
283 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
284 DataFrame::validate_columns_slice(&columns)
285 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {}", e)))?;
286 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
287 }
288
289 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
290 for col in &columns {
291 polars_ensure!(
292 col.len() == height,
293 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
294 columns[0].name(), height, col.name(), col.len()
295 );
296 }
297
298 Ok(DataFrame {
299 height,
300 columns,
301 cached_schema: OnceLock::new(),
302 })
303 }
304
305 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
306 /// columns to match the other columns.
307 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
308 // The length of the longest non-unit length column determines the
309 // broadcast length. If all columns are unit-length the broadcast length
310 // is one.
311 let broadcast_len = columns
312 .iter()
313 .map(|s| s.len())
314 .filter(|l| *l != 1)
315 .max()
316 .unwrap_or(1);
317 Self::new_with_broadcast_len(columns, broadcast_len)
318 }
319
    /// Converts a sequence of columns into a DataFrame, broadcasting length-1
    /// columns to `broadcast_len`.
    ///
    /// Errors if column names are not unique or if a column with length other
    /// than 1 does not match `broadcast_len`.
    pub fn new_with_broadcast_len(
        columns: Vec<Column>,
        broadcast_len: usize,
    ) -> PolarsResult<Self> {
        ensure_names_unique(&columns, |s| s.name().as_str())?;
        // SAFETY: name uniqueness checked just above.
        unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
    }
329
330 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
331 /// columns to match the other columns.
332 ///
333 /// # Safety
334 /// Does not check that the column names are unique (which they must be).
335 pub unsafe fn new_with_broadcast_no_namecheck(
336 mut columns: Vec<Column>,
337 broadcast_len: usize,
338 ) -> PolarsResult<Self> {
339 for col in &mut columns {
340 // Length not equal to the broadcast len, needs broadcast or is an error.
341 let len = col.len();
342 if len != broadcast_len {
343 if len != 1 {
344 let name = col.name().to_owned();
345 let extra_info =
346 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
347 format!(" (matching column '{}')", c.name())
348 } else {
349 String::new()
350 };
351 polars_bail!(
352 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
353 );
354 }
355 *col = col.new_from_index(0, broadcast_len);
356 }
357 }
358
359 let length = if columns.is_empty() { 0 } else { broadcast_len };
360
361 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
362 }
363
    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
    ///
    /// # Example
    ///
    /// ```rust
    /// use polars_core::prelude::DataFrame;
    /// static EMPTY: DataFrame = DataFrame::empty();
    /// ```
    pub const fn empty() -> Self {
        Self::empty_with_height(0)
    }
375
    /// Creates an empty `DataFrame` (zero columns) with a specific `height`.
    pub const fn empty_with_height(height: usize) -> Self {
        DataFrame {
            height,
            columns: vec![],
            cached_schema: OnceLock::new(),
        }
    }
384
385 /// Create an empty `DataFrame` with empty columns as per the `schema`.
386 pub fn empty_with_schema(schema: &Schema) -> Self {
387 let cols = schema
388 .iter()
389 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
390 .collect();
391 unsafe { DataFrame::new_no_checks(0, cols) }
392 }
393
394 /// Create an empty `DataFrame` with empty columns as per the `schema`.
395 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
396 let cols = schema
397 .iter_values()
398 .map(|fld| {
399 Column::from(Series::new_empty(
400 fld.name.clone(),
401 &(DataType::from_arrow_field(fld)),
402 ))
403 })
404 .collect();
405 unsafe { DataFrame::new_no_checks(0, cols) }
406 }
407
408 /// Create a new `DataFrame` with the given schema, only containing nulls.
409 pub fn full_null(schema: &Schema, height: usize) -> Self {
410 let columns = schema
411 .iter_fields()
412 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
413 .collect();
414 unsafe { DataFrame::new_no_checks(height, columns) }
415 }
416
    /// Removes the last [`Column`] from the `DataFrame` and returns it, or [`None`] if it is empty.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
    /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
    /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
    ///
    /// assert_eq!(df.pop(), Some(s2));
    /// assert_eq!(df.pop(), Some(s1));
    /// assert_eq!(df.pop(), None);
    /// assert!(df.is_empty());
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn pop(&mut self) -> Option<Column> {
        // The set of columns changes, so the cached schema must be dropped.
        self.clear_schema();

        self.columns.pop()
    }
438
439 /// Add a new column at index 0 that counts the rows.
440 ///
441 /// # Example
442 ///
443 /// ```
444 /// # use polars_core::prelude::*;
445 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
446 /// assert_eq!(df1.shape(), (4, 1));
447 ///
448 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
449 /// assert_eq!(df2.shape(), (4, 2));
450 /// println!("{}", df2);
451 ///
452 /// # Ok::<(), PolarsError>(())
453 /// ```
454 ///
455 /// Output:
456 ///
457 /// ```text
458 /// shape: (4, 2)
459 /// +-----+----------+
460 /// | Id | Name |
461 /// | --- | --- |
462 /// | u32 | str |
463 /// +=====+==========+
464 /// | 0 | James |
465 /// +-----+----------+
466 /// | 1 | Mary |
467 /// +-----+----------+
468 /// | 2 | John |
469 /// +-----+----------+
470 /// | 3 | Patricia |
471 /// +-----+----------+
472 /// ```
473 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
474 let mut columns = Vec::with_capacity(self.columns.len() + 1);
475 let offset = offset.unwrap_or(0);
476
477 let mut ca = IdxCa::from_vec(
478 name,
479 (offset..(self.height() as IdxSize) + offset).collect(),
480 );
481 ca.set_sorted_flag(IsSorted::Ascending);
482 columns.push(ca.into_series().into());
483
484 columns.extend_from_slice(&self.columns);
485 DataFrame::new(columns)
486 }
487
488 /// Add a row index column in place.
489 pub fn with_row_index_mut(&mut self, name: PlSmallStr, offset: Option<IdxSize>) -> &mut Self {
490 let offset = offset.unwrap_or(0);
491 let mut ca = IdxCa::from_vec(
492 name,
493 (offset..(self.height() as IdxSize) + offset).collect(),
494 );
495 ca.set_sorted_flag(IsSorted::Ascending);
496
497 self.clear_schema();
498 self.columns.insert(0, ca.into_series().into());
499 self
500 }
501
    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
    /// `Series`.
    ///
    /// Calculates the height from the first column or `0` if no columns are given.
    ///
    /// # Safety
    ///
    /// It is the callers responsibility to uphold the contract of all `Series`
    /// having an equal length and a unique name, if not this may panic down the line.
    pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
        let height = columns.first().map_or(0, Column::len);
        unsafe { Self::new_no_checks(height, columns) }
    }
515
    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
    /// `Series`.
    ///
    /// It is advised to use [DataFrame::new] in favor of this method.
    ///
    /// # Safety
    ///
    /// It is the callers responsibility to uphold the contract of all `Series`
    /// having an equal length and a unique name, if not this may panic down the line.
    pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
        // In debug builds the invariants ARE verified, so misuse panics early.
        if cfg!(debug_assertions) {
            DataFrame::validate_columns_slice(&columns).unwrap();
        }

        unsafe { Self::_new_no_checks_impl(height, columns) }
    }
532
    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
    /// constructed with this method is generally highly unsafe and should not be long-lived.
    #[allow(clippy::missing_safety_doc)]
    pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
        DataFrame {
            height,
            columns,
            cached_schema: OnceLock::new(),
        }
    }
544
    /// Shrink the capacity of this DataFrame to fit its length.
    pub fn shrink_to_fit(&mut self) {
        // Don't parallelize this. Memory overhead
        for s in &mut self.columns {
            s.shrink_to_fit();
        }
    }
552
    /// Aggregate all the chunks in the DataFrame to a single chunk.
    ///
    /// Only `Column::Series` columns are rechunked; other variants have no
    /// chunked representation to merge.
    pub fn as_single_chunk(&mut self) -> &mut Self {
        // Don't parallelize this. Memory overhead
        for s in &mut self.columns {
            if let Column::Series(s) = s {
                *s = s.rechunk().into();
            }
        }
        self
    }
563
    /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
    /// This may lead to more peak memory consumption.
    pub fn as_single_chunk_par(&mut self) -> &mut Self {
        // Skip the parallel pass entirely when every column is already a
        // single chunk.
        if self.columns.iter().any(|c| c.n_chunks() > 1) {
            self.columns = self._apply_columns_par(&|s| s.rechunk());
        }
        self
    }
572
    /// Rechunks all columns to only have a single chunk.
    pub fn rechunk_mut(&mut self) {
        // SAFETY: We never adjust the length or names of the columns.
        let columns = unsafe { self.get_columns_mut() };

        for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
            *col = col.rechunk();
        }
    }
582
    /// Deshare the backing buffers of binary and string columns in place.
    ///
    /// Only `Column::Series` columns of binary/string type are affected; all
    /// other columns pass through untouched.
    pub fn _deshare_views_mut(&mut self) {
        // SAFETY: We never adjust the length or names of the columns.
        unsafe {
            let columns = self.get_columns_mut();
            for col in columns {
                let Column::Series(s) = col else { continue };

                // `deshare` is applied per chunk via the kernel, then the
                // column is rebuilt from the resulting chunked array.
                if let Ok(ca) = s.binary() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                } else if let Ok(ca) = s.str() {
                    let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
                    *col = Column::from(gc_ca.into_series());
                }
            }
        }
    }
600
    /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
    ///
    /// Consumes `self`; each column is materialized, rechunked to one chunk
    /// if needed, and converted to an Arrow array at the given `compat_level`.
    pub fn rechunk_to_record_batch(
        self,
        compat_level: CompatLevel,
    ) -> RecordBatchT<Box<dyn Array>> {
        let height = self.height();

        let (schema, arrays) = self
            .columns
            .into_iter()
            .map(|col| {
                let mut series = col.take_materialized_series();
                // Rechunk to one chunk if necessary
                if series.n_chunks() > 1 {
                    series = series.rechunk();
                }
                (
                    series.field().to_arrow(compat_level),
                    // chunk index 0 is the only chunk after rechunking
                    series.to_arrow(0, compat_level),
                )
            })
            .collect();

        RecordBatchT::new(height, Arc::new(schema), arrays)
    }
626
    /// Returns true if the chunks of the columns do not align and re-chunking should be done
    pub fn should_rechunk(&self) -> bool {
        // Fast check. It is also needed for correctness, as code below doesn't check if the number
        // of chunks is equal.
        if !self
            .get_columns()
            .iter()
            .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
            .all_equal()
        {
            return true;
        }

        // From here we check chunk lengths.
        let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
        match chunk_lengths.next() {
            // No columns at all: nothing to align.
            None => false,
            Some(first_column_chunk_lengths) => {
                // Fast Path for single Chunk Series
                if first_column_chunk_lengths.size_hint().0 == 1 {
                    return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
                }
                // Always rechunk if we have more chunks than rows.
                // except when we have an empty df containing a single chunk
                let height = self.height();
                let n_chunks = first_column_chunk_lengths.size_hint().0;
                if n_chunks > height && !(height == 0 && n_chunks == 1) {
                    return true;
                }
                // Slow Path for multi Chunk series: compare every column's
                // chunk lengths element-wise against the first column's.
                let v: Vec<_> = first_column_chunk_lengths.collect();
                for cl in chunk_lengths {
                    if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
                        return true;
                    }
                }
                false
            },
        }
    }
667
    /// Ensure all the chunks in the [`DataFrame`] are aligned, rechunking in
    /// parallel when needed.
    pub fn align_chunks_par(&mut self) -> &mut Self {
        if self.should_rechunk() {
            self.as_single_chunk_par()
        } else {
            self
        }
    }
676
    /// Ensure all the chunks in the [`DataFrame`] are aligned, rechunking
    /// sequentially when needed.
    pub fn align_chunks(&mut self) -> &mut Self {
        if self.should_rechunk() {
            self.as_single_chunk()
        } else {
            self
        }
    }
684
    /// Get the [`DataFrame`] schema.
    ///
    /// The schema is computed lazily on first access and cached; in-place
    /// modifications must call [`DataFrame::clear_schema`] to invalidate it.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
    ///                         "Diameter (m)" => [8.8e26, f64::INFINITY])?;
    ///
    /// let f1: Field = Field::new("Thing".into(), DataType::String);
    /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
    /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
    ///
    /// assert_eq!(&**df.schema(), &sc);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn schema(&self) -> &SchemaRef {
        let out = self.cached_schema.get_or_init(|| {
            Arc::new(
                self.columns
                    .iter()
                    .map(|x| (x.name().clone(), x.dtype().clone()))
                    .collect(),
            )
        });

        // Catch stale caches (schema mutated without clear_schema) in debug builds.
        debug_assert_eq!(out.len(), self.width());

        out
    }
715
    /// Get a reference to the [`DataFrame`] columns.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
    ///                         "Symbol" => ["A", "C", "G", "T"])?;
    /// let columns: &[Column] = df.get_columns();
    ///
    /// assert_eq!(columns[0].name(), "Name");
    /// assert_eq!(columns[1].name(), "Symbol");
    /// # Ok::<(), PolarsError>(())
    /// ```
    #[inline]
    pub fn get_columns(&self) -> &[Column] {
        &self.columns
    }
734
    #[inline]
    /// Get mutable access to the underlying columns.
    ///
    /// # Safety
    ///
    /// The caller must ensure the length of all [`Series`] remains equal to `height` or
    /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
    /// The caller must ensure that the cached schema is cleared if it modifies the schema by
    /// calling [`DataFrame::clear_schema`].
    pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
        &mut self.columns
    }
747
    #[inline]
    /// Remove all the columns in the [`DataFrame`] but keep the `height`.
    pub fn clear_columns(&mut self) {
        // SAFETY: an empty column set trivially satisfies the length invariant;
        // the cached schema is cleared right after.
        unsafe { self.get_columns_mut() }.clear();
        self.clear_schema();
    }
754
    #[inline]
    /// Extend the columns without checking for name collisions or height.
    ///
    /// # Safety
    ///
    /// The caller needs to ensure that:
    /// - Column names are unique within the resulting [`DataFrame`].
    /// - The length of each appended column matches the height of the [`DataFrame`]. For
    ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
    ///   with [`DataFrame::set_height`].
    pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
        unsafe { self.get_columns_mut() }.extend(iter);
        self.clear_schema();
    }
769
    /// Take ownership of the underlying columns vec, consuming the `DataFrame`.
    pub fn take_columns(self) -> Vec<Column> {
        self.columns
    }
774
    /// Iterator over the columns as [`Series`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
    /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
    /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
    ///
    /// let mut iterator = df.iter();
    ///
    /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
    /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
    /// assert_eq!(iterator.next(), None);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
        self.materialized_column_iter()
    }
795
    /// Get the column names of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
    ///                         "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
    ///
    /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
        self.columns.iter().map(|s| s.name()).collect()
    }
809
    /// Get the [`Vec<PlSmallStr>`] representing the column names (owned clones).
    pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
        self.columns.iter().map(|s| s.name().clone()).collect()
    }
814
    /// Get the column names as borrowed `&str` slices.
    pub fn get_column_names_str(&self) -> Vec<&str> {
        self.columns.iter().map(|s| s.name().as_str()).collect()
    }
818
    /// Set the column names.
    ///
    /// Errors if the number of names differs from the width, or if the names
    /// are not unique.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
    /// df.set_column_names(["Set"])?;
    ///
    /// assert_eq!(df.get_column_names(), &["Set"]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
    where
        I: IntoIterator<Item = S>,
        S: Into<PlSmallStr>,
    {
        let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
        self._set_column_names_impl(names.as_slice())
    }
838
839 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
840 polars_ensure!(
841 names.len() == self.width(),
842 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
843 names.len(), self.width()
844 );
845 ensure_names_unique(names, |s| s.as_str())?;
846
847 let columns = mem::take(&mut self.columns);
848 self.columns = columns
849 .into_iter()
850 .zip(names)
851 .map(|(s, name)| {
852 let mut s = s;
853 s.rename(name.clone());
854 s
855 })
856 .collect();
857 self.clear_schema();
858 Ok(())
859 }
860
    /// Get the data types of the columns in the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
    ///                                "Fraction" => [0.965, 0.035])?;
    ///
    /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn dtypes(&self) -> Vec<DataType> {
        self.columns.iter().map(|s| s.dtype().clone()).collect()
    }
876
    /// Get the first column that is an eager `Column::Series`, if any.
    pub(crate) fn first_series_column(&self) -> Option<&Series> {
        self.columns.iter().find_map(|col| col.as_series())
    }
880
    /// The number of chunks for the first column.
    pub fn first_col_n_chunks(&self) -> usize {
        match self.first_series_column() {
            // No columns at all: zero chunks.
            None if self.columns.is_empty() => 0,
            // Columns exist but none is a Series; treat as one logical chunk.
            None => 1,
            Some(s) => s.n_chunks(),
        }
    }
889
890 /// The highest number of chunks for any column.
891 pub fn max_n_chunks(&self) -> usize {
892 self.columns
893 .iter()
894 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
895 .max()
896 .unwrap_or(0)
897 }
898
    /// Get the schema fields of the [`DataFrame`] as owned [`Field`]s.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
    ///                            "Fraction" => [0.708, 0.292])?;
    ///
    /// let f1: Field = Field::new("Surface type".into(), DataType::String);
    /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
    ///
    /// assert_eq!(earth.fields(), &[f1, f2]);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn fields(&self) -> Vec<Field> {
        self.columns
            .iter()
            .map(|s| s.field().into_owned())
            .collect()
    }
920
    /// Get (height, width) of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df0: DataFrame = DataFrame::default();
    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
    ///                          "2" => [1, 2, 3, 4, 5])?;
    ///
    /// assert_eq!(df0.shape(), (0, 0));
    /// assert_eq!(df1.shape(), (5, 1));
    /// assert_eq!(df2.shape(), (5, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn shape(&self) -> (usize, usize) {
        (self.height, self.columns.len())
    }
940
    /// Get the width of the [`DataFrame`] which is the number of columns.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df0: DataFrame = DataFrame::default();
    /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
    /// let df2: DataFrame = df!("Series 1" => [0; 0],
    ///                          "Series 2" => [0; 0])?;
    ///
    /// assert_eq!(df0.width(), 0);
    /// assert_eq!(df1.width(), 1);
    /// assert_eq!(df2.width(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn width(&self) -> usize {
        self.columns.len()
    }
960
    /// Get the height of the [`DataFrame`] which is the number of rows.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df0: DataFrame = DataFrame::default();
    /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
    /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
    ///
    /// assert_eq!(df0.height(), 0);
    /// assert_eq!(df1.height(), 2);
    /// assert_eq!(df2.height(), 5);
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn height(&self) -> usize {
        self.height
    }
979
980 /// Returns the size as number of rows * number of columns
981 pub fn size(&self) -> usize {
982 let s = self.shape();
983 s.0 * s.1
984 }
985
    /// Returns `true` if the [`DataFrame`] contains no rows or no columns.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df1: DataFrame = DataFrame::default();
    /// assert!(df1.is_empty());
    ///
    /// let df2: DataFrame = df!("First name" => ["Forever"],
    ///                          "Last name" => ["Alone"])?;
    /// assert!(!df2.is_empty());
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn is_empty(&self) -> bool {
        // Empty in either dimension counts: zero rows OR zero columns.
        matches!(self.shape(), (0, _) | (_, 0))
    }
1003
    /// Set the height (i.e. number of rows) of this [`DataFrame`].
    ///
    /// # Safety
    ///
    /// This needs to be equal to the length of all the columns.
    pub unsafe fn set_height(&mut self, height: usize) {
        self.height = height;
    }
1012
1013 /// Add multiple [`Series`] to a [`DataFrame`].
1014 /// The added `Series` are required to have the same length.
1015 ///
1016 /// # Example
1017 ///
1018 /// ```rust
1019 /// # use polars_core::prelude::*;
1020 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1021 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1022 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1023 ///
1024 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1025 /// assert_eq!(df2.shape(), (3, 3));
1026 /// println!("{}", df2);
1027 /// # Ok::<(), PolarsError>(())
1028 /// ```
1029 ///
1030 /// Output:
1031 ///
1032 /// ```text
1033 /// shape: (3, 3)
1034 /// +---------+--------+----------+
1035 /// | Element | Proton | Electron |
1036 /// | --- | --- | --- |
1037 /// | str | i32 | i32 |
1038 /// +=========+========+==========+
1039 /// | Copper | 29 | 29 |
1040 /// +---------+--------+----------+
1041 /// | Silver | 47 | 47 |
1042 /// +---------+--------+----------+
1043 /// | Gold | 79 | 79 |
1044 /// +---------+--------+----------+
1045 /// ```
1046 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1047 let mut new_cols = self.columns.clone();
1048 new_cols.extend_from_slice(columns);
1049 DataFrame::new(new_cols)
1050 }
1051
1052 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1053 ///
1054 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1055 ///
1056 /// # Example
1057 ///
1058 /// ```rust
1059 /// # use polars_core::prelude::*;
1060 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1061 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1062 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1063 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1064 ///
1065 /// let df3: DataFrame = df1.vstack(&df2)?;
1066 ///
1067 /// assert_eq!(df3.shape(), (5, 2));
1068 /// println!("{}", df3);
1069 /// # Ok::<(), PolarsError>(())
1070 /// ```
1071 ///
1072 /// Output:
1073 ///
1074 /// ```text
1075 /// shape: (5, 2)
1076 /// +-----------+-------------------+
1077 /// | Element | Melting Point (K) |
1078 /// | --- | --- |
1079 /// | str | f64 |
1080 /// +===========+===================+
1081 /// | Copper | 1357.77 |
1082 /// +-----------+-------------------+
1083 /// | Silver | 1234.93 |
1084 /// +-----------+-------------------+
1085 /// | Gold | 1337.33 |
1086 /// +-----------+-------------------+
1087 /// | Platinum | 2041.4 |
1088 /// +-----------+-------------------+
1089 /// | Palladium | 1828.05 |
1090 /// +-----------+-------------------+
1091 /// ```
1092 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1093 let mut df = self.clone();
1094 df.vstack_mut(other)?;
1095 Ok(df)
1096 }
1097
1098 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1099 ///
1100 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1101 ///
1102 /// # Example
1103 ///
1104 /// ```rust
1105 /// # use polars_core::prelude::*;
1106 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1107 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1108 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1109 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1110 ///
1111 /// df1.vstack_mut(&df2)?;
1112 ///
1113 /// assert_eq!(df1.shape(), (5, 2));
1114 /// println!("{}", df1);
1115 /// # Ok::<(), PolarsError>(())
1116 /// ```
1117 ///
1118 /// Output:
1119 ///
1120 /// ```text
1121 /// shape: (5, 2)
1122 /// +-----------+-------------------+
1123 /// | Element | Melting Point (K) |
1124 /// | --- | --- |
1125 /// | str | f64 |
1126 /// +===========+===================+
1127 /// | Copper | 1357.77 |
1128 /// +-----------+-------------------+
1129 /// | Silver | 1234.93 |
1130 /// +-----------+-------------------+
1131 /// | Gold | 1337.33 |
1132 /// +-----------+-------------------+
1133 /// | Platinum | 2041.4 |
1134 /// +-----------+-------------------+
1135 /// | Palladium | 1828.05 |
1136 /// +-----------+-------------------+
1137 /// ```
1138 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1139 if self.width() != other.width() {
1140 polars_ensure!(
1141 self.width() == 0,
1142 ShapeMismatch:
1143 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1144 self.width(), other.width(),
1145 );
1146 self.columns.clone_from(&other.columns);
1147 self.height = other.height;
1148 return Ok(self);
1149 }
1150
1151 self.columns
1152 .iter_mut()
1153 .zip(other.columns.iter())
1154 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1155 ensure_can_extend(&*left, right)?;
1156 left.append(right).map_err(|e| {
1157 e.context(format!("failed to vstack column '{}'", right.name()).into())
1158 })?;
1159 Ok(())
1160 })?;
1161 self.height += other.height;
1162 Ok(self)
1163 }
1164
1165 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1166 ///
1167 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1168 ///
1169 /// # Panics
1170 /// Panics if the schema's don't match.
1171 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1172 self.columns
1173 .iter_mut()
1174 .zip(other.columns.iter())
1175 .for_each(|(left, right)| {
1176 left.append(right)
1177 .map_err(|e| {
1178 e.context(format!("failed to vstack column '{}'", right.name()).into())
1179 })
1180 .expect("should not fail");
1181 });
1182 self.height += other.height;
1183 }
1184
1185 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1186 ///
1187 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1188 ///
1189 /// # Panics
1190 /// Panics if the schema's don't match.
1191 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1192 self.columns
1193 .iter_mut()
1194 .zip(other.columns)
1195 .for_each(|(left, right)| {
1196 left.append_owned(right).expect("should not fail");
1197 });
1198 self.height += other.height;
1199 }
1200
1201 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1202 ///
1203 /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1204 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1205 ///
1206 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1207 /// and thus will yield faster queries.
1208 ///
1209 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1210 /// online operations where you add `n` rows and rerun a query.
1211 ///
1212 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1213 /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
1214 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1215 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1216 polars_ensure!(
1217 self.width() == other.width(),
1218 ShapeMismatch:
1219 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1220 self.width(), other.width(),
1221 );
1222
1223 self.columns
1224 .iter_mut()
1225 .zip(other.columns.iter())
1226 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1227 ensure_can_extend(&*left, right)?;
1228 left.extend(right).map_err(|e| {
1229 e.context(format!("failed to extend column '{}'", right.name()).into())
1230 })?;
1231 Ok(())
1232 })?;
1233 self.height += other.height;
1234 self.clear_schema();
1235 Ok(())
1236 }
1237
1238 /// Remove a column by name and return the column removed.
1239 ///
1240 /// # Example
1241 ///
1242 /// ```rust
1243 /// # use polars_core::prelude::*;
1244 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1245 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1246 ///
1247 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1248 /// assert!(s1.is_err());
1249 ///
1250 /// let s2: Column = df.drop_in_place("Animal")?;
1251 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1252 /// # Ok::<(), PolarsError>(())
1253 /// ```
1254 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1255 let idx = self.check_name_to_idx(name)?;
1256 self.clear_schema();
1257 Ok(self.columns.remove(idx))
1258 }
1259
1260 /// Return a new [`DataFrame`] where all null values are dropped.
1261 ///
1262 /// # Example
1263 ///
1264 /// ```no_run
1265 /// # use polars_core::prelude::*;
1266 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1267 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1268 /// assert_eq!(df1.shape(), (3, 2));
1269 ///
1270 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1271 /// assert_eq!(df2.shape(), (1, 2));
1272 /// println!("{}", df2);
1273 /// # Ok::<(), PolarsError>(())
1274 /// ```
1275 ///
1276 /// Output:
1277 ///
1278 /// ```text
1279 /// shape: (1, 2)
1280 /// +---------+---------------------+
1281 /// | Country | Tax revenue (% GDP) |
1282 /// | --- | --- |
1283 /// | str | f64 |
1284 /// +=========+=====================+
1285 /// | Malta | 32.7 |
1286 /// +---------+---------------------+
1287 /// ```
1288 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1289 where
1290 for<'a> &'a S: Into<PlSmallStr>,
1291 {
1292 if let Some(v) = subset {
1293 let v = self.select_columns(v)?;
1294 self._drop_nulls_impl(v.as_slice())
1295 } else {
1296 self._drop_nulls_impl(self.columns.as_slice())
1297 }
1298 }
1299
1300 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1301 // fast path for no nulls in df
1302 if subset.iter().all(|s| !s.has_nulls()) {
1303 return Ok(self.clone());
1304 }
1305
1306 let mut iter = subset.iter();
1307
1308 let mask = iter
1309 .next()
1310 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1311 let mut mask = mask.is_not_null();
1312
1313 for c in iter {
1314 mask = mask & c.is_not_null();
1315 }
1316 self.filter(&mask)
1317 }
1318
1319 /// Drop a column by name.
1320 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1321 /// the current one in place.
1322 ///
1323 /// # Example
1324 ///
1325 /// ```rust
1326 /// # use polars_core::prelude::*;
1327 /// let df1: DataFrame = df!("Ray type" => ["Ī±", "Ī²", "X", "Ī³"])?;
1328 /// let df2: DataFrame = df1.drop("Ray type")?;
1329 ///
1330 /// assert!(df2.is_empty());
1331 /// # Ok::<(), PolarsError>(())
1332 /// ```
1333 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1334 let idx = self.check_name_to_idx(name)?;
1335 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1336
1337 self.columns.iter().enumerate().for_each(|(i, s)| {
1338 if i != idx {
1339 new_cols.push(s.clone())
1340 }
1341 });
1342
1343 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1344 }
1345
1346 /// Drop columns that are in `names`.
1347 pub fn drop_many<I, S>(&self, names: I) -> Self
1348 where
1349 I: IntoIterator<Item = S>,
1350 S: Into<PlSmallStr>,
1351 {
1352 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1353 self.drop_many_amortized(&names)
1354 }
1355
1356 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1357 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1358 if names.is_empty() {
1359 return self.clone();
1360 }
1361 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1362 self.columns.iter().for_each(|s| {
1363 if !names.contains(s.name()) {
1364 new_cols.push(s.clone())
1365 }
1366 });
1367
1368 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1369 }
1370
1371 /// Insert a new column at a given index without checking for duplicates.
1372 /// This can leave the [`DataFrame`] at an invalid state
1373 fn insert_column_no_name_check(
1374 &mut self,
1375 index: usize,
1376 column: Column,
1377 ) -> PolarsResult<&mut Self> {
1378 polars_ensure!(
1379 self.width() == 0 || column.len() == self.height(),
1380 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1381 column.len(), self.height(),
1382 );
1383
1384 if self.width() == 0 {
1385 self.height = column.len();
1386 }
1387
1388 self.columns.insert(index, column);
1389 self.clear_schema();
1390 Ok(self)
1391 }
1392
1393 /// Insert a new column at a given index.
1394 pub fn insert_column<S: IntoColumn>(
1395 &mut self,
1396 index: usize,
1397 column: S,
1398 ) -> PolarsResult<&mut Self> {
1399 let column = column.into_column();
1400 self.check_already_present(column.name().as_str())?;
1401 self.insert_column_no_name_check(index, column)
1402 }
1403
1404 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1405 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1406 self.replace_column(idx, column)?;
1407 } else {
1408 if self.width() == 0 {
1409 self.height = column.len();
1410 }
1411
1412 self.columns.push(column);
1413 self.clear_schema();
1414 }
1415 Ok(())
1416 }
1417
1418 /// Add a new column to this [`DataFrame`] or replace an existing one.
1419 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1420 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1421 let height = df.height();
1422 if column.len() == 1 && height > 1 {
1423 column = column.new_from_index(0, height);
1424 }
1425
1426 if column.len() == height || df.get_columns().is_empty() {
1427 df.add_column_by_search(column)?;
1428 Ok(df)
1429 }
1430 // special case for literals
1431 else if height == 0 && column.len() == 1 {
1432 let s = column.clear();
1433 df.add_column_by_search(s)?;
1434 Ok(df)
1435 } else {
1436 polars_bail!(
1437 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1438 column.len(), height,
1439 );
1440 }
1441 }
1442 let column = column.into_column();
1443 inner(self, column)
1444 }
1445
    /// Adds a column to the [`DataFrame`] without doing any checks
    /// on length or duplicates.
    ///
    /// # Safety
    /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
    pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
        // Cheap sanity checks in debug builds only; release builds trust the caller.
        debug_assert!(self.width() == 0 || self.height() == column.len());
        debug_assert!(self.get_column_index(column.name().as_str()).is_none());

        // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
        // properly for `width` == 0.
        if self.width() == 0 {
            unsafe { self.set_height(column.len()) };
        }
        unsafe { self.get_columns_mut() }.push(column);
        // The column set changed, so the cached schema is stale.
        self.clear_schema();

        self
    }
1465
    // Note: Schema can be both input or output_schema
    //
    // Uses `schema` as a lookup accelerator for the target position of `c`.
    // The hit is verified against the actual columns because the schema may
    // not correspond to `self` exactly; on mismatch we fall back to search.
    fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
        let name = c.name();
        if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
            if self.columns.get(idx).map(|s| s.name()) != Some(name) {
                // Given schema is output_schema and we can push.
                if idx == self.columns.len() {
                    // An empty frame adopts the column's length as height.
                    if self.width() == 0 {
                        self.height = c.len();
                    }

                    self.columns.push(c);
                    self.clear_schema();
                }
                // Schema is incorrect fallback to search
                else {
                    debug_assert!(false);
                    self.add_column_by_search(c)?;
                }
            } else {
                // Verified hit: replace the existing column at `idx`.
                self.replace_column(idx, c)?;
            }
        } else {
            // Name unknown to the schema: plain append.
            if self.width() == 0 {
                self.height = c.len();
            }

            self.columns.push(c);
            self.clear_schema();
        }

        Ok(())
    }
1499
1500 // Note: Schema can be both input or output_schema
1501 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1502 for (i, s) in series.into_iter().enumerate() {
1503 // we need to branch here
1504 // because users can add multiple columns with the same name
1505 if i == 0 || schema.get(s.name().as_str()).is_some() {
1506 self.with_column_and_schema(s.into_column(), schema)?;
1507 } else {
1508 self.with_column(s.clone().into_column())?;
1509 }
1510 }
1511 Ok(())
1512 }
1513
1514 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1515 for (i, s) in columns.into_iter().enumerate() {
1516 // we need to branch here
1517 // because users can add multiple columns with the same name
1518 if i == 0 || schema.get(s.name().as_str()).is_some() {
1519 self.with_column_and_schema(s, schema)?;
1520 } else {
1521 self.with_column(s.clone())?;
1522 }
1523 }
1524
1525 Ok(())
1526 }
1527
1528 /// Add a new column to this [`DataFrame`] or replace an existing one.
1529 /// Uses an existing schema to amortize lookups.
1530 /// If the schema is incorrect, we will fallback to linear search.
1531 ///
1532 /// Note: Schema can be both input or output_schema
1533 pub fn with_column_and_schema<C: IntoColumn>(
1534 &mut self,
1535 column: C,
1536 schema: &Schema,
1537 ) -> PolarsResult<&mut Self> {
1538 let mut column = column.into_column();
1539
1540 let height = self.height();
1541 if column.len() == 1 && height > 1 {
1542 column = column.new_from_index(0, height);
1543 }
1544
1545 if column.len() == height || self.columns.is_empty() {
1546 self.add_column_by_schema(column, schema)?;
1547 Ok(self)
1548 }
1549 // special case for literals
1550 else if height == 0 && column.len() == 1 {
1551 let s = column.clear();
1552 self.add_column_by_schema(s, schema)?;
1553 Ok(self)
1554 } else {
1555 polars_bail!(
1556 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1557 column.len(), height,
1558 );
1559 }
1560 }
1561
1562 /// Get a row in the [`DataFrame`]. Beware this is slow.
1563 ///
1564 /// # Example
1565 ///
1566 /// ```
1567 /// # use polars_core::prelude::*;
1568 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1569 /// df.get(idx)
1570 /// }
1571 /// ```
1572 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1573 match self.columns.first() {
1574 Some(s) => {
1575 if s.len() <= idx {
1576 return None;
1577 }
1578 },
1579 None => return None,
1580 }
1581 // SAFETY: we just checked bounds
1582 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1583 }
1584
    /// Select a [`Series`] by index.
    ///
    /// Returns `None` if `idx` is out of bounds.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
    ///                         "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
    ///
    /// let s1: Option<&Column> = df.select_at_idx(0);
    /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
    ///
    /// assert_eq!(s1, Some(&s2));
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
        self.columns.get(idx)
    }
1603
    /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
    ///
    /// # Panics
    ///
    /// Panics if the range is out of bounds for the frame's width, or if the
    /// range's start is greater than its end (matching slice-indexing behavior).
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df! {
    ///     "0" => [0, 0, 0],
    ///     "1" => [1, 1, 1],
    ///     "2" => [2, 2, 2]
    /// }?;
    ///
    /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
    /// assert!(df.equals(&df.select_by_range(..)?));
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
    where
        R: ops::RangeBounds<usize>,
    {
        // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
        // because it is the nightly feature. We should change here if this function were stable.
        fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
        where
            R: ops::RangeBounds<usize>,
        {
            let len = bounds.end;

            // Normalize the start bound to an inclusive index.
            let start: ops::Bound<&usize> = range.start_bound();
            let start = match start {
                ops::Bound::Included(&start) => start,
                ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
                    panic!("attempted to index slice from after maximum usize");
                }),
                ops::Bound::Unbounded => 0,
            };

            // Normalize the end bound to an exclusive index.
            let end: ops::Bound<&usize> = range.end_bound();
            let end = match end {
                ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
                    panic!("attempted to index slice up to maximum usize");
                }),
                ops::Bound::Excluded(&end) => end,
                ops::Bound::Unbounded => len,
            };

            if start > end {
                panic!("slice index starts at {start} but ends at {end}");
            }
            if end > len {
                panic!("range end index {end} out of range for slice of length {len}",);
            }

            ops::Range { start, end }
        }

        // Resolve the range against the column names, then select by name.
        let colnames = self.get_column_names_owned();
        let range = get_range(range, ..colnames.len());

        self._select_impl(&colnames[range])
    }
1665
1666 /// Get column index of a [`Series`] by name.
1667 /// # Example
1668 ///
1669 /// ```rust
1670 /// # use polars_core::prelude::*;
1671 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1672 /// "Health" => [100, 200, 500],
1673 /// "Mana" => [250, 100, 0],
1674 /// "Strength" => [30, 150, 300])?;
1675 ///
1676 /// assert_eq!(df.get_column_index("Name"), Some(0));
1677 /// assert_eq!(df.get_column_index("Health"), Some(1));
1678 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1679 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1680 /// assert_eq!(df.get_column_index("Haste"), None);
1681 /// # Ok::<(), PolarsError>(())
1682 /// ```
1683 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1684 let schema = self.schema();
1685 if let Some(idx) = schema.index_of(name) {
1686 if self
1687 .get_columns()
1688 .get(idx)
1689 .is_some_and(|c| c.name() == name)
1690 {
1691 return Some(idx);
1692 }
1693 }
1694
1695 self.columns.iter().position(|s| s.name().as_str() == name)
1696 }
1697
1698 /// Get column index of a [`Series`] by name.
1699 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1700 self.get_column_index(name)
1701 .ok_or_else(|| polars_err!(col_not_found = name))
1702 }
1703
1704 /// Select a single column by name.
1705 ///
1706 /// # Example
1707 ///
1708 /// ```rust
1709 /// # use polars_core::prelude::*;
1710 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1711 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1712 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1713 ///
1714 /// assert_eq!(df.column("Password")?, &s1);
1715 /// # Ok::<(), PolarsError>(())
1716 /// ```
1717 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1718 let idx = self.try_get_column_index(name)?;
1719 Ok(self.select_at_idx(idx).unwrap())
1720 }
1721
1722 /// Selected multiple columns by name.
1723 ///
1724 /// # Example
1725 ///
1726 /// ```rust
1727 /// # use polars_core::prelude::*;
1728 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1729 /// "Max weight (kg)" => [16.0, 35.89])?;
1730 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1731 ///
1732 /// assert_eq!(&df[0], sv[0]);
1733 /// assert_eq!(&df[1], sv[1]);
1734 /// # Ok::<(), PolarsError>(())
1735 /// ```
1736 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1737 where
1738 I: IntoIterator<Item = S>,
1739 S: AsRef<str>,
1740 {
1741 names
1742 .into_iter()
1743 .map(|name| self.column(name.as_ref()))
1744 .collect()
1745 }
1746
1747 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1748 ///
1749 /// # Examples
1750 ///
1751 /// ```
1752 /// # use polars_core::prelude::*;
1753 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1754 /// df.select(["foo", "bar"])
1755 /// }
1756 /// ```
1757 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1758 where
1759 I: IntoIterator<Item = S>,
1760 S: Into<PlSmallStr>,
1761 {
1762 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1763 self._select_impl(cols.as_slice())
1764 }
1765
1766 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1767 ensure_names_unique(cols, |s| s.as_str())?;
1768 self._select_impl_unchecked(cols)
1769 }
1770
1771 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1772 let selected = self.select_columns_impl(cols)?;
1773 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1774 }
1775
1776 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1777 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1778 where
1779 I: IntoIterator<Item = S>,
1780 S: Into<PlSmallStr>,
1781 {
1782 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1783 self._select_with_schema_impl(&cols, schema, true)
1784 }
1785
1786 /// Select with a known schema without checking for duplicates in `selection`.
1787 /// The schema names must match the column names of this DataFrame.
1788 pub fn select_with_schema_unchecked<I, S>(
1789 &self,
1790 selection: I,
1791 schema: &Schema,
1792 ) -> PolarsResult<Self>
1793 where
1794 I: IntoIterator<Item = S>,
1795 S: Into<PlSmallStr>,
1796 {
1797 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1798 self._select_with_schema_impl(&cols, schema, false)
1799 }
1800
1801 /// * The schema names must match the column names of this DataFrame.
1802 pub fn _select_with_schema_impl(
1803 &self,
1804 cols: &[PlSmallStr],
1805 schema: &Schema,
1806 check_duplicates: bool,
1807 ) -> PolarsResult<Self> {
1808 if check_duplicates {
1809 ensure_names_unique(cols, |s| s.as_str())?;
1810 }
1811
1812 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1813 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814 }
1815
1816 /// A non generic implementation to reduce compiler bloat.
1817 fn select_columns_impl_with_schema(
1818 &self,
1819 cols: &[PlSmallStr],
1820 schema: &Schema,
1821 ) -> PolarsResult<Vec<Column>> {
1822 if cfg!(debug_assertions) {
1823 ensure_matching_schema_names(schema, self.schema())?;
1824 }
1825
1826 cols.iter()
1827 .map(|name| {
1828 let index = schema.try_get_full(name.as_str())?.0;
1829 Ok(self.columns[index].clone())
1830 })
1831 .collect()
1832 }
1833
1834 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1835 where
1836 I: IntoIterator<Item = S>,
1837 S: Into<PlSmallStr>,
1838 {
1839 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1840 self.select_physical_impl(&cols)
1841 }
1842
1843 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1844 ensure_names_unique(cols, |s| s.as_str())?;
1845 let selected = self.select_columns_physical_impl(cols)?;
1846 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1847 }
1848
1849 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1850 ///
1851 /// # Example
1852 ///
1853 /// ```rust
1854 /// # use polars_core::prelude::*;
1855 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1856 /// "Carbon" => [1, 2, 3],
1857 /// "Hydrogen" => [4, 6, 8])?;
1858 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1859 ///
1860 /// assert_eq!(df["Carbon"], sv[0]);
1861 /// assert_eq!(df["Hydrogen"], sv[1]);
1862 /// # Ok::<(), PolarsError>(())
1863 /// ```
1864 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1865 let cols = selection.into_vec();
1866 self.select_columns_impl(&cols)
1867 }
1868
1869 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1870 self.columns
1871 .iter()
1872 .enumerate()
1873 .map(|(i, s)| (s.name().as_str(), i))
1874 .collect()
1875 }
1876
1877 /// A non generic implementation to reduce compiler bloat.
1878 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1879 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1880 let name_to_idx = self._names_to_idx_map();
1881 cols.iter()
1882 .map(|name| {
1883 let idx = *name_to_idx
1884 .get(name.as_str())
1885 .ok_or_else(|| polars_err!(col_not_found = name))?;
1886 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1887 })
1888 .collect::<PolarsResult<Vec<_>>>()?
1889 } else {
1890 cols.iter()
1891 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1892 .collect::<PolarsResult<Vec<_>>>()?
1893 };
1894
1895 Ok(selected)
1896 }
1897
1898 /// A non generic implementation to reduce compiler bloat.
1899 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1900 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1901 // we hash, because there are user that having millions of columns.
1902 // # https://github.com/pola-rs/polars/issues/1023
1903 let name_to_idx = self._names_to_idx_map();
1904
1905 cols.iter()
1906 .map(|name| {
1907 let idx = *name_to_idx
1908 .get(name.as_str())
1909 .ok_or_else(|| polars_err!(col_not_found = name))?;
1910 Ok(self.select_at_idx(idx).unwrap().clone())
1911 })
1912 .collect::<PolarsResult<Vec<_>>>()?
1913 } else {
1914 cols.iter()
1915 .map(|c| self.column(c.as_str()).cloned())
1916 .collect::<PolarsResult<Vec<_>>>()?
1917 };
1918
1919 Ok(selected)
1920 }
1921
1922 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1923 // If there is a filtered column just see how many columns there are left.
1924 if let Some(fst) = filtered.first() {
1925 return fst.len();
1926 }
1927
1928 // Otherwise, count the number of values that would be filtered and return that height.
1929 let num_trues = mask.num_trues();
1930 if mask.len() == self.height() {
1931 num_trues
1932 } else {
1933 // This is for broadcasting masks
1934 debug_assert!(num_trues == 0 || num_trues == 1);
1935 self.height() * num_trues
1936 }
1937 }
1938
1939 /// Take the [`DataFrame`] rows by a boolean mask.
1940 ///
1941 /// # Example
1942 ///
1943 /// ```
1944 /// # use polars_core::prelude::*;
1945 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1946 /// let mask = df.column("sepal_width")?.is_not_null();
1947 /// df.filter(&mask)
1948 /// }
1949 /// ```
1950 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1951 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1952 let height = self.filter_height(&new_col, mask);
1953
1954 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1955 }
1956
1957 /// Same as `filter` but does not parallelize.
1958 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1959 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1960 let height = self.filter_height(&new_col, mask);
1961
1962 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1963 }
1964
1965 /// Take [`DataFrame`] rows by index values.
1966 ///
1967 /// # Example
1968 ///
1969 /// ```
1970 /// # use polars_core::prelude::*;
1971 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1972 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1973 /// df.take(&idx)
1974 /// }
1975 /// ```
1976 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1977 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
1978
1979 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
1980 }
1981
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
        // Delegates with `allow_threads = true`: columns are gathered in parallel.
        self.take_unchecked_impl(idx, true)
    }
1987
1988 /// # Safety
1989 /// The indices must be in-bounds.
1990 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1991 let cols = if allow_threads {
1992 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
1993 } else {
1994 self._apply_columns(&|s| s.take_unchecked(idx))
1995 };
1996 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
1997 }
1998
    /// Take [`DataFrame`] rows by a slice of indices, without bounds checks.
    ///
    /// Convenience wrapper around [`DataFrame::take_slice_unchecked_impl`] that
    /// always allows multithreaded gathering.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
        self.take_slice_unchecked_impl(idx, true)
    }
2004
2005 /// # Safety
2006 /// The indices must be in-bounds.
2007 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2008 let cols = if allow_threads {
2009 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2010 } else {
2011 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2012 };
2013 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2014 }
2015
2016 /// Rename a column in the [`DataFrame`].
2017 ///
2018 /// # Example
2019 ///
2020 /// ```
2021 /// # use polars_core::prelude::*;
2022 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2023 /// let original_name = "foo";
2024 /// let new_name = "bar";
2025 /// df.rename(original_name, new_name.into())
2026 /// }
2027 /// ```
2028 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2029 if column == name.as_str() {
2030 return Ok(self);
2031 }
2032 polars_ensure!(
2033 !self.schema().contains(&name),
2034 Duplicate: "column rename attempted with already existing name \"{name}\""
2035 );
2036
2037 self.get_column_index(column)
2038 .and_then(|idx| self.columns.get_mut(idx))
2039 .ok_or_else(|| polars_err!(col_not_found = column))
2040 .map(|c| c.rename(name))?;
2041 Ok(self)
2042 }
2043
2044 /// Sort [`DataFrame`] in place.
2045 ///
2046 /// See [`DataFrame::sort`] for more instruction.
2047 pub fn sort_in_place(
2048 &mut self,
2049 by: impl IntoVec<PlSmallStr>,
2050 sort_options: SortMultipleOptions,
2051 ) -> PolarsResult<&mut Self> {
2052 let by_column = self.select_columns(by)?;
2053 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2054 Ok(self)
2055 }
2056
    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// `by_column` holds the (already evaluated) sort keys; `slice` optionally
    /// restricts the output to `(offset, len)` of the sorted result.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        mut sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return if let Some((offset, len)) = slice {
                Ok(self.slice(offset, len))
            } else {
                Ok(self.clone())
            };
        }

        // note that the by_column argument also contains evaluated expression from
        // polars-lazy that may not even be present in this dataframe. therefore
        // when we try to set the first columns as sorted, we ignore the error as
        // expressions are not present (they are renamed to _POLARS_SORT_COLUMN_i.
        let first_descending = sort_options.descending[0];
        let first_by_column = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe
            let _ = df.apply(&first_by_column, |s| {
                let mut s = s.clone();
                if first_descending {
                    s.set_sorted_flag(IsSorted::Descending)
                } else {
                    s.set_sorted_flag(IsSorted::Ascending)
                }
                s
            });
        };
        // Empty frame: nothing to sort, but still set the sorted flag.
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A head-slice of a sort is a bottom-k query; dispatch to the cheaper kernel.
        if let Some((0, k)) = slice {
            if k < self.len() {
                return self.bottom_k_impl(k, by_column, sort_options);
            }
        }
        // Check if the required column is already sorted; if so we can exit early
        // We can do so when there is only one column to sort by, for multiple columns
        // it will be complicated to do so
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If null count is 0 then nulls_last doesnt matter
            // Safe to get value at last position since the dataframe is not empty (taken care above)
            let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
                && ((by_column[0].null_count() == 0)
                    || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
                        == sort_options.nulls_last[0]);

            if no_sorting_required {
                return if let Some((offset, len)) = slice {
                    Ok(self.slice(offset, len))
                } else {
                    Ok(self.clone())
                };
            }
        }

        #[cfg(feature = "dtype-struct")]
        let has_struct = by_column
            .iter()
            .any(|s| matches!(s.dtype(), DataType::Struct(_)));

        #[cfg(not(feature = "dtype-struct"))]
        #[allow(non_upper_case_globals)]
        const has_struct: bool = false;

        // a lot of indirection in both sorting and take
        let mut df = self.clone();
        let df = df.as_single_chunk_par();
        // Compute the gather indices. Single non-struct key uses the plain
        // arg-sort kernels; anything else goes through the multi-key paths.
        let mut take = match (by_column.len(), has_struct) {
            (1, false) => {
                let s = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // fast path for a frame with a single series
                // no need to compute the sort indices and then take by these indices
                // simply sort and return as frame
                if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
                    let mut out = s.sort_with(options)?;
                    if let Some((offset, len)) = slice {
                        out = out.slice(offset, len);
                    }
                    return Ok(out.into_frame());
                }
                s.arg_sort(options)
            },
            _ => {
                // Row-format sort handles all-nulls-last, struct keys, or an
                // explicit opt-in via the POLARS_ROW_FMT_SORT env var.
                if sort_options.nulls_last.iter().all(|&x| x)
                    || has_struct
                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
                {
                    argsort_multiple_row_fmt(
                        &by_column,
                        sort_options.descending,
                        sort_options.nulls_last,
                        sort_options.multithreaded,
                    )?
                } else {
                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
                    first
                        .as_materialized_series()
                        .arg_sort_multiple(&other, &sort_options)?
                }
            },
        };

        if let Some((offset, len)) = slice {
            take = take.slice(offset, len);
        }

        // SAFETY:
        // the created indices are in bounds
        let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
        set_sorted(&mut df);
        Ok(df)
    }
2203
2204 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2205 ///
2206 /// This dataframe does not necessarily have a specified schema and may be changed at any
2207 /// point. It is primarily used for debugging.
2208 pub fn _to_metadata(&self) -> DataFrame {
2209 let num_columns = self.columns.len();
2210
2211 let mut column_names =
2212 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2213 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2214 let mut sorted_asc_ca =
2215 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2216 let mut sorted_dsc_ca =
2217 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2218 let mut fast_explode_list_ca =
2219 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2220 let mut materialized_at_ca =
2221 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2222
2223 for col in &self.columns {
2224 let flags = col.get_flags();
2225
2226 let (repr, materialized_at) = match col {
2227 Column::Series(s) => ("series", s.materialized_at()),
2228 Column::Partitioned(_) => ("partitioned", None),
2229 Column::Scalar(_) => ("scalar", None),
2230 };
2231 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2232 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2233 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2234
2235 column_names.append_value(col.name().clone());
2236 repr_ca.append_value(repr);
2237 sorted_asc_ca.append_value(sorted_asc);
2238 sorted_dsc_ca.append_value(sorted_dsc);
2239 fast_explode_list_ca.append_value(fast_explode_list);
2240 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2241 }
2242
2243 unsafe {
2244 DataFrame::new_no_checks(
2245 self.width(),
2246 vec![
2247 column_names.finish().into_column(),
2248 repr_ca.finish().into_column(),
2249 sorted_asc_ca.finish().into_column(),
2250 sorted_dsc_ca.finish().into_column(),
2251 fast_explode_list_ca.finish().into_column(),
2252 materialized_at_ca.finish().into_column(),
2253 ],
2254 )
2255 }
2256 }
2257
2258 /// Return a sorted clone of this [`DataFrame`].
2259 ///
2260 /// In many cases the output chunks will be continuous in memory but this is not guaranteed
2261 /// # Example
2262 ///
2263 /// Sort by a single column with default options:
2264 /// ```
2265 /// # use polars_core::prelude::*;
2266 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2267 /// df.sort(["sepal_width"], Default::default())
2268 /// }
2269 /// ```
2270 /// Sort by a single column with specific order:
2271 /// ```
2272 /// # use polars_core::prelude::*;
2273 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2274 /// df.sort(
2275 /// ["sepal_width"],
2276 /// SortMultipleOptions::new()
2277 /// .with_order_descending(descending)
2278 /// )
2279 /// }
2280 /// ```
2281 /// Sort by multiple columns with specifying order for each column:
2282 /// ```
2283 /// # use polars_core::prelude::*;
2284 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2285 /// df.sort(
2286 /// ["sepal_width", "sepal_length"],
2287 /// SortMultipleOptions::new()
2288 /// .with_order_descending_multi([false, true])
2289 /// )
2290 /// }
2291 /// ```
2292 /// See [`SortMultipleOptions`] for more options.
2293 ///
2294 /// Also see [`DataFrame::sort_in_place`].
2295 pub fn sort(
2296 &self,
2297 by: impl IntoVec<PlSmallStr>,
2298 sort_options: SortMultipleOptions,
2299 ) -> PolarsResult<Self> {
2300 let mut df = self.clone();
2301 df.sort_in_place(by, sort_options)?;
2302 Ok(df)
2303 }
2304
    /// Replace a column with a [`Series`].
    ///
    /// Errors if no column named `column` exists. The replacement keeps the
    /// name `column` regardless of the name of `new_col` (see [`DataFrame::apply`]).
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
    ///                         "Area (km²)" => [9_833_520, 9_596_961])?;
    /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
    ///
    /// assert!(df.replace("Nation", s.clone()).is_err());
    /// assert!(df.replace("Country", s).is_ok());
    /// # Ok::<(), PolarsError>(())
    /// ```
    pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
        self.apply(column, |_| new_col.into_series())
    }
2322
2323 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2324 /// is that now the value of `column: &str` determines the name of the column and not the name
2325 /// of the `Series` passed to this method.
2326 pub fn replace_or_add<S: IntoSeries>(
2327 &mut self,
2328 column: PlSmallStr,
2329 new_col: S,
2330 ) -> PolarsResult<&mut Self> {
2331 let mut new_col = new_col.into_series();
2332 new_col.rename(column);
2333 self.with_column(new_col)
2334 }
2335
2336 /// Replace column at index `idx` with a [`Series`].
2337 ///
2338 /// # Example
2339 ///
2340 /// ```ignored
2341 /// # use polars_core::prelude::*;
2342 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2343 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2344 /// let mut df = DataFrame::new(vec![s0, s1])?;
2345 ///
2346 /// // Add 32 to get lowercase ascii values
2347 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2348 /// # Ok::<(), PolarsError>(())
2349 /// ```
2350 pub fn replace_column<C: IntoColumn>(
2351 &mut self,
2352 index: usize,
2353 new_column: C,
2354 ) -> PolarsResult<&mut Self> {
2355 polars_ensure!(
2356 index < self.width(),
2357 ShapeMismatch:
2358 "unable to replace at index {}, the DataFrame has only {} columns",
2359 index, self.width(),
2360 );
2361 let mut new_column = new_column.into_column();
2362 polars_ensure!(
2363 new_column.len() == self.height(),
2364 ShapeMismatch:
2365 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2366 new_column.len(), self.height(),
2367 );
2368 let old_col = &mut self.columns[index];
2369 mem::swap(old_col, &mut new_column);
2370 self.clear_schema();
2371 Ok(self)
2372 }
2373
2374 /// Apply a closure to a column. This is the recommended way to do in place modification.
2375 ///
2376 /// # Example
2377 ///
2378 /// ```rust
2379 /// # use polars_core::prelude::*;
2380 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2381 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2382 /// let mut df = DataFrame::new(vec![s0, s1])?;
2383 ///
2384 /// fn str_to_len(str_val: &Column) -> Column {
2385 /// str_val.str()
2386 /// .unwrap()
2387 /// .into_iter()
2388 /// .map(|opt_name: Option<&str>| {
2389 /// opt_name.map(|name: &str| name.len() as u32)
2390 /// })
2391 /// .collect::<UInt32Chunked>()
2392 /// .into_column()
2393 /// }
2394 ///
2395 /// // Replace the names column by the length of the names.
2396 /// df.apply("names", str_to_len);
2397 /// # Ok::<(), PolarsError>(())
2398 /// ```
2399 /// Results in:
2400 ///
2401 /// ```text
2402 /// +--------+-------+
2403 /// | foo | |
2404 /// | --- | names |
2405 /// | str | u32 |
2406 /// +========+=======+
2407 /// | "ham" | 4 |
2408 /// +--------+-------+
2409 /// | "spam" | 6 |
2410 /// +--------+-------+
2411 /// | "egg" | 3 |
2412 /// +--------+-------+
2413 /// ```
2414 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2415 where
2416 F: FnOnce(&Column) -> C,
2417 C: IntoColumn,
2418 {
2419 let idx = self.check_name_to_idx(name)?;
2420 self.apply_at_idx(idx, f)
2421 }
2422
    /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
    /// modification.
    ///
    /// The closure's result may have either length 1 (it is broadcast to the
    /// frame's height) or exactly the frame's height; anything else is a
    /// `ShapeMismatch` error. The column keeps its original name.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
    /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
    /// let mut df = DataFrame::new(vec![s0, s1])?;
    ///
    /// // Add 32 to get lowercase ascii values
    /// df.apply_at_idx(1, |s| s + 32);
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +--------+-------+
    /// | foo    | ascii |
    /// | ---    | ---   |
    /// | str    | i32   |
    /// +========+=======+
    /// | "ham"  | 102   |
    /// +--------+-------+
    /// | "spam" | 111   |
    /// +--------+-------+
    /// | "egg"  | 111   |
    /// +--------+-------+
    /// ```
    pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> C,
        C: IntoColumn,
    {
        // Capture height/width before mutably borrowing `self.columns`.
        let df_height = self.height();
        let width = self.width();
        let col = self.columns.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;
        let name = col.name().clone();
        let new_col = f(col).into_column();
        match new_col.len() {
            1 => {
                // Length-1 result: broadcast to the frame's height.
                let new_col = new_col.new_from_index(0, df_height);
                let _ = mem::replace(col, new_col);
            },
            len if (len == df_height) => {
                let _ = mem::replace(col, new_col);
            },
            len => polars_bail!(
                ShapeMismatch:
                "resulting Series has length {} while the DataFrame has height {}",
                len, df_height
            ),
        }

        // make sure the name remains the same after applying the closure
        // SAFETY: `idx` was validated by the `get_mut` above.
        unsafe {
            let col = self.columns.get_unchecked_mut(idx);
            col.rename(name);
        }
        // NOTE(review): the closure may change the column's dtype (see the
        // example above) but the cached schema is not invalidated here —
        // confirm whether `clear_schema` is needed.
        Ok(self)
    }
2490
    /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
    /// modification.
    ///
    /// Unlike [`DataFrame::apply_at_idx`], the result is not broadcast and its
    /// length is not validated against the frame's height; the column keeps its
    /// original name.
    ///
    /// # Example
    ///
    /// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
    /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
    /// let mut df = DataFrame::new(vec![s0, s1])?;
    ///
    /// let idx = vec![0, 1, 4];
    ///
    /// df.try_apply("foo", |c| {
    ///     c.str()?
    ///         .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
    /// });
    /// # Ok::<(), PolarsError>(())
    /// ```
    /// Results in:
    ///
    /// ```text
    /// +---------------------+--------+
    /// | foo                 | values |
    /// | ---                 | ---    |
    /// | str                 | i32    |
    /// +=====================+========+
    /// | "ham-is-modified"   | 1      |
    /// +---------------------+--------+
    /// | "spam-is-modified"  | 2      |
    /// +---------------------+--------+
    /// | "egg"               | 3      |
    /// +---------------------+--------+
    /// | "bacon"             | 4      |
    /// +---------------------+--------+
    /// | "quack-is-modified" | 5      |
    /// +---------------------+--------+
    /// ```
    pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
    where
        F: FnOnce(&Column) -> PolarsResult<C>,
        C: IntoColumn,
    {
        // Capture width before mutably borrowing `self.columns`.
        let width = self.width();
        let col = self.columns.get_mut(idx).ok_or_else(|| {
            polars_err!(
                ComputeError: "invalid column index: {} for a DataFrame with {} columns",
                idx, width
            )
        })?;
        let name = col.name().clone();

        // Replace the column with the closure's result; on Err the column is untouched.
        let _ = mem::replace(col, f(col).map(|c| c.into_column())?);

        // make sure the name remains the same after applying the closure
        // SAFETY: `idx` was validated by the `get_mut` above.
        unsafe {
            let col = self.columns.get_unchecked_mut(idx);
            col.rename(name);
        }
        Ok(self)
    }
2554
2555 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2556 /// modification.
2557 ///
2558 /// # Example
2559 ///
2560 /// This is the idiomatic way to replace some values a column of a `DataFrame` given a boolean mask.
2561 ///
2562 /// ```rust
2563 /// # use polars_core::prelude::*;
2564 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2565 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2566 /// let mut df = DataFrame::new(vec![s0, s1])?;
2567 ///
2568 /// // create a mask
2569 /// let values = df.column("values")?.as_materialized_series();
2570 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2571 ///
2572 /// df.try_apply("foo", |c| {
2573 /// c.str()?
2574 /// .set(&mask, Some("not_within_bounds"))
2575 /// });
2576 /// # Ok::<(), PolarsError>(())
2577 /// ```
2578 /// Results in:
2579 ///
2580 /// ```text
2581 /// +---------------------+--------+
2582 /// | foo | values |
2583 /// | --- | --- |
2584 /// | str | i32 |
2585 /// +=====================+========+
2586 /// | "not_within_bounds" | 1 |
2587 /// +---------------------+--------+
2588 /// | "spam" | 2 |
2589 /// +---------------------+--------+
2590 /// | "egg" | 3 |
2591 /// +---------------------+--------+
2592 /// | "bacon" | 4 |
2593 /// +---------------------+--------+
2594 /// | "not_within_bounds" | 5 |
2595 /// +---------------------+--------+
2596 /// ```
2597 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2598 where
2599 F: FnOnce(&Series) -> PolarsResult<C>,
2600 C: IntoColumn,
2601 {
2602 let idx = self.try_get_column_index(column)?;
2603 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2604 }
2605
2606 /// Slice the [`DataFrame`] along the rows.
2607 ///
2608 /// # Example
2609 ///
2610 /// ```rust
2611 /// # use polars_core::prelude::*;
2612 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2613 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2614 /// let sl: DataFrame = df.slice(2, 3);
2615 ///
2616 /// assert_eq!(sl.shape(), (3, 2));
2617 /// println!("{}", sl);
2618 /// # Ok::<(), PolarsError>(())
2619 /// ```
2620 /// Output:
2621 /// ```text
2622 /// shape: (3, 2)
2623 /// +-------+-------+
2624 /// | Fruit | Color |
2625 /// | --- | --- |
2626 /// | str | str |
2627 /// +=======+=======+
2628 /// | Grape | White |
2629 /// +-------+-------+
2630 /// | Fig | White |
2631 /// +-------+-------+
2632 /// | Fig | Red |
2633 /// +-------+-------+
2634 /// ```
2635 #[must_use]
2636 pub fn slice(&self, offset: i64, length: usize) -> Self {
2637 if offset == 0 && length == self.height() {
2638 return self.clone();
2639 }
2640 if length == 0 {
2641 return self.clear();
2642 }
2643 let col = self
2644 .columns
2645 .iter()
2646 .map(|s| s.slice(offset, length))
2647 .collect::<Vec<_>>();
2648
2649 let height = if let Some(fst) = col.first() {
2650 fst.len()
2651 } else {
2652 let (_, length) = slice_offsets(offset, length, self.height());
2653 length
2654 };
2655
2656 unsafe { DataFrame::new_no_checks(height, col) }
2657 }
2658
2659 /// Split [`DataFrame`] at the given `offset`.
2660 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2661 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2662
2663 let (idx, _) = slice_offsets(offset, 0, self.height());
2664
2665 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2666 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2667 (a, b)
2668 }
2669
2670 pub fn clear(&self) -> Self {
2671 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2672 unsafe { DataFrame::new_no_checks(0, col) }
2673 }
2674
    /// Slice the [`DataFrame`] along the rows, slicing columns in parallel.
    #[must_use]
    pub fn slice_par(&self, offset: i64, length: usize) -> Self {
        // Identity slice: cheap clone.
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        // NOTE(review): the new height is taken to be `length` verbatim; if the
        // requested slice extends past the end of the frame the sliced columns
        // would be shorter than `length` — presumably callers only pass
        // in-bounds slices here. TODO confirm against call sites.
        let columns = self._apply_columns_par(&|s| s.slice(offset, length));
        unsafe { DataFrame::new_no_checks(length, columns) }
    }
2683
    /// Slice the [`DataFrame`] along the rows and shrink each sliced column's
    /// allocation to fit, releasing memory held by the original buffers.
    #[must_use]
    pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
        // Identity slice: cheap clone, no reallocation needed.
        if offset == 0 && length == self.height() {
            return self.clone();
        }
        // @scalar-opt
        let columns = self._apply_columns(&|s| {
            let mut out = s.slice(offset, length);
            out.shrink_to_fit();
            out
        });
        // NOTE(review): like `slice_par`, the height is `length` verbatim and
        // assumes the slice is in-bounds — TODO confirm.
        unsafe { DataFrame::new_no_checks(length, columns) }
    }
2697
2698 /// Get the head of the [`DataFrame`].
2699 ///
2700 /// # Example
2701 ///
2702 /// ```rust
2703 /// # use polars_core::prelude::*;
2704 /// let countries: DataFrame =
2705 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2706 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2707 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2708 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2709 /// assert_eq!(countries.shape(), (5, 4));
2710 ///
2711 /// println!("{}", countries.head(Some(3)));
2712 /// # Ok::<(), PolarsError>(())
2713 /// ```
2714 ///
2715 /// Output:
2716 ///
2717 /// ```text
2718 /// shape: (3, 4)
2719 /// +--------------------+---------------+---------------+------------+
2720 /// | Rank by GDP (2021) | Continent | Country | Capital |
2721 /// | --- | --- | --- | --- |
2722 /// | i32 | str | str | str |
2723 /// +====================+===============+===============+============+
2724 /// | 1 | North America | United States | Washington |
2725 /// +--------------------+---------------+---------------+------------+
2726 /// | 2 | Asia | China | Beijing |
2727 /// +--------------------+---------------+---------------+------------+
2728 /// | 3 | Asia | Japan | Tokyo |
2729 /// +--------------------+---------------+---------------+------------+
2730 /// ```
2731 #[must_use]
2732 pub fn head(&self, length: Option<usize>) -> Self {
2733 let col = self
2734 .columns
2735 .iter()
2736 .map(|c| c.head(length))
2737 .collect::<Vec<_>>();
2738
2739 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2740 let height = usize::min(height, self.height());
2741 unsafe { DataFrame::new_no_checks(height, col) }
2742 }
2743
    /// Get the tail of the [`DataFrame`].
    ///
    /// # Example
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let countries: DataFrame =
    ///     df!("Rank (2021)" => [105, 106, 107, 108, 109],
    ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
    ///         "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
    /// assert_eq!(countries.shape(), (5, 3));
    ///
    /// println!("{}", countries.tail(Some(2)));
    /// # Ok::<(), PolarsError>(())
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// shape: (2, 3)
    /// +-------------+--------------------+---------+
    /// | Rank (2021) | Apple Price (€/kg) | Country |
    /// | ---         | ---                | ---     |
    /// | i32         | f64                | str     |
    /// +=============+====================+=========+
    /// | 108         | 0.65               | Syria   |
    /// +-------------+--------------------+---------+
    /// | 109         | 0.52               | Turkey  |
    /// +-------------+--------------------+---------+
    /// ```
    #[must_use]
    pub fn tail(&self, length: Option<usize>) -> Self {
        let col = self
            .columns
            .iter()
            .map(|c| c.tail(length))
            .collect::<Vec<_>>();

        // Height is the requested (or default) length, clamped to the frame.
        let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
        let height = usize::min(height, self.height());
        unsafe { DataFrame::new_no_checks(height, col) }
    }
2786
    /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
    ///
    /// # Panics
    ///
    /// Panics if the [`DataFrame`] that is passed is not rechunked.
    ///
    /// This responsibility is left to the caller as we don't want to take mutable references here,
    /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
    /// as well.
    pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
        debug_assert!(!self.should_rechunk(), "expected equal chunks");
        // If any of the columns is binview and we don't convert `compat_level` we allow parallelism
        // as we must allocate arrow strings/binaries.
        // (i.e. parallelism is only kept when conversion to the oldest compat
        // level forces per-column allocation for String/Binary columns.)
        let must_convert = compat_level.0 == 0;
        let parallel = parallel
            && must_convert
            && self.columns.len() > 1
            && self
                .columns
                .iter()
                .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));

        RecordBatchIter {
            columns: &self.columns,
            // Arrow schema derived from each column's field at the requested compat level.
            schema: Arc::new(
                self.columns
                    .iter()
                    .map(|c| c.field().to_arrow(compat_level))
                    .collect(),
            ),
            idx: 0,
            n_chunks: self.first_col_n_chunks(),
            compat_level,
            parallel,
        }
    }
2823
2824 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2825 ///
2826 /// # Panics
2827 ///
2828 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2829 ///
2830 /// This responsibility is left to the caller as we don't want to take mutable references here,
2831 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2832 /// as well.
2833 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2834 PhysRecordBatchIter {
2835 schema: Arc::new(
2836 self.get_columns()
2837 .iter()
2838 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2839 .collect(),
2840 ),
2841 arr_iters: self
2842 .materialized_column_iter()
2843 .map(|s| s.chunks().iter())
2844 .collect(),
2845 }
2846 }
2847
2848 /// Get a [`DataFrame`] with all the columns in reversed order.
2849 #[must_use]
2850 pub fn reverse(&self) -> Self {
2851 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2852 unsafe { DataFrame::new_no_checks(self.height(), col) }
2853 }
2854
2855 /// Shift the values by a given period and fill the parts that will be empty due to this operation
2856 /// with `Nones`.
2857 ///
2858 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
2859 #[must_use]
2860 pub fn shift(&self, periods: i64) -> Self {
2861 let col = self._apply_columns_par(&|s| s.shift(periods));
2862 unsafe { DataFrame::new_no_checks(self.height(), col) }
2863 }
2864
2865 /// Replace None values with one of the following strategies:
2866 /// * Forward fill (replace None with the previous value)
2867 /// * Backward fill (replace None with the next value)
2868 /// * Mean fill (replace None with the mean of the whole array)
2869 /// * Min fill (replace None with the minimum of the whole array)
2870 /// * Max fill (replace None with the maximum of the whole array)
2871 ///
2872 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
2873 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2874 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2875
2876 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2877 }
2878
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Consumes `self` and returns whatever `f` produces, enabling chained
    /// transformations: `df.pipe(step_a)?.pipe(step_b)?`.
    pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
    where
        F: Fn(DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }
2886
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Variant of [`DataFrame::pipe`] that passes a mutable reference instead
    /// of consuming the frame.
    pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
    where
        F: Fn(&mut DataFrame) -> PolarsResult<B>,
    {
        f(self)
    }
2894
    /// Pipe different functions/ closure operations that work on a DataFrame together.
    ///
    /// Variant of [`DataFrame::pipe`] that forwards an extra `args` value to `f`.
    pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
    where
        F: Fn(DataFrame, Args) -> PolarsResult<B>,
    {
        f(self, args)
    }
2902
2903 /// Drop duplicate rows from a [`DataFrame`].
2904 /// *This fails when there is a column of type List in DataFrame*
2905 ///
2906 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2907 ///
2908 /// # Example
2909 ///
2910 /// ```no_run
2911 /// # use polars_core::prelude::*;
2912 /// let df = df! {
2913 /// "flt" => [1., 1., 2., 2., 3., 3.],
2914 /// "int" => [1, 1, 2, 2, 3, 3, ],
2915 /// "str" => ["a", "a", "b", "b", "c", "c"]
2916 /// }?;
2917 ///
2918 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2919 /// # Ok::<(), PolarsError>(())
2920 /// ```
2921 /// Returns
2922 ///
2923 /// ```text
2924 /// +-----+-----+-----+
2925 /// | flt | int | str |
2926 /// | --- | --- | --- |
2927 /// | f64 | i32 | str |
2928 /// +=====+=====+=====+
2929 /// | 1 | 1 | "a" |
2930 /// +-----+-----+-----+
2931 /// | 2 | 2 | "b" |
2932 /// +-----+-----+-----+
2933 /// | 3 | 3 | "c" |
2934 /// +-----+-----+-----+
2935 /// ```
2936 #[cfg(feature = "algorithm_group_by")]
2937 pub fn unique_stable(
2938 &self,
2939 subset: Option<&[String]>,
2940 keep: UniqueKeepStrategy,
2941 slice: Option<(i64, usize)>,
2942 ) -> PolarsResult<DataFrame> {
2943 self.unique_impl(
2944 true,
2945 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2946 keep,
2947 slice,
2948 )
2949 }
2950
    /// Unstable distinct. See [`DataFrame::unique_stable`].
    ///
    /// "Unstable" means the output row order is not guaranteed, which allows
    /// a cheaper group-by than the order-preserving variant.
    ///
    /// # Arguments
    /// * `subset` - Column names that define row identity; all columns when `None`.
    /// * `keep` - Which of the duplicated rows to keep.
    /// * `slice` - Optional `(offset, len)` applied to the distinct result.
    ///
    /// NOTE(review): the generic parameters `I` and `S` are never used in the
    /// signature or body and look vestigial; removing them would break callers
    /// that spell them out explicitly — TODO confirm before cleaning up.
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique<I, S>(
        &self,
        subset: Option<&[String]>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<DataFrame> {
        self.unique_impl(
            false,
            subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
            keep,
            slice,
        )
    }
2966
    /// Shared implementation behind [`DataFrame::unique`] and
    /// [`DataFrame::unique_stable`].
    ///
    /// # Arguments
    /// * `maintain_order` - When `true`, preserve the first-seen order of rows.
    /// * `subset` - Columns defining row identity; all columns when `None`.
    /// * `keep` - Which duplicate to keep (`First`/`Last`/`Any`) or drop every
    ///   duplicated row (`None`).
    /// * `slice` - Optional `(offset, len)` applied to the groups (or mask)
    ///   before gathering.
    #[cfg(feature = "algorithm_group_by")]
    pub fn unique_impl(
        &self,
        maintain_order: bool,
        subset: Option<Vec<PlSmallStr>>,
        keep: UniqueKeepStrategy,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        let names = subset.unwrap_or_else(|| self.get_column_names_owned());
        let mut df = self.clone();
        // take on multiple chunks is terrible
        df.as_single_chunk_par();

        let columns = match (keep, maintain_order) {
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
                // Stable group-by, then take the first row of each group.
                let gb = df.group_by_stable(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, true) => {
                // maintain order by last values, so the sorted groups are not correct as they
                // are sorted by the first value
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();

                // Index of the last row of each group, for both group layouts.
                let func = |g: GroupsIndicator| match g {
                    GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
                    GroupsIndicator::Slice([first, len]) => first + len - 1,
                };

                let last_idx: NoNull<IdxCa> = match slice {
                    None => groups.iter().map(func).collect(),
                    Some((offset, len)) => {
                        let (offset, len) = slice_offsets(offset, len, groups.len());
                        groups.iter().skip(offset).take(len).map(func).collect()
                    },
                };

                // Sort the gathered indices so the output follows input order.
                let last_idx = last_idx.sort(false);
                return Ok(unsafe { df.take_unchecked(&last_idx) });
            },
            (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
            },
            (UniqueKeepStrategy::Last, false) => {
                let gb = df.group_by(names)?;
                let groups = gb.get_groups();
                let (offset, len) = slice.unwrap_or((0, groups.len()));
                let groups = groups.slice(offset, len);
                df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
            },
            (UniqueKeepStrategy::None, _) => {
                // Keep only rows that occur exactly once in the subset columns.
                let df_part = df.select(names)?;
                let mask = df_part.is_unique()?;
                let mask = match slice {
                    None => mask,
                    Some((offset, len)) => mask.slice(offset, len),
                };
                return df.filter(&mask);
            },
        };

        let height = Self::infer_height(&columns);
        Ok(unsafe { DataFrame::new_no_checks(height, columns) })
    }
3038
3039 /// Get a mask of all the unique rows in the [`DataFrame`].
3040 ///
3041 /// # Example
3042 ///
3043 /// ```no_run
3044 /// # use polars_core::prelude::*;
3045 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3046 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3047 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3048 ///
3049 /// assert!(ca.all());
3050 /// # Ok::<(), PolarsError>(())
3051 /// ```
3052 #[cfg(feature = "algorithm_group_by")]
3053 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3054 let gb = self.group_by(self.get_column_names_owned())?;
3055 let groups = gb.get_groups();
3056 Ok(is_unique_helper(
3057 groups,
3058 self.height() as IdxSize,
3059 true,
3060 false,
3061 ))
3062 }
3063
3064 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3065 ///
3066 /// # Example
3067 ///
3068 /// ```no_run
3069 /// # use polars_core::prelude::*;
3070 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3071 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3072 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3073 ///
3074 /// assert!(!ca.all());
3075 /// # Ok::<(), PolarsError>(())
3076 /// ```
3077 #[cfg(feature = "algorithm_group_by")]
3078 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3079 let gb = self.group_by(self.get_column_names_owned())?;
3080 let groups = gb.get_groups();
3081 Ok(is_unique_helper(
3082 groups,
3083 self.height() as IdxSize,
3084 false,
3085 true,
3086 ))
3087 }
3088
3089 /// Create a new [`DataFrame`] that shows the null counts per column.
3090 #[must_use]
3091 pub fn null_count(&self) -> Self {
3092 let cols = self
3093 .columns
3094 .iter()
3095 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3096 .collect();
3097 unsafe { Self::new_no_checks(1, cols) }
3098 }
3099
3100 /// Hash and combine the row values
3101 #[cfg(feature = "row_hash")]
3102 pub fn hash_rows(
3103 &mut self,
3104 hasher_builder: Option<PlRandomState>,
3105 ) -> PolarsResult<UInt64Chunked> {
3106 let dfs = split_df(self, POOL.current_num_threads(), false);
3107 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3108
3109 let mut iter = cas.into_iter();
3110 let mut acc_ca = iter.next().unwrap();
3111 for ca in iter {
3112 acc_ca.append(&ca)?;
3113 }
3114 Ok(acc_ca.rechunk().into_owned())
3115 }
3116
3117 /// Get the supertype of the columns in this DataFrame
3118 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3119 self.columns
3120 .iter()
3121 .map(|s| Ok(s.dtype().clone()))
3122 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3123 }
3124
    /// Take by index values given by the slice `idx`.
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop
    /// every thread split may be on rayon stack and lead to SO
    ///
    /// # Safety
    /// Every index in `idx` must be in bounds for this DataFrame's height.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        // Delegates with `IsSorted::Not`: no sortedness information is assumed.
        self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
    }
3133
    /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
    /// if the index value in `idx` are sorted. This will maintain sorted flags.
    ///
    /// # Warning
    /// Be careful with allowing threads when calling this in a large hot loop
    /// every thread split may be on rayon stack and lead to SO
    ///
    /// # Safety
    /// Every index in `idx` must be in bounds for this DataFrame's height, and
    /// `sorted` must truthfully describe the ordering of `idx`.
    #[doc(hidden)]
    pub unsafe fn _take_unchecked_slice_sorted(
        &self,
        idx: &[IdxSize],
        allow_threads: bool,
        sorted: IsSorted,
    ) -> Self {
        #[cfg(debug_assertions)]
        {
            // Cheap sanity check in debug builds: the claimed ordering must at
            // least hold between the first and last index.
            if idx.len() > 2 {
                match sorted {
                    IsSorted::Ascending => {
                        assert!(idx[0] <= idx[idx.len() - 1]);
                    },
                    IsSorted::Descending => {
                        assert!(idx[0] >= idx[idx.len() - 1]);
                    },
                    _ => {},
                }
            }
        }
        // Wrap the borrowed slice as an index array without copying it.
        let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
        // Propagate the sortedness flag so downstream can keep sorted metadata.
        ca.set_sorted_flag(sorted);
        self.take_unchecked_impl(&ca, allow_threads)
    }
3165
    /// Shared implementation behind [`DataFrame::partition_by`] and
    /// [`DataFrame::partition_by_stable`].
    ///
    /// # Arguments
    /// * `cols` - Key columns that define the partitions.
    /// * `stable` - Preserve the first-seen order of the groups.
    /// * `include_key` - Keep the key columns in the returned frames.
    #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
    #[doc(hidden)]
    pub fn _partition_by_impl(
        &self,
        cols: &[PlSmallStr],
        stable: bool,
        include_key: bool,
    ) -> PolarsResult<Vec<DataFrame>> {
        let groups = if stable {
            self.group_by_stable(cols.iter().cloned())?.take_groups()
        } else {
            self.group_by(cols.iter().cloned())?.take_groups()
        };

        // drop key columns prior to calculation if requested
        let df = if include_key {
            self.clone()
        } else {
            self.drop_many(cols.iter().cloned())
        };

        // don't parallelize this
        // there is a lot of parallelization in take and this may easily SO
        POOL.install(|| {
            match groups.as_ref() {
                GroupsType::Idx(idx) => {
                    // Rechunk as the gather may rechunk for every group #17562.
                    let mut df = df.clone();
                    df.as_single_chunk_par();
                    Ok(idx
                        .into_par_iter()
                        .map(|(_, group)| {
                            // groups are in bounds
                            unsafe {
                                df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
                            }
                        })
                        .collect())
                },
                // Slice groups are contiguous runs, so a zero-copy slice suffices.
                GroupsType::Slice { groups, .. } => Ok(groups
                    .into_par_iter()
                    .map(|[first, len]| df.slice(*first as i64, *len as usize))
                    .collect()),
            }
        })
    }
3212
3213 /// Split into multiple DataFrames partitioned by groups
3214 #[cfg(feature = "partition_by")]
3215 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3216 where
3217 I: IntoIterator<Item = S>,
3218 S: Into<PlSmallStr>,
3219 {
3220 let cols = cols
3221 .into_iter()
3222 .map(Into::into)
3223 .collect::<Vec<PlSmallStr>>();
3224 self._partition_by_impl(cols.as_slice(), false, include_key)
3225 }
3226
3227 /// Split into multiple DataFrames partitioned by groups
3228 /// Order of the groups are maintained.
3229 #[cfg(feature = "partition_by")]
3230 pub fn partition_by_stable<I, S>(
3231 &self,
3232 cols: I,
3233 include_key: bool,
3234 ) -> PolarsResult<Vec<DataFrame>>
3235 where
3236 I: IntoIterator<Item = S>,
3237 S: Into<PlSmallStr>,
3238 {
3239 let cols = cols
3240 .into_iter()
3241 .map(Into::into)
3242 .collect::<Vec<PlSmallStr>>();
3243 self._partition_by_impl(cols.as_slice(), true, include_key)
3244 }
3245
3246 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3247 /// inserted as columns.
3248 #[cfg(feature = "dtype-struct")]
3249 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3250 let cols = cols.into_vec();
3251 self.unnest_impl(cols.into_iter().collect())
3252 }
3253
3254 #[cfg(feature = "dtype-struct")]
3255 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3256 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3257 let mut count = 0;
3258 for s in &self.columns {
3259 if cols.contains(s.name()) {
3260 let ca = s.struct_()?.clone();
3261 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3262 count += 1;
3263 } else {
3264 new_cols.push(s.clone())
3265 }
3266 }
3267 if count != cols.len() {
3268 // one or more columns not found
3269 // the code below will return an error with the missing name
3270 let schema = self.schema();
3271 for col in cols {
3272 let _ = schema
3273 .get(col.as_str())
3274 .ok_or_else(|| polars_err!(col_not_found = col))?;
3275 }
3276 }
3277 DataFrame::new(new_cols)
3278 }
3279
3280 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3281 cols.first().map_or(0, Column::len)
3282 }
3283
3284 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3285 // @Optimize: this does a lot of unnecessary allocations. We should probably have a
3286 // append_chunk or something like this. It is just quite difficult to make that safe.
3287 let df = DataFrame::from(rb);
3288 polars_ensure!(
3289 self.schema() == df.schema(),
3290 SchemaMismatch: "cannot append record batch with different schema",
3291 );
3292 self.vstack_mut_owned_unchecked(df);
3293 Ok(())
3294 }
3295}
3296
/// Iterator that converts a [`DataFrame`]'s chunks into Arrow [`RecordBatch`]es,
/// one batch per chunk index.
pub struct RecordBatchIter<'a> {
    // Borrowed columns of the source frame.
    columns: &'a Vec<Column>,
    // Arrow schema shared by every emitted batch.
    schema: ArrowSchemaRef,
    // Next chunk index to emit.
    idx: usize,
    // Total number of chunks; iteration stops when `idx` reaches this.
    n_chunks: usize,
    // Arrow compatibility level used for the conversion.
    compat_level: CompatLevel,
    // When `true`, convert the columns in parallel on the thread pool.
    parallel: bool,
}
3305
3306impl Iterator for RecordBatchIter<'_> {
3307 type Item = RecordBatch;
3308
3309 fn next(&mut self) -> Option<Self::Item> {
3310 if self.idx >= self.n_chunks {
3311 return None;
3312 }
3313
3314 // Create a batch of the columns with the same chunk no.
3315 let batch_cols: Vec<ArrayRef> = if self.parallel {
3316 let iter = self
3317 .columns
3318 .par_iter()
3319 .map(Column::as_materialized_series)
3320 .map(|s| s.to_arrow(self.idx, self.compat_level));
3321 POOL.install(|| iter.collect())
3322 } else {
3323 self.columns
3324 .iter()
3325 .map(Column::as_materialized_series)
3326 .map(|s| s.to_arrow(self.idx, self.compat_level))
3327 .collect()
3328 };
3329 self.idx += 1;
3330
3331 let length = batch_cols.first().map_or(0, |arr| arr.len());
3332 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3333 }
3334
3335 fn size_hint(&self) -> (usize, Option<usize>) {
3336 let n = self.n_chunks - self.idx;
3337 (n, Some(n))
3338 }
3339}
3340
/// Iterator over the physical (already materialized) chunks of a set of
/// columns, yielding one [`RecordBatch`] per chunk without conversion.
pub struct PhysRecordBatchIter<'a> {
    // Arrow schema shared by every emitted batch.
    schema: ArrowSchemaRef,
    // One chunk iterator per column, advanced in lockstep.
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}
3345
3346impl Iterator for PhysRecordBatchIter<'_> {
3347 type Item = RecordBatch;
3348
3349 fn next(&mut self) -> Option<Self::Item> {
3350 let arrs = self
3351 .arr_iters
3352 .iter_mut()
3353 .map(|phys_iter| phys_iter.next().cloned())
3354 .collect::<Option<Vec<_>>>()?;
3355
3356 let length = arrs.first().map_or(0, |arr| arr.len());
3357 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3358 }
3359
3360 fn size_hint(&self) -> (usize, Option<usize>) {
3361 if let Some(iter) = self.arr_iters.first() {
3362 iter.size_hint()
3363 } else {
3364 (0, None)
3365 }
3366 }
3367}
3368
impl Default for DataFrame {
    /// Returns an empty [`DataFrame`] with no columns.
    fn default() -> Self {
        DataFrame::empty()
    }
}
3374
impl From<DataFrame> for Vec<Column> {
    /// Consumes the [`DataFrame`] and returns its columns without cloning.
    fn from(df: DataFrame) -> Self {
        df.columns
    }
}
3380
3381// utility to test if we can vstack/extend the columns
3382fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3383 polars_ensure!(
3384 left.name() == right.name(),
3385 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3386 left.name(), right.name(),
3387 );
3388 Ok(())
3389}
3390
#[cfg(test)]
mod test {
    use super::*;

    /// Build a small two-column frame reused by several tests below.
    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![s0, s1]).unwrap()
    }

    // A single-chunk frame should yield exactly one record batch.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    // Filtering a one-row string column down to zero rows must still leave a
    // single (empty) chunk.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s)?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    // `unique_stable` should drop exact duplicate rows and keep one of each.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunks
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }
}