polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::itertools::Itertools;
10use rayon::prelude::*;
11
12use crate::chunked_array::flags::StatisticsFlags;
13#[cfg(feature = "algorithm_group_by")]
14use crate::chunked_array::ops::unique::is_unique_helper;
15use crate::prelude::*;
16#[cfg(feature = "row_hash")]
17use crate::utils::split_df;
18use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
19use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
20
21#[cfg(feature = "dataframe_arithmetic")]
22mod arithmetic;
23pub mod builder;
24mod chunks;
25pub use chunks::chunk_df_for_writing;
26pub mod column;
27pub mod explode;
28mod from;
29#[cfg(feature = "algorithm_group_by")]
30pub mod group_by;
31pub(crate) mod horizontal;
32#[cfg(any(feature = "rows", feature = "object"))]
33pub mod row;
34mod top_k;
35mod upstream_traits;
36mod validation;
37
38use arrow::record_batch::{RecordBatch, RecordBatchT};
39use polars_utils::pl_str::PlSmallStr;
40#[cfg(feature = "serde")]
41use serde::{Deserialize, Serialize};
42use strum_macros::IntoStaticStr;
43
44use crate::POOL;
45#[cfg(feature = "row_hash")]
46use crate::hashing::_df_rows_to_hashes_threaded_vertical;
47use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
48use crate::series::IsSorted;
49
50#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
51#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
52#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
53#[strum(serialize_all = "snake_case")]
54pub enum UniqueKeepStrategy {
55 /// Keep the first unique row.
56 First,
57 /// Keep the last unique row.
58 Last,
    /// Keep none of the unique rows.
    None,
    /// Keep any of the unique rows.
    /// This allows more optimizations.
63 #[default]
64 Any,
65}
66
67fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
68where
69 F: for<'a> FnMut(&'a T) -> &'a str,
70{
71 // Always unique.
72 if items.len() <= 1 {
73 return Ok(());
74 }
75
76 if items.len() <= 4 {
        // Too small to be worth spawning a hashmap for; this is at most 6 comparisons.
78 for i in 0..items.len() - 1 {
79 let name = get_name(&items[i]);
80 for other in items.iter().skip(i + 1) {
81 if name == get_name(other) {
82 polars_bail!(duplicate = name);
83 }
84 }
85 }
86 } else {
87 let mut names = PlHashSet::with_capacity(items.len());
88 for item in items {
89 let name = get_name(item);
90 if !names.insert(name) {
91 polars_bail!(duplicate = name);
92 }
93 }
94 }
95 Ok(())
96}
97
98/// A contiguous growable collection of `Series` that have the same length.
99///
100/// ## Use declarations
101///
102/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
103///
104/// ```rust
105/// use polars_core::prelude::*; // if the crate polars-core is used directly
106/// // use polars::prelude::*; if the crate polars is used
107/// ```
108///
109/// # Initialization
110/// ## Default
111///
112/// A `DataFrame` can be initialized empty:
113///
114/// ```rust
115/// # use polars_core::prelude::*;
116/// let df = DataFrame::default();
117/// assert!(df.is_empty());
118/// ```
119///
/// ## Wrapping a `Vec<Column>`
///
/// A `DataFrame` is built upon a `Vec<Column>` in which all columns have the same length.
123///
124/// ```rust
125/// # use polars_core::prelude::*;
126/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
127/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
128///
129/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
130/// ```
131///
132/// ## Using a macro
133///
134/// The [`df!`] macro is a convenient method:
135///
136/// ```rust
137/// # use polars_core::prelude::*;
138/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
139/// "Color" => ["Red", "Yellow", "Green"]);
140/// ```
141///
142/// ## Using a CSV file
143///
144/// See the `polars_io::csv::CsvReader`.
145///
146/// # Indexing
147/// ## By a number
148///
/// The `Index<usize>` trait is implemented for `DataFrame`.
150///
151/// ```rust
152/// # use polars_core::prelude::*;
153/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
154/// "Color" => ["Red", "Yellow", "Green"])?;
155///
156/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
157/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
158/// # Ok::<(), PolarsError>(())
159/// ```
160///
161/// ## By a `Series` name
162///
163/// ```rust
164/// # use polars_core::prelude::*;
165/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
166/// "Color" => ["Red", "Yellow", "Green"])?;
167///
168/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
169/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
170/// # Ok::<(), PolarsError>(())
171/// ```
172#[derive(Clone)]
173pub struct DataFrame {
174 height: usize,
    // invariant: columns[i].len() == height for each 0 <= i < columns.len()
176 pub(crate) columns: Vec<Column>,
177
    /// A cached schema. This might not give correct results if the DataFrame was modified in place
    /// between caching the schema and reading it.
180 cached_schema: OnceLock<SchemaRef>,
181}
182
183impl DataFrame {
184 pub fn clear_schema(&mut self) {
185 self.cached_schema = OnceLock::new();
186 }
187
188 #[inline]
189 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
190 self.columns.iter()
191 }
192
193 #[inline]
194 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
195 self.columns.iter().map(Column::as_materialized_series)
196 }
197
198 #[inline]
199 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
200 self.columns.par_iter().map(Column::as_materialized_series)
201 }
202
203 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
204 ///
205 /// # Implementation
    /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
    /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not necessarily the
    /// sum of their individually computed sizes. In particular, a [`StructArray`]'s size is an upper bound.
    ///
    /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
211 /// However, this function will yield a smaller number. This is because this function returns
212 /// the visible size of the buffer, not its total capacity.
213 ///
214 /// FFI buffers are included in this estimation.
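    ///
    /// A minimal usage sketch; the exact value depends on the backing buffers, so only a
    /// non-zero size is asserted here:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1i32, 2, 3])?;
    /// // Three i32 values are backed by at least one allocated buffer.
    /// assert!(df.estimated_size() > 0);
    /// # Ok::<(), PolarsError>(())
    /// ```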
215 pub fn estimated_size(&self) -> usize {
216 self.columns.iter().map(Column::estimated_size).sum()
217 }
218
219 // Reduce monomorphization.
220 fn try_apply_columns(
221 &self,
222 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
223 ) -> PolarsResult<Vec<Column>> {
224 self.columns.iter().map(func).collect()
225 }
226 // Reduce monomorphization.
227 pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
228 self.columns.iter().map(func).collect()
229 }
230 // Reduce monomorphization.
231 fn try_apply_columns_par(
232 &self,
233 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
234 ) -> PolarsResult<Vec<Column>> {
235 POOL.install(|| self.columns.par_iter().map(func).collect())
236 }
237 // Reduce monomorphization.
238 pub fn _apply_columns_par(
239 &self,
240 func: &(dyn Fn(&Column) -> Column + Send + Sync),
241 ) -> Vec<Column> {
242 POOL.install(|| self.columns.par_iter().map(func).collect())
243 }
244
245 /// Get the index of the column.
246 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
247 self.get_column_index(name)
248 .ok_or_else(|| polars_err!(col_not_found = name))
249 }
250
251 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
252 polars_ensure!(
253 self.columns.iter().all(|s| s.name().as_str() != name),
254 Duplicate: "column with name {:?} is already present in the DataFrame", name
255 );
256 Ok(())
257 }
258
    /// Reserve additional slots in the chunks of the series.
260 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
261 for s in &mut self.columns {
262 if let Column::Series(s) = s {
263 // SAFETY:
264 // do not modify the data, simply resize.
265 unsafe { s.chunks_mut().reserve(additional) }
266 }
267 }
268 }
269
    /// Create a DataFrame from a vector of columns.
    ///
    /// Errors if the column names are not unique, or if the heights are not all equal.
273 ///
274 /// # Example
275 ///
276 /// ```
277 /// # use polars_core::prelude::*;
278 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
279 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
280 ///
281 /// let df = DataFrame::new(vec![s0, s1])?;
282 /// # Ok::<(), PolarsError>(())
283 /// ```
284 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
285 DataFrame::validate_columns_slice(&columns)
286 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
287 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
288 }
289
290 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
291 for col in &columns {
292 polars_ensure!(
293 col.len() == height,
294 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
295 columns[0].name(), height, col.name(), col.len()
296 );
297 }
298
299 ensure_names_unique(&columns, |s| s.name().as_str())?;
300
301 Ok(DataFrame {
302 height,
303 columns,
304 cached_schema: OnceLock::new(),
305 })
306 }
307
308 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
309 /// columns to match the other columns.
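    ///
    /// A small sketch of the broadcasting behaviour (column names are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let id = Column::new("id".into(), [1, 2, 3]);
    /// // A length-1 column is broadcast to the length of the other columns.
    /// let flag = Column::new("flag".into(), [true]);
    /// let df = DataFrame::new_with_broadcast(vec![id, flag])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```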
310 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
311 // The length of the longest non-unit length column determines the
312 // broadcast length. If all columns are unit-length the broadcast length
313 // is one.
314 let broadcast_len = columns
315 .iter()
316 .map(|s| s.len())
317 .filter(|l| *l != 1)
318 .max()
319 .unwrap_or(1);
320 Self::new_with_broadcast_len(columns, broadcast_len)
321 }
322
323 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
324 /// columns to broadcast_len.
325 pub fn new_with_broadcast_len(
326 columns: Vec<Column>,
327 broadcast_len: usize,
328 ) -> PolarsResult<Self> {
329 ensure_names_unique(&columns, |s| s.name().as_str())?;
330 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
331 }
332
333 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
334 /// columns to match the other columns.
335 ///
336 /// # Safety
337 /// Does not check that the column names are unique (which they must be).
338 pub unsafe fn new_with_broadcast_no_namecheck(
339 mut columns: Vec<Column>,
340 broadcast_len: usize,
341 ) -> PolarsResult<Self> {
342 for col in &mut columns {
            // A length not equal to the broadcast length needs broadcasting (from length 1) or is an error.
344 let len = col.len();
345 if len != broadcast_len {
346 if len != 1 {
347 let name = col.name().to_owned();
348 let extra_info =
349 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
350 format!(" (matching column '{}')", c.name())
351 } else {
352 String::new()
353 };
354 polars_bail!(
355 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
356 );
357 }
358 *col = col.new_from_index(0, broadcast_len);
359 }
360 }
361
362 let length = if columns.is_empty() { 0 } else { broadcast_len };
363
364 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
365 }
366
367 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
368 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
369 unsafe { Self::new_no_checks(height, cols.collect()) }
370 }
371
372 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
373 ///
374 /// # Example
375 ///
376 /// ```rust
377 /// use polars_core::prelude::DataFrame;
378 /// static EMPTY: DataFrame = DataFrame::empty();
379 /// ```
380 pub const fn empty() -> Self {
381 Self::empty_with_height(0)
382 }
383
384 /// Creates an empty `DataFrame` with a specific `height`.
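    ///
    /// A minimal sketch: the resulting frame has rows but no columns.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = DataFrame::empty_with_height(5);
    /// assert_eq!(df.shape(), (5, 0));
    /// ```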
385 pub const fn empty_with_height(height: usize) -> Self {
386 DataFrame {
387 height,
388 columns: vec![],
389 cached_schema: OnceLock::new(),
390 }
391 }
392
393 /// Create an empty `DataFrame` with empty columns as per the `schema`.
394 pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
395 let mut df = Self::empty_with_schema(&schema);
396 df.cached_schema = OnceLock::from(schema);
397 df
398 }
399
400 /// Create an empty `DataFrame` with empty columns as per the `schema`.
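    ///
    /// A minimal sketch (the field name and dtype are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int64)]);
    /// let df = DataFrame::empty_with_schema(&schema);
    /// assert_eq!(df.shape(), (0, 1));
    /// assert_eq!(df.column("x")?.dtype(), &DataType::Int64);
    /// # Ok::<(), PolarsError>(())
    /// ```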
401 pub fn empty_with_schema(schema: &Schema) -> Self {
402 let cols = schema
403 .iter()
404 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
405 .collect();
406 unsafe { DataFrame::new_no_checks(0, cols) }
407 }
408
409 /// Create an empty `DataFrame` with empty columns as per the `schema`.
410 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
411 let cols = schema
412 .iter_values()
413 .map(|fld| {
414 Column::from(Series::new_empty(
415 fld.name.clone(),
416 &(DataType::from_arrow_field(fld)),
417 ))
418 })
419 .collect();
420 unsafe { DataFrame::new_no_checks(0, cols) }
421 }
422
423 /// Create a new `DataFrame` with the given schema, only containing nulls.
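    ///
    /// A minimal sketch: every value in the resulting frame is null.
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
    /// let df = DataFrame::full_null(&schema, 2);
    /// assert_eq!(df.shape(), (2, 1));
    /// assert_eq!(df.column("x")?.null_count(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```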
424 pub fn full_null(schema: &Schema, height: usize) -> Self {
425 let columns = schema
426 .iter_fields()
427 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
428 .collect();
429 unsafe { DataFrame::new_no_checks(height, columns) }
430 }
431
432 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
433 ///
434 /// # Example
435 ///
436 /// ```rust
437 /// # use polars_core::prelude::*;
438 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
439 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
440 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
441 ///
442 /// assert_eq!(df.pop(), Some(s2));
443 /// assert_eq!(df.pop(), Some(s1));
444 /// assert_eq!(df.pop(), None);
445 /// assert!(df.is_empty());
446 /// # Ok::<(), PolarsError>(())
447 /// ```
448 pub fn pop(&mut self) -> Option<Column> {
449 self.clear_schema();
450
451 self.columns.pop()
452 }
453
454 /// Add a new column at index 0 that counts the rows.
455 ///
456 /// # Example
457 ///
458 /// ```
459 /// # use polars_core::prelude::*;
460 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
461 /// assert_eq!(df1.shape(), (4, 1));
462 ///
463 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
464 /// assert_eq!(df2.shape(), (4, 2));
465 /// println!("{}", df2);
466 ///
467 /// # Ok::<(), PolarsError>(())
468 /// ```
469 ///
470 /// Output:
471 ///
472 /// ```text
473 /// shape: (4, 2)
474 /// +-----+----------+
475 /// | Id | Name |
476 /// | --- | --- |
477 /// | u32 | str |
478 /// +=====+==========+
479 /// | 0 | James |
480 /// +-----+----------+
481 /// | 1 | Mary |
482 /// +-----+----------+
483 /// | 2 | John |
484 /// +-----+----------+
485 /// | 3 | Patricia |
486 /// +-----+----------+
487 /// ```
488 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
489 let mut columns = Vec::with_capacity(self.columns.len() + 1);
490 let offset = offset.unwrap_or(0);
491
492 let col = Column::new_row_index(name, offset, self.height())?;
493 columns.push(col);
494 columns.extend_from_slice(&self.columns);
495 DataFrame::new(columns)
496 }
497
498 /// Add a row index column in place.
499 ///
500 /// # Safety
501 /// The caller should ensure the DataFrame does not already contain a column with the given name.
502 ///
503 /// # Panics
504 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
505 pub unsafe fn with_row_index_mut(
506 &mut self,
507 name: PlSmallStr,
508 offset: Option<IdxSize>,
509 ) -> &mut Self {
510 // TODO: Make this function unsafe
511 debug_assert!(
512 self.columns.iter().all(|c| c.name() != &name),
513 "with_row_index_mut(): column with name {} already exists",
514 &name
515 );
516
517 let offset = offset.unwrap_or(0);
518 let col = Column::new_row_index(name, offset, self.height()).unwrap();
519
520 self.clear_schema();
521 self.columns.insert(0, col);
522 self
523 }
524
    /// Creates a new `DataFrame` without checking the length or duplicate occurrence of the
    /// `Series`.
527 ///
528 /// Calculates the height from the first column or `0` if no columns are given.
529 ///
530 /// # Safety
531 ///
    /// It is the caller's responsibility to uphold the contract that all `Series`
    /// have an equal length and a unique name; if not, this may panic down the line.
534 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
535 let height = columns.first().map_or(0, Column::len);
536 unsafe { Self::new_no_checks(height, columns) }
537 }
538
    /// Creates a new `DataFrame` without checking the length or duplicate occurrence of the
    /// `Series`.
541 ///
542 /// It is advised to use [DataFrame::new] in favor of this method.
543 ///
544 /// # Safety
545 ///
    /// It is the caller's responsibility to uphold the contract that all `Series`
    /// have an equal length and a unique name; if not, this may panic down the line.
548 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
549 if cfg!(debug_assertions) {
550 DataFrame::validate_columns_slice(&columns).unwrap();
551 }
552
553 unsafe { Self::_new_no_checks_impl(height, columns) }
554 }
555
556 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
557 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
558 /// constructed with this method is generally highly unsafe and should not be long-lived.
559 #[allow(clippy::missing_safety_doc)]
560 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
561 DataFrame {
562 height,
563 columns,
564 cached_schema: OnceLock::new(),
565 }
566 }
567
568 /// Shrink the capacity of this DataFrame to fit its length.
569 pub fn shrink_to_fit(&mut self) {
570 // Don't parallelize this. Memory overhead
571 for s in &mut self.columns {
572 s.shrink_to_fit();
573 }
574 }
575
576 /// Aggregate all the chunks in the DataFrame to a single chunk.
577 pub fn as_single_chunk(&mut self) -> &mut Self {
578 // Don't parallelize this. Memory overhead
579 for s in &mut self.columns {
580 *s = s.rechunk();
581 }
582 self
583 }
584
585 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
586 /// This may lead to more peak memory consumption.
587 pub fn as_single_chunk_par(&mut self) -> &mut Self {
588 if self.columns.iter().any(|c| c.n_chunks() > 1) {
589 self.columns = self._apply_columns_par(&|s| s.rechunk());
590 }
591 self
592 }
593
594 /// Rechunks all columns to only have a single chunk.
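    ///
    /// A small sketch, relying on `vstack_mut` appending the new data as an extra chunk:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2])?;
    /// df.vstack_mut(&df!("x" => [3])?)?;
    /// df.rechunk_mut();
    /// assert_eq!(df.first_col_n_chunks(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```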
595 pub fn rechunk_mut(&mut self) {
596 // SAFETY: We never adjust the length or names of the columns.
597 let columns = unsafe { self.get_columns_mut() };
598
599 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
600 *col = col.rechunk();
601 }
602 }
603
604 pub fn _deshare_views_mut(&mut self) {
605 // SAFETY: We never adjust the length or names of the columns.
606 unsafe {
607 let columns = self.get_columns_mut();
608 for col in columns {
609 let Column::Series(s) = col else { continue };
610
611 if let Ok(ca) = s.binary() {
612 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
613 *col = Column::from(gc_ca.into_series());
614 } else if let Ok(ca) = s.str() {
615 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
616 *col = Column::from(gc_ca.into_series());
617 }
618 }
619 }
620 }
621
622 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
623 pub fn rechunk_to_record_batch(
624 self,
625 compat_level: CompatLevel,
626 ) -> RecordBatchT<Box<dyn Array>> {
627 let height = self.height();
628
629 let (schema, arrays) = self
630 .columns
631 .into_iter()
632 .map(|col| {
633 let mut series = col.take_materialized_series();
634 // Rechunk to one chunk if necessary
635 if series.n_chunks() > 1 {
636 series = series.rechunk();
637 }
638 (
639 series.field().to_arrow(compat_level),
640 series.to_arrow(0, compat_level),
641 )
642 })
643 .collect();
644
645 RecordBatchT::new(height, Arc::new(schema), arrays)
646 }
647
    /// Returns true if the chunks of the columns do not align and re-chunking should be done.
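    ///
    /// A small sketch of a misaligned frame, built by appending to only one of the columns
    /// before combining them (names and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let a = Column::new("a".into(), [1, 2, 3]);
    /// let mut b = Series::new("b".into(), [1, 2]);
    /// // Appending adds a second chunk to "b", so its chunks no longer line up with "a".
    /// b.append(&Series::new("b".into(), [3]))?;
    /// let df = DataFrame::new(vec![a, Column::from(b)])?;
    /// assert!(df.should_rechunk());
    /// # Ok::<(), PolarsError>(())
    /// ```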
649 pub fn should_rechunk(&self) -> bool {
650 // Fast check. It is also needed for correctness, as code below doesn't check if the number
651 // of chunks is equal.
652 if !self
653 .get_columns()
654 .iter()
655 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
656 .all_equal()
657 {
658 return true;
659 }
660
661 // From here we check chunk lengths.
662 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
663 match chunk_lengths.next() {
664 None => false,
665 Some(first_column_chunk_lengths) => {
666 // Fast Path for single Chunk Series
667 if first_column_chunk_lengths.size_hint().0 == 1 {
668 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
669 }
670 // Always rechunk if we have more chunks than rows.
671 // except when we have an empty df containing a single chunk
672 let height = self.height();
673 let n_chunks = first_column_chunk_lengths.size_hint().0;
674 if n_chunks > height && !(height == 0 && n_chunks == 1) {
675 return true;
676 }
677 // Slow Path for multi Chunk series
678 let v: Vec<_> = first_column_chunk_lengths.collect();
679 for cl in chunk_lengths {
680 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
681 return true;
682 }
683 }
684 false
685 },
686 }
687 }
688
689 /// Ensure all the chunks in the [`DataFrame`] are aligned.
690 pub fn align_chunks_par(&mut self) -> &mut Self {
691 if self.should_rechunk() {
692 self.as_single_chunk_par()
693 } else {
694 self
695 }
696 }
697
698 pub fn align_chunks(&mut self) -> &mut Self {
699 if self.should_rechunk() {
700 self.as_single_chunk()
701 } else {
702 self
703 }
704 }
705
706 /// Get the [`DataFrame`] schema.
707 ///
708 /// # Example
709 ///
710 /// ```rust
711 /// # use polars_core::prelude::*;
712 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
713 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
714 ///
715 /// let f1: Field = Field::new("Thing".into(), DataType::String);
716 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
717 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
718 ///
719 /// assert_eq!(&**df.schema(), &sc);
720 /// # Ok::<(), PolarsError>(())
721 /// ```
722 pub fn schema(&self) -> &SchemaRef {
723 let out = self.cached_schema.get_or_init(|| {
724 Arc::new(
725 self.columns
726 .iter()
727 .map(|x| (x.name().clone(), x.dtype().clone()))
728 .collect(),
729 )
730 });
731
732 debug_assert_eq!(out.len(), self.width());
733
734 out
735 }
736
737 /// Get a reference to the [`DataFrame`] columns.
738 ///
739 /// # Example
740 ///
741 /// ```rust
742 /// # use polars_core::prelude::*;
743 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
744 /// "Symbol" => ["A", "C", "G", "T"])?;
745 /// let columns: &[Column] = df.get_columns();
746 ///
747 /// assert_eq!(columns[0].name(), "Name");
748 /// assert_eq!(columns[1].name(), "Symbol");
749 /// # Ok::<(), PolarsError>(())
750 /// ```
751 #[inline]
752 pub fn get_columns(&self) -> &[Column] {
753 &self.columns
754 }
755
756 #[inline]
757 /// Get mutable access to the underlying columns.
758 ///
759 /// # Safety
760 ///
761 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
762 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
763 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
764 /// calling [`DataFrame::clear_schema`].
765 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
766 &mut self.columns
767 }
768
769 #[inline]
770 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
771 pub fn clear_columns(&mut self) {
772 unsafe { self.get_columns_mut() }.clear();
773 self.clear_schema();
774 }
775
776 #[inline]
777 /// Extend the columns without checking for name collisions or height.
778 ///
779 /// # Safety
780 ///
781 /// The caller needs to ensure that:
782 /// - Column names are unique within the resulting [`DataFrame`].
    /// - The length of each appended column matches the height of the [`DataFrame`]. For
    ///   [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
785 /// with [`DataFrame::set_height`].
786 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
787 unsafe { self.get_columns_mut() }.extend(iter);
788 self.clear_schema();
789 }
790
791 /// Take ownership of the underlying columns vec.
792 pub fn take_columns(self) -> Vec<Column> {
793 self.columns
794 }
795
796 /// Iterator over the columns as [`Series`].
797 ///
798 /// # Example
799 ///
800 /// ```rust
801 /// # use polars_core::prelude::*;
802 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
803 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
804 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
805 ///
806 /// let mut iterator = df.iter();
807 ///
808 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
809 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
810 /// assert_eq!(iterator.next(), None);
811 /// # Ok::<(), PolarsError>(())
812 /// ```
813 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
814 self.materialized_column_iter()
815 }
816
817 /// # Example
818 ///
819 /// ```rust
820 /// # use polars_core::prelude::*;
821 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
822 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
823 ///
824 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
825 /// # Ok::<(), PolarsError>(())
826 /// ```
827 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
828 self.columns.iter().map(|s| s.name()).collect()
829 }
830
831 /// Get the [`Vec<PlSmallStr>`] representing the column names.
832 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
833 self.columns.iter().map(|s| s.name().clone()).collect()
834 }
835
836 pub fn get_column_names_str(&self) -> Vec<&str> {
837 self.columns.iter().map(|s| s.name().as_str()).collect()
838 }
839
840 /// Set the column names.
841 /// # Example
842 ///
843 /// ```rust
844 /// # use polars_core::prelude::*;
845 /// let mut df: DataFrame = df!("Mathematical set" => ["ā", "ā¤", "š»", "ā", "ā", "ā"])?;
846 /// df.set_column_names(["Set"])?;
847 ///
848 /// assert_eq!(df.get_column_names(), &["Set"]);
849 /// # Ok::<(), PolarsError>(())
850 /// ```
851 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
852 where
853 I: IntoIterator<Item = S>,
854 S: Into<PlSmallStr>,
855 {
856 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
857 self._set_column_names_impl(names.as_slice())
858 }
859
860 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
861 polars_ensure!(
862 names.len() == self.width(),
863 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
864 names.len(), self.width()
865 );
866 ensure_names_unique(names, |s| s.as_str())?;
867
868 let columns = mem::take(&mut self.columns);
869 self.columns = columns
870 .into_iter()
871 .zip(names)
872 .map(|(s, name)| {
873 let mut s = s;
874 s.rename(name.clone());
875 s
876 })
877 .collect();
878 self.clear_schema();
879 Ok(())
880 }
881
882 /// Get the data types of the columns in the [`DataFrame`].
883 ///
884 /// # Example
885 ///
886 /// ```rust
887 /// # use polars_core::prelude::*;
888 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
889 /// "Fraction" => [0.965, 0.035])?;
890 ///
891 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
892 /// # Ok::<(), PolarsError>(())
893 /// ```
894 pub fn dtypes(&self) -> Vec<DataType> {
895 self.columns.iter().map(|s| s.dtype().clone()).collect()
896 }
897
898 pub(crate) fn first_series_column(&self) -> Option<&Series> {
899 self.columns.iter().find_map(|col| col.as_series())
900 }
901
902 /// The number of chunks for the first column.
903 pub fn first_col_n_chunks(&self) -> usize {
904 match self.first_series_column() {
905 None if self.columns.is_empty() => 0,
906 None => 1,
907 Some(s) => s.n_chunks(),
908 }
909 }
910
911 /// The highest number of chunks for any column.
912 pub fn max_n_chunks(&self) -> usize {
913 self.columns
914 .iter()
915 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
916 .max()
917 .unwrap_or(0)
918 }
919
920 /// Get a reference to the schema fields of the [`DataFrame`].
921 ///
922 /// # Example
923 ///
924 /// ```rust
925 /// # use polars_core::prelude::*;
926 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
927 /// "Fraction" => [0.708, 0.292])?;
928 ///
929 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
930 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
931 ///
932 /// assert_eq!(earth.fields(), &[f1, f2]);
933 /// # Ok::<(), PolarsError>(())
934 /// ```
935 pub fn fields(&self) -> Vec<Field> {
936 self.columns
937 .iter()
938 .map(|s| s.field().into_owned())
939 .collect()
940 }
941
942 /// Get (height, width) of the [`DataFrame`].
943 ///
944 /// # Example
945 ///
946 /// ```rust
947 /// # use polars_core::prelude::*;
948 /// let df0: DataFrame = DataFrame::default();
949 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
950 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
951 /// "2" => [1, 2, 3, 4, 5])?;
952 ///
953 /// assert_eq!(df0.shape(), (0 ,0));
954 /// assert_eq!(df1.shape(), (5, 1));
955 /// assert_eq!(df2.shape(), (5, 2));
956 /// # Ok::<(), PolarsError>(())
957 /// ```
958 pub fn shape(&self) -> (usize, usize) {
959 (self.height, self.columns.len())
960 }
961
962 /// Get the width of the [`DataFrame`] which is the number of columns.
963 ///
964 /// # Example
965 ///
966 /// ```rust
967 /// # use polars_core::prelude::*;
968 /// let df0: DataFrame = DataFrame::default();
969 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
970 /// let df2: DataFrame = df!("Series 1" => [0; 0],
971 /// "Series 2" => [0; 0])?;
972 ///
973 /// assert_eq!(df0.width(), 0);
974 /// assert_eq!(df1.width(), 1);
975 /// assert_eq!(df2.width(), 2);
976 /// # Ok::<(), PolarsError>(())
977 /// ```
978 pub fn width(&self) -> usize {
979 self.columns.len()
980 }
981
982 /// Get the height of the [`DataFrame`] which is the number of rows.
983 ///
984 /// # Example
985 ///
986 /// ```rust
987 /// # use polars_core::prelude::*;
988 /// let df0: DataFrame = DataFrame::default();
989 /// let df1: DataFrame = df!("Currency" => ["ā¬", "$"])?;
990 /// let df2: DataFrame = df!("Currency" => ["ā¬", "$", "Ā„", "Ā£", "āæ"])?;
991 ///
992 /// assert_eq!(df0.height(), 0);
993 /// assert_eq!(df1.height(), 2);
994 /// assert_eq!(df2.height(), 5);
995 /// # Ok::<(), PolarsError>(())
996 /// ```
997 pub fn height(&self) -> usize {
998 self.height
999 }
1000
    /// Returns the size as the number of rows times the number of columns.
1002 pub fn size(&self) -> usize {
1003 let s = self.shape();
1004 s.0 * s.1
1005 }
1006
1007 /// Returns `true` if the [`DataFrame`] contains no rows.
1008 ///
1009 /// # Example
1010 ///
1011 /// ```rust
1012 /// # use polars_core::prelude::*;
1013 /// let df1: DataFrame = DataFrame::default();
1014 /// assert!(df1.is_empty());
1015 ///
1016 /// let df2: DataFrame = df!("First name" => ["Forever"],
1017 /// "Last name" => ["Alone"])?;
1018 /// assert!(!df2.is_empty());
1019 /// # Ok::<(), PolarsError>(())
1020 /// ```
1021 pub fn is_empty(&self) -> bool {
1022 matches!(self.shape(), (0, _) | (_, 0))
1023 }
1024
1025 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1026 ///
1027 /// # Safety
1028 ///
1029 /// This needs to be equal to the length of all the columns.
1030 pub unsafe fn set_height(&mut self, height: usize) {
1031 self.height = height;
1032 }
1033
    /// Add multiple [`Column`]s to a [`DataFrame`].
    /// The added columns must all have the same length as this `DataFrame`.
1036 ///
1037 /// # Example
1038 ///
1039 /// ```rust
1040 /// # use polars_core::prelude::*;
1041 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1042 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1043 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1044 ///
1045 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1046 /// assert_eq!(df2.shape(), (3, 3));
1047 /// println!("{}", df2);
1048 /// # Ok::<(), PolarsError>(())
1049 /// ```
1050 ///
1051 /// Output:
1052 ///
1053 /// ```text
1054 /// shape: (3, 3)
1055 /// +---------+--------+----------+
1056 /// | Element | Proton | Electron |
1057 /// | --- | --- | --- |
1058 /// | str | i32 | i32 |
1059 /// +=========+========+==========+
1060 /// | Copper | 29 | 29 |
1061 /// +---------+--------+----------+
1062 /// | Silver | 47 | 47 |
1063 /// +---------+--------+----------+
1064 /// | Gold | 79 | 79 |
1065 /// +---------+--------+----------+
1066 /// ```
1067 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1068 let mut new_cols = self.columns.clone();
1069 new_cols.extend_from_slice(columns);
1070 DataFrame::new(new_cols)
1071 }
1072
1073 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1074 ///
1075 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1076 ///
1077 /// # Example
1078 ///
1079 /// ```rust
1080 /// # use polars_core::prelude::*;
1081 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1082 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1083 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1084 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1085 ///
1086 /// let df3: DataFrame = df1.vstack(&df2)?;
1087 ///
1088 /// assert_eq!(df3.shape(), (5, 2));
1089 /// println!("{}", df3);
1090 /// # Ok::<(), PolarsError>(())
1091 /// ```
1092 ///
1093 /// Output:
1094 ///
1095 /// ```text
1096 /// shape: (5, 2)
1097 /// +-----------+-------------------+
1098 /// | Element | Melting Point (K) |
1099 /// | --- | --- |
1100 /// | str | f64 |
1101 /// +===========+===================+
1102 /// | Copper | 1357.77 |
1103 /// +-----------+-------------------+
1104 /// | Silver | 1234.93 |
1105 /// +-----------+-------------------+
1106 /// | Gold | 1337.33 |
1107 /// +-----------+-------------------+
1108 /// | Platinum | 2041.4 |
1109 /// +-----------+-------------------+
1110 /// | Palladium | 1828.05 |
1111 /// +-----------+-------------------+
1112 /// ```
1113 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1114 let mut df = self.clone();
1115 df.vstack_mut(other)?;
1116 Ok(df)
1117 }
1118
1119 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1120 ///
1121 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1122 ///
1123 /// # Example
1124 ///
1125 /// ```rust
1126 /// # use polars_core::prelude::*;
1127 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1128 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1129 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1130 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1131 ///
1132 /// df1.vstack_mut(&df2)?;
1133 ///
1134 /// assert_eq!(df1.shape(), (5, 2));
1135 /// println!("{}", df1);
1136 /// # Ok::<(), PolarsError>(())
1137 /// ```
1138 ///
1139 /// Output:
1140 ///
1141 /// ```text
1142 /// shape: (5, 2)
1143 /// +-----------+-------------------+
1144 /// | Element | Melting Point (K) |
1145 /// | --- | --- |
1146 /// | str | f64 |
1147 /// +===========+===================+
1148 /// | Copper | 1357.77 |
1149 /// +-----------+-------------------+
1150 /// | Silver | 1234.93 |
1151 /// +-----------+-------------------+
1152 /// | Gold | 1337.33 |
1153 /// +-----------+-------------------+
1154 /// | Platinum | 2041.4 |
1155 /// +-----------+-------------------+
1156 /// | Palladium | 1828.05 |
1157 /// +-----------+-------------------+
1158 /// ```
1159 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1160 if self.width() != other.width() {
1161 polars_ensure!(
1162 self.width() == 0,
1163 ShapeMismatch:
1164 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1165 self.width(), other.width(),
1166 );
1167 self.columns.clone_from(&other.columns);
1168 self.height = other.height;
1169 return Ok(self);
1170 }
1171
1172 self.columns
1173 .iter_mut()
1174 .zip(other.columns.iter())
1175 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1176 ensure_can_extend(&*left, right)?;
1177 left.append(right).map_err(|e| {
1178 e.context(format!("failed to vstack column '{}'", right.name()).into())
1179 })?;
1180 Ok(())
1181 })?;
1182 self.height += other.height;
1183 Ok(self)
1184 }
1185
1186 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1187 if self.width() != other.width() {
1188 polars_ensure!(
1189 self.width() == 0,
1190 ShapeMismatch:
1191 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1192 self.width(), other.width(),
1193 );
1194 self.columns = other.columns;
1195 self.height = other.height;
1196 return Ok(self);
1197 }
1198
1199 self.columns
1200 .iter_mut()
1201 .zip(other.columns.into_iter())
1202 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1203 ensure_can_extend(&*left, &right)?;
1204 let right_name = right.name().clone();
1205 left.append_owned(right).map_err(|e| {
1206 e.context(format!("failed to vstack column '{right_name}'").into())
1207 })?;
1208 Ok(())
1209 })?;
1210 self.height += other.height;
1211 Ok(self)
1212 }
1213
1214 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1215 ///
1216 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1217 ///
1218 /// # Panics
    /// Panics if the schemas don't match.
1220 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1221 self.columns
1222 .iter_mut()
1223 .zip(other.columns.iter())
1224 .for_each(|(left, right)| {
1225 left.append(right)
1226 .map_err(|e| {
1227 e.context(format!("failed to vstack column '{}'", right.name()).into())
1228 })
1229 .expect("should not fail");
1230 });
1231 self.height += other.height;
1232 }
1233
1234 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1235 ///
1236 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1237 ///
1238 /// # Panics
    /// Panics if the schemas don't match.
1240 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1241 self.columns
1242 .iter_mut()
1243 .zip(other.columns)
1244 .for_each(|(left, right)| {
1245 left.append_owned(right).expect("should not fail");
1246 });
1247 self.height += other.height;
1248 }
1249
1250 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1251 ///
    /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1253 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1254 ///
1255 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1256 /// and thus will yield faster queries.
1257 ///
1258 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1259 /// online operations where you add `n` rows and rerun a query.
1260 ///
1261 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1263 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
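    ///
    /// A minimal usage sketch (column name and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df1 = df!("x" => [1, 2])?;
    /// let df2 = df!("x" => [3, 4])?;
    /// df1.extend(&df2)?;
    /// assert_eq!(df1.height(), 4);
    /// # Ok::<(), PolarsError>(())
    /// ```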
1264 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1265 polars_ensure!(
1266 self.width() == other.width(),
1267 ShapeMismatch:
1268 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1269 self.width(), other.width(),
1270 );
1271
1272 self.columns
1273 .iter_mut()
1274 .zip(other.columns.iter())
1275 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1276 ensure_can_extend(&*left, right)?;
1277 left.extend(right).map_err(|e| {
1278 e.context(format!("failed to extend column '{}'", right.name()).into())
1279 })?;
1280 Ok(())
1281 })?;
1282 self.height += other.height;
1283 self.clear_schema();
1284 Ok(())
1285 }
1286
1287 /// Remove a column by name and return the column removed.
1288 ///
1289 /// # Example
1290 ///
1291 /// ```rust
1292 /// # use polars_core::prelude::*;
1293 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1294 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1295 ///
1296 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1297 /// assert!(s1.is_err());
1298 ///
1299 /// let s2: Column = df.drop_in_place("Animal")?;
1300 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1301 /// # Ok::<(), PolarsError>(())
1302 /// ```
1303 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1304 let idx = self.check_name_to_idx(name)?;
1305 self.clear_schema();
1306 Ok(self.columns.remove(idx))
1307 }
1308
    /// Return a new [`DataFrame`] where all rows that contain null values are dropped.
1310 ///
1311 /// # Example
1312 ///
1313 /// ```no_run
1314 /// # use polars_core::prelude::*;
1315 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1316 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1317 /// assert_eq!(df1.shape(), (3, 2));
1318 ///
1319 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1320 /// assert_eq!(df2.shape(), (1, 2));
1321 /// println!("{}", df2);
1322 /// # Ok::<(), PolarsError>(())
1323 /// ```
1324 ///
1325 /// Output:
1326 ///
1327 /// ```text
1328 /// shape: (1, 2)
1329 /// +---------+---------------------+
1330 /// | Country | Tax revenue (% GDP) |
1331 /// | --- | --- |
1332 /// | str | f64 |
1333 /// +=========+=====================+
1334 /// | Malta | 32.7 |
1335 /// +---------+---------------------+
1336 /// ```
1337 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1338 where
1339 for<'a> &'a S: Into<PlSmallStr>,
1340 {
1341 if let Some(v) = subset {
1342 let v = self.select_columns(v)?;
1343 self._drop_nulls_impl(v.as_slice())
1344 } else {
1345 self._drop_nulls_impl(self.columns.as_slice())
1346 }
1347 }
1348
1349 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1350 // fast path for no nulls in df
1351 if subset.iter().all(|s| !s.has_nulls()) {
1352 return Ok(self.clone());
1353 }
1354
1355 let mut iter = subset.iter();
1356
1357 let mask = iter
1358 .next()
1359 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1360 let mut mask = mask.is_not_null();
1361
1362 for c in iter {
1363 mask = mask & c.is_not_null();
1364 }
1365 self.filter(&mask)
1366 }
1367
1368 /// Drop a column by name.
1369 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1370 /// the current one in place.
1371 ///
1372 /// # Example
1373 ///
1374 /// ```rust
1375 /// # use polars_core::prelude::*;
1376 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1377 /// let df2: DataFrame = df1.drop("Ray type")?;
1378 ///
1379 /// assert!(df2.is_empty());
1380 /// # Ok::<(), PolarsError>(())
1381 /// ```
1382 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1383 let idx = self.check_name_to_idx(name)?;
1384 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1385
1386 self.columns.iter().enumerate().for_each(|(i, s)| {
1387 if i != idx {
1388 new_cols.push(s.clone())
1389 }
1390 });
1391
1392 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1393 }
1394
1395 /// Drop columns that are in `names`.
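    ///
    /// A minimal sketch (column names are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
    /// let dropped = df.drop_many(["a", "c"]);
    /// assert_eq!(dropped.get_column_names(), &["b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```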
1396 pub fn drop_many<I, S>(&self, names: I) -> Self
1397 where
1398 I: IntoIterator<Item = S>,
1399 S: Into<PlSmallStr>,
1400 {
1401 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1402 self.drop_many_amortized(&names)
1403 }
1404
1405 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1406 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1407 if names.is_empty() {
1408 return self.clone();
1409 }
1410 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1411 self.columns.iter().for_each(|s| {
1412 if !names.contains(s.name()) {
1413 new_cols.push(s.clone())
1414 }
1415 });
1416
1417 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1418 }
1419
1420 /// Insert a new column at a given index without checking for duplicates.
    /// This can leave the [`DataFrame`] in an invalid state.
1422 fn insert_column_no_name_check(
1423 &mut self,
1424 index: usize,
1425 column: Column,
1426 ) -> PolarsResult<&mut Self> {
1427 polars_ensure!(
1428 self.width() == 0 || column.len() == self.height(),
1429 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1430 column.len(), self.height(),
1431 );
1432
1433 if self.width() == 0 {
1434 self.height = column.len();
1435 }
1436
1437 self.columns.insert(index, column);
1438 self.clear_schema();
1439 Ok(self)
1440 }
1441
1442 /// Insert a new column at a given index.
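    ///
    /// A minimal sketch (names and values are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
    /// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
    /// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
    /// # Ok::<(), PolarsError>(())
    /// ```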
1443 pub fn insert_column<S: IntoColumn>(
1444 &mut self,
1445 index: usize,
1446 column: S,
1447 ) -> PolarsResult<&mut Self> {
1448 let column = column.into_column();
1449 self.check_already_present(column.name().as_str())?;
1450 self.insert_column_no_name_check(index, column)
1451 }
1452
1453 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1454 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1455 self.replace_column(idx, column)?;
1456 } else {
1457 if self.width() == 0 {
1458 self.height = column.len();
1459 }
1460
1461 self.columns.push(column);
1462 self.clear_schema();
1463 }
1464 Ok(())
1465 }
1466
1467 /// Add a new column to this [`DataFrame`] or replace an existing one.
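    ///
    /// A minimal sketch; note that a length-1 column is broadcast to the frame's height:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2, 3])?;
    /// df.with_column(Column::new("b".into(), [10]))?;
    /// assert_eq!(df.column("b")?.len(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```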
1468 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1469 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1470 let height = df.height();
1471 if column.len() == 1 && height > 1 {
1472 column = column.new_from_index(0, height);
1473 }
1474
1475 if column.len() == height || df.get_columns().is_empty() {
1476 df.add_column_by_search(column)?;
1477 Ok(df)
1478 }
1479 // special case for literals
1480 else if height == 0 && column.len() == 1 {
1481 let s = column.clear();
1482 df.add_column_by_search(s)?;
1483 Ok(df)
1484 } else {
1485 polars_bail!(
1486 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1487 column.len(), height,
1488 );
1489 }
1490 }
1491 let column = column.into_column();
1492 inner(self, column)
1493 }
1494
1495 /// Adds a column to the [`DataFrame`] without doing any checks
1496 /// on length or duplicates.
1497 ///
1498 /// # Safety
1499 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1500 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1501 debug_assert!(self.width() == 0 || self.height() == column.len());
1502 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1503
1504 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1505 // properly for `width` == 0.
1506 if self.width() == 0 {
1507 unsafe { self.set_height(column.len()) };
1508 }
1509 unsafe { self.get_columns_mut() }.push(column);
1510 self.clear_schema();
1511
1512 self
1513 }
1514
    // Note: `schema` may be either the input schema or the output schema.
1516 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1517 let name = c.name();
1518 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1519 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1520 // Given schema is output_schema and we can push.
1521 if idx == self.columns.len() {
1522 if self.width() == 0 {
1523 self.height = c.len();
1524 }
1525
1526 self.columns.push(c);
1527 self.clear_schema();
1528 }
                // Schema is incorrect; fall back to search.
1530 else {
1531 debug_assert!(false);
1532 self.add_column_by_search(c)?;
1533 }
1534 } else {
1535 self.replace_column(idx, c)?;
1536 }
1537 } else {
1538 if self.width() == 0 {
1539 self.height = c.len();
1540 }
1541
1542 self.columns.push(c);
1543 self.clear_schema();
1544 }
1545
1546 Ok(())
1547 }
1548
    // Note: `schema` may be either the input schema or the output schema.
1550 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1551 for (i, s) in series.into_iter().enumerate() {
1552 // we need to branch here
1553 // because users can add multiple columns with the same name
1554 if i == 0 || schema.get(s.name().as_str()).is_some() {
1555 self.with_column_and_schema(s.into_column(), schema)?;
1556 } else {
1557 self.with_column(s.clone().into_column())?;
1558 }
1559 }
1560 Ok(())
1561 }
1562
1563 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1564 for (i, s) in columns.into_iter().enumerate() {
1565 // we need to branch here
1566 // because users can add multiple columns with the same name
1567 if i == 0 || schema.get(s.name().as_str()).is_some() {
1568 self.with_column_and_schema(s, schema)?;
1569 } else {
1570 self.with_column(s.clone())?;
1571 }
1572 }
1573
1574 Ok(())
1575 }
1576
1577 /// Add a new column to this [`DataFrame`] or replace an existing one.
1578 /// Uses an existing schema to amortize lookups.
    /// If the schema is incorrect, we will fall back to linear search.
1580 ///
    /// Note: `schema` may be either the input schema or the output schema.
1582 pub fn with_column_and_schema<C: IntoColumn>(
1583 &mut self,
1584 column: C,
1585 schema: &Schema,
1586 ) -> PolarsResult<&mut Self> {
1587 let mut column = column.into_column();
1588
1589 let height = self.height();
1590 if column.len() == 1 && height > 1 {
1591 column = column.new_from_index(0, height);
1592 }
1593
1594 if column.len() == height || self.columns.is_empty() {
1595 self.add_column_by_schema(column, schema)?;
1596 Ok(self)
1597 }
1598 // special case for literals
1599 else if height == 0 && column.len() == 1 {
1600 let s = column.clear();
1601 self.add_column_by_schema(s, schema)?;
1602 Ok(self)
1603 } else {
1604 polars_bail!(
1605 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1606 column.len(), height,
1607 );
1608 }
1609 }
1610
1611 /// Get a row in the [`DataFrame`]. Beware this is slow.
1612 ///
1613 /// # Example
1614 ///
1615 /// ```
1616 /// # use polars_core::prelude::*;
1617 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1618 /// df.get(idx)
1619 /// }
1620 /// ```
1621 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1622 match self.columns.first() {
1623 Some(s) => {
1624 if s.len() <= idx {
1625 return None;
1626 }
1627 },
1628 None => return None,
1629 }
1630 // SAFETY: we just checked bounds
1631 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1632 }
1633
1634 /// Select a [`Series`] by index.
1635 ///
1636 /// # Example
1637 ///
1638 /// ```rust
1639 /// # use polars_core::prelude::*;
1640 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1641 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1642 ///
1643 /// let s1: Option<&Column> = df.select_at_idx(0);
1644 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1645 ///
1646 /// assert_eq!(s1, Some(&s2));
1647 /// # Ok::<(), PolarsError>(())
1648 /// ```
1649 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1650 self.columns.get(idx)
1651 }
1652
1653 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1654 ///
1655 /// # Examples
1656 ///
1657 /// ```rust
1658 /// # use polars_core::prelude::*;
1659 /// let df = df! {
1660 /// "0" => [0, 0, 0],
1661 /// "1" => [1, 1, 1],
1662 /// "2" => [2, 2, 2]
1663 /// }?;
1664 ///
1665 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1666 /// assert!(df.equals(&df.select_by_range(..)?));
1667 /// # Ok::<(), PolarsError>(())
1668 /// ```
1669 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1670 where
1671 R: ops::RangeBounds<usize>,
1672 {
1673 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
        // because it is a nightly-only feature. We should switch to it once it is stabilized.
1675 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1676 where
1677 R: ops::RangeBounds<usize>,
1678 {
1679 let len = bounds.end;
1680
1681 let start: ops::Bound<&usize> = range.start_bound();
1682 let start = match start {
1683 ops::Bound::Included(&start) => start,
1684 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1685 panic!("attempted to index slice from after maximum usize");
1686 }),
1687 ops::Bound::Unbounded => 0,
1688 };
1689
1690 let end: ops::Bound<&usize> = range.end_bound();
1691 let end = match end {
1692 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1693 panic!("attempted to index slice up to maximum usize");
1694 }),
1695 ops::Bound::Excluded(&end) => end,
1696 ops::Bound::Unbounded => len,
1697 };
1698
1699 if start > end {
1700 panic!("slice index starts at {start} but ends at {end}");
1701 }
1702 if end > len {
1703 panic!("range end index {end} out of range for slice of length {len}",);
1704 }
1705
1706 ops::Range { start, end }
1707 }
1708
1709 let colnames = self.get_column_names_owned();
1710 let range = get_range(range, ..colnames.len());
1711
1712 self._select_impl(&colnames[range])
1713 }
1714
1715 /// Get column index of a [`Series`] by name.
1716 /// # Example
1717 ///
1718 /// ```rust
1719 /// # use polars_core::prelude::*;
1720 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1721 /// "Health" => [100, 200, 500],
1722 /// "Mana" => [250, 100, 0],
1723 /// "Strength" => [30, 150, 300])?;
1724 ///
1725 /// assert_eq!(df.get_column_index("Name"), Some(0));
1726 /// assert_eq!(df.get_column_index("Health"), Some(1));
1727 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1728 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1729 /// assert_eq!(df.get_column_index("Haste"), None);
1730 /// # Ok::<(), PolarsError>(())
1731 /// ```
1732 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1733 let schema = self.schema();
1734 if let Some(idx) = schema.index_of(name) {
1735 if self
1736 .get_columns()
1737 .get(idx)
1738 .is_some_and(|c| c.name() == name)
1739 {
1740 return Some(idx);
1741 }
1742 }
1743
1744 self.columns.iter().position(|s| s.name().as_str() == name)
1745 }
1746
1747 /// Get column index of a [`Series`] by name.
1748 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1749 self.get_column_index(name)
1750 .ok_or_else(|| polars_err!(col_not_found = name))
1751 }
1752
1753 /// Select a single column by name.
1754 ///
1755 /// # Example
1756 ///
1757 /// ```rust
1758 /// # use polars_core::prelude::*;
1759 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1760 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1761 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1762 ///
1763 /// assert_eq!(df.column("Password")?, &s1);
1764 /// # Ok::<(), PolarsError>(())
1765 /// ```
1766 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1767 let idx = self.try_get_column_index(name)?;
1768 Ok(self.select_at_idx(idx).unwrap())
1769 }
1770
    /// Select multiple columns by name.
1772 ///
1773 /// # Example
1774 ///
1775 /// ```rust
1776 /// # use polars_core::prelude::*;
1777 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1778 /// "Max weight (kg)" => [16.0, 35.89])?;
1779 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1780 ///
1781 /// assert_eq!(&df[0], sv[0]);
1782 /// assert_eq!(&df[1], sv[1]);
1783 /// # Ok::<(), PolarsError>(())
1784 /// ```
1785 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1786 where
1787 I: IntoIterator<Item = S>,
1788 S: AsRef<str>,
1789 {
1790 names
1791 .into_iter()
1792 .map(|name| self.column(name.as_ref()))
1793 .collect()
1794 }
1795
1796 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1797 ///
1798 /// # Examples
1799 ///
1800 /// ```
1801 /// # use polars_core::prelude::*;
1802 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1803 /// df.select(["foo", "bar"])
1804 /// }
1805 /// ```
1806 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1807 where
1808 I: IntoIterator<Item = S>,
1809 S: Into<PlSmallStr>,
1810 {
1811 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1812 self._select_impl(cols.as_slice())
1813 }
1814
1815 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1816 ensure_names_unique(cols, |s| s.as_str())?;
1817 self._select_impl_unchecked(cols)
1818 }
1819
1820 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1821 let selected = self.select_columns_impl(cols)?;
1822 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1823 }
1824
1825 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1826 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1827 where
1828 I: IntoIterator<Item = S>,
1829 S: Into<PlSmallStr>,
1830 {
1831 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1832 self._select_with_schema_impl(&cols, schema, true)
1833 }
1834
1835 /// Select with a known schema without checking for duplicates in `selection`.
1836 /// The schema names must match the column names of this DataFrame.
1837 pub fn select_with_schema_unchecked<I, S>(
1838 &self,
1839 selection: I,
1840 schema: &Schema,
1841 ) -> PolarsResult<Self>
1842 where
1843 I: IntoIterator<Item = S>,
1844 S: Into<PlSmallStr>,
1845 {
1846 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1847 self._select_with_schema_impl(&cols, schema, false)
1848 }
1849
1850    /// The schema names must match the column names of this DataFrame.
1851 pub fn _select_with_schema_impl(
1852 &self,
1853 cols: &[PlSmallStr],
1854 schema: &Schema,
1855 check_duplicates: bool,
1856 ) -> PolarsResult<Self> {
1857 if check_duplicates {
1858 ensure_names_unique(cols, |s| s.as_str())?;
1859 }
1860
1861 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1862 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1863 }
1864
1865    /// A non-generic implementation to reduce compiler bloat.
1866 fn select_columns_impl_with_schema(
1867 &self,
1868 cols: &[PlSmallStr],
1869 schema: &Schema,
1870 ) -> PolarsResult<Vec<Column>> {
1871 if cfg!(debug_assertions) {
1872 ensure_matching_schema_names(schema, self.schema())?;
1873 }
1874
1875 cols.iter()
1876 .map(|name| {
1877 let index = schema.try_get_full(name.as_str())?.0;
1878 Ok(self.columns[index].clone())
1879 })
1880 .collect()
1881 }
1882
1883 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1884 where
1885 I: IntoIterator<Item = S>,
1886 S: Into<PlSmallStr>,
1887 {
1888 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1889 self.select_physical_impl(&cols)
1890 }
1891
1892 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1893 ensure_names_unique(cols, |s| s.as_str())?;
1894 let selected = self.select_columns_physical_impl(cols)?;
1895 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1896 }
1897
1898 pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1899 let from = self.schema();
1900 let columns = to
1901 .iter_names()
1902 .map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))
1903 .collect::<PolarsResult<Vec<_>>>()?;
1904 let mut df = unsafe { Self::new_no_checks(self.height(), columns) };
1905 df.cached_schema = to.into();
1906 Ok(df)
1907 }
1908
1909 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1910 ///
1911 /// # Example
1912 ///
1913 /// ```rust
1914 /// # use polars_core::prelude::*;
1915 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1916 /// "Carbon" => [1, 2, 3],
1917 /// "Hydrogen" => [4, 6, 8])?;
1918 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1919 ///
1920 /// assert_eq!(df["Carbon"], sv[0]);
1921 /// assert_eq!(df["Hydrogen"], sv[1]);
1922 /// # Ok::<(), PolarsError>(())
1923 /// ```
1924 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1925 let cols = selection.into_vec();
1926 self.select_columns_impl(&cols)
1927 }
1928
1929 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1930 self.columns
1931 .iter()
1932 .enumerate()
1933 .map(|(i, s)| (s.name().as_str(), i))
1934 .collect()
1935 }
1936
1937    /// A non-generic implementation to reduce compiler bloat.
1938 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1939 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1940 let name_to_idx = self._names_to_idx_map();
1941 cols.iter()
1942 .map(|name| {
1943 let idx = *name_to_idx
1944 .get(name.as_str())
1945 .ok_or_else(|| polars_err!(col_not_found = name))?;
1946 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1947 })
1948 .collect::<PolarsResult<Vec<_>>>()?
1949 } else {
1950 cols.iter()
1951 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1952 .collect::<PolarsResult<Vec<_>>>()?
1953 };
1954
1955 Ok(selected)
1956 }
1957
1958    /// A non-generic implementation to reduce compiler bloat.
1959 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1960 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1961            // We hash because some users have millions of columns.
1962 // # https://github.com/pola-rs/polars/issues/1023
1963 let name_to_idx = self._names_to_idx_map();
1964
1965 cols.iter()
1966 .map(|name| {
1967 let idx = *name_to_idx
1968 .get(name.as_str())
1969 .ok_or_else(|| polars_err!(col_not_found = name))?;
1970 Ok(self.select_at_idx(idx).unwrap().clone())
1971 })
1972 .collect::<PolarsResult<Vec<_>>>()?
1973 } else {
1974 cols.iter()
1975 .map(|c| self.column(c.as_str()).cloned())
1976 .collect::<PolarsResult<Vec<_>>>()?
1977 };
1978
1979 Ok(selected)
1980 }
1981
1982 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1983        // If there is a filtered column, its length gives the new height.
1984 if let Some(fst) = filtered.first() {
1985 return fst.len();
1986 }
1987
1988 // Otherwise, count the number of values that would be filtered and return that height.
1989 let num_trues = mask.num_trues();
1990 if mask.len() == self.height() {
1991 num_trues
1992 } else {
1993 // This is for broadcasting masks
1994 debug_assert!(num_trues == 0 || num_trues == 1);
1995 self.height() * num_trues
1996 }
1997 }
1998
1999 /// Take the [`DataFrame`] rows by a boolean mask.
2000 ///
2001 /// # Example
2002 ///
2003 /// ```
2004 /// # use polars_core::prelude::*;
2005 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2006 /// let mask = df.column("sepal_width")?.is_not_null();
2007 /// df.filter(&mask)
2008 /// }
2009 /// ```
2010 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2011 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2012 let height = self.filter_height(&new_col, mask);
2013
2014 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2015 }
2016
2017 /// Same as `filter` but does not parallelize.
2018 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2019 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2020 let height = self.filter_height(&new_col, mask);
2021
2022 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2023 }
2024
2025 /// Take [`DataFrame`] rows by index values.
2026 ///
2027 /// # Example
2028 ///
2029 /// ```
2030 /// # use polars_core::prelude::*;
2031 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2032 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2033 /// df.take(&idx)
2034 /// }
2035 /// ```
2036 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2037 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2038
2039 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2040 }
2041
2042 /// # Safety
2043 /// The indices must be in-bounds.
2044 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2045 self.take_unchecked_impl(idx, true)
2046 }
2047
2048 /// # Safety
2049 /// The indices must be in-bounds.
2050 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2051 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2052 POOL.install(|| {
2053 if POOL.current_num_threads() > self.width() {
2054 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2055 self._apply_columns_par(&|c| {
2056 (0..idx.len().div_ceil(stride))
2057 .into_par_iter()
2058 .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
2059 .reduce(
2060 || Column::new_empty(c.name().clone(), c.dtype()),
2061 |mut a, b| {
2062 a.append_owned(b).unwrap();
2063 a
2064 },
2065 )
2066 })
2067 } else {
2068 self._apply_columns_par(&|c| c.take_unchecked(idx))
2069 }
2070 })
2071 } else {
2072 self._apply_columns(&|s| s.take_unchecked(idx))
2073 };
2074 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2075 }
2076
2077 /// # Safety
2078 /// The indices must be in-bounds.
2079 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2080 self.take_slice_unchecked_impl(idx, true)
2081 }
2082
2083 /// # Safety
2084 /// The indices must be in-bounds.
2085 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2086 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2087 POOL.install(|| {
2088 if POOL.current_num_threads() > self.width() {
2089 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2090 self._apply_columns_par(&|c| {
2091 (0..idx.len().div_ceil(stride))
2092 .into_par_iter()
2093 .map(|i| {
2094 let idx = &idx[i * stride..];
2095 let idx = &idx[..idx.len().min(stride)];
2096 c.take_slice_unchecked(idx)
2097 })
2098 .reduce(
2099 || Column::new_empty(c.name().clone(), c.dtype()),
2100 |mut a, b| {
2101 a.append_owned(b).unwrap();
2102 a
2103 },
2104 )
2105 })
2106 } else {
2107 self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2108 }
2109 })
2110 } else {
2111 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2112 };
2113 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2114 }
2115
2116 /// Rename a column in the [`DataFrame`].
2117 ///
2118 /// # Example
2119 ///
2120 /// ```
2121 /// # use polars_core::prelude::*;
2122 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2123 /// let original_name = "foo";
2124 /// let new_name = "bar";
2125 /// df.rename(original_name, new_name.into())
2126 /// }
2127 /// ```
2128 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2129 if column == name.as_str() {
2130 return Ok(self);
2131 }
2132 polars_ensure!(
2133 !self.schema().contains(&name),
2134 Duplicate: "column rename attempted with already existing name \"{name}\""
2135 );
2136
2137 self.get_column_index(column)
2138 .and_then(|idx| self.columns.get_mut(idx))
2139 .ok_or_else(|| polars_err!(col_not_found = column))
2140 .map(|c| c.rename(name))?;
2141 self.clear_schema();
2142
2143 Ok(self)
2144 }
2145
2146 /// Sort [`DataFrame`] in place.
2147 ///
2148 /// See [`DataFrame::sort`] for more instruction.
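    ///
    /// # Example
    ///
    /// A minimal sketch in the style of the [`DataFrame::sort`] examples (the column name is illustrative):
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn sort_by_a(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
    ///     df.sort_in_place(["a"], SortMultipleOptions::default())
    /// }
    /// ```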
2149 pub fn sort_in_place(
2150 &mut self,
2151 by: impl IntoVec<PlSmallStr>,
2152 sort_options: SortMultipleOptions,
2153 ) -> PolarsResult<&mut Self> {
2154 let by_column = self.select_columns(by)?;
2155 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2156 Ok(self)
2157 }
2158
2159 #[doc(hidden)]
2160 /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
2161 pub fn sort_impl(
2162 &self,
2163 by_column: Vec<Column>,
2164 mut sort_options: SortMultipleOptions,
2165 slice: Option<(i64, usize)>,
2166 ) -> PolarsResult<Self> {
2167 if by_column.is_empty() {
2168 // If no columns selected, any order (including original order) is correct.
2169 return if let Some((offset, len)) = slice {
2170 Ok(self.slice(offset, len))
2171 } else {
2172 Ok(self.clone())
2173 };
2174 }
2175
2176        // Note that the by_column argument may also contain evaluated expressions from
2177        // polars-lazy that are not present in this dataframe. Therefore, when we try to
2178        // set the first column as sorted, we ignore the error, as such expressions are
2179        // not present (they are renamed to _POLARS_SORT_COLUMN_i).
2180 let first_descending = sort_options.descending[0];
2181 let first_by_column = by_column[0].name().to_string();
2182
2183 let set_sorted = |df: &mut DataFrame| {
2184 // Mark the first sort column as sorted; if the column does not exist it
2185 // is ok, because we sorted by an expression not present in the dataframe
2186 let _ = df.apply(&first_by_column, |s| {
2187 let mut s = s.clone();
2188 if first_descending {
2189 s.set_sorted_flag(IsSorted::Descending)
2190 } else {
2191 s.set_sorted_flag(IsSorted::Ascending)
2192 }
2193 s
2194 });
2195 };
2196 if self.is_empty() {
2197 let mut out = self.clone();
2198 set_sorted(&mut out);
2199 return Ok(out);
2200 }
2201
2202 if let Some((0, k)) = slice {
2203 if k < self.len() {
2204 return self.bottom_k_impl(k, by_column, sort_options);
2205 }
2206 }
2207 // Check if the required column is already sorted; if so we can exit early
2208        // We can only do so when there is a single column to sort by; for multiple
2209        // columns it would be complicated to check.
2210 #[cfg(feature = "dtype-categorical")]
2211 let is_not_categorical_enum =
2212 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2213 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2214
2215 #[cfg(not(feature = "dtype-categorical"))]
2216 #[allow(non_upper_case_globals)]
2217 const is_not_categorical_enum: bool = true;
2218
2219 if by_column.len() == 1 && is_not_categorical_enum {
2220 let required_sorting = if sort_options.descending[0] {
2221 IsSorted::Descending
2222 } else {
2223 IsSorted::Ascending
2224 };
2225            // If the null count is 0 then nulls_last doesn't matter.
2226            // It is safe to get the value at the last position since the dataframe is not empty (handled above).
2227 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2228 && ((by_column[0].null_count() == 0)
2229 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2230 == sort_options.nulls_last[0]);
2231
2232 if no_sorting_required {
2233 return if let Some((offset, len)) = slice {
2234 Ok(self.slice(offset, len))
2235 } else {
2236 Ok(self.clone())
2237 };
2238 }
2239 }
2240
2241 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2242
2243 // a lot of indirection in both sorting and take
2244 let mut df = self.clone();
2245 let df = df.as_single_chunk_par();
2246 let mut take = match (by_column.len(), has_nested) {
2247 (1, false) => {
2248 let s = &by_column[0];
2249 let options = SortOptions {
2250 descending: sort_options.descending[0],
2251 nulls_last: sort_options.nulls_last[0],
2252 multithreaded: sort_options.multithreaded,
2253 maintain_order: sort_options.maintain_order,
2254 limit: sort_options.limit,
2255 };
2256 // fast path for a frame with a single series
2257 // no need to compute the sort indices and then take by these indices
2258 // simply sort and return as frame
2259 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2260 let mut out = s.sort_with(options)?;
2261 if let Some((offset, len)) = slice {
2262 out = out.slice(offset, len);
2263 }
2264 return Ok(out.into_frame());
2265 }
2266 s.arg_sort(options)
2267 },
2268 _ => {
2269 if sort_options.nulls_last.iter().all(|&x| x)
2270 || has_nested
2271 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2272 {
2273 argsort_multiple_row_fmt(
2274 &by_column,
2275 sort_options.descending,
2276 sort_options.nulls_last,
2277 sort_options.multithreaded,
2278 )?
2279 } else {
2280 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2281 first
2282 .as_materialized_series()
2283 .arg_sort_multiple(&other, &sort_options)?
2284 }
2285 },
2286 };
2287
2288 if let Some((offset, len)) = slice {
2289 take = take.slice(offset, len);
2290 }
2291
2292 // SAFETY:
2293 // the created indices are in bounds
2294 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2295 set_sorted(&mut df);
2296 Ok(df)
2297 }
2298
2299 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2300 ///
2301 /// This dataframe does not necessarily have a specified schema and may be changed at any
2302 /// point. It is primarily used for debugging.
2303 pub fn _to_metadata(&self) -> DataFrame {
2304 let num_columns = self.columns.len();
2305
2306 let mut column_names =
2307 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2308 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2309 let mut sorted_asc_ca =
2310 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2311 let mut sorted_dsc_ca =
2312 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2313 let mut fast_explode_list_ca =
2314 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2315 let mut materialized_at_ca =
2316 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2317
2318 for col in &self.columns {
2319 let flags = col.get_flags();
2320
2321 let (repr, materialized_at) = match col {
2322 Column::Series(s) => ("series", s.materialized_at()),
2323 Column::Partitioned(_) => ("partitioned", None),
2324 Column::Scalar(_) => ("scalar", None),
2325 };
2326 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2327 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2328 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2329
2330 column_names.append_value(col.name().clone());
2331 repr_ca.append_value(repr);
2332 sorted_asc_ca.append_value(sorted_asc);
2333 sorted_dsc_ca.append_value(sorted_dsc);
2334 fast_explode_list_ca.append_value(fast_explode_list);
2335 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2336 }
2337
2338 unsafe {
2339 DataFrame::new_no_checks(
2340 self.width(),
2341 vec![
2342 column_names.finish().into_column(),
2343 repr_ca.finish().into_column(),
2344 sorted_asc_ca.finish().into_column(),
2345 sorted_dsc_ca.finish().into_column(),
2346 fast_explode_list_ca.finish().into_column(),
2347 materialized_at_ca.finish().into_column(),
2348 ],
2349 )
2350 }
2351 }
2352
2353 /// Return a sorted clone of this [`DataFrame`].
2354 ///
2355    /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2356 /// # Example
2357 ///
2358 /// Sort by a single column with default options:
2359 /// ```
2360 /// # use polars_core::prelude::*;
2361 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2362 /// df.sort(["sepal_width"], Default::default())
2363 /// }
2364 /// ```
2365 /// Sort by a single column with specific order:
2366 /// ```
2367 /// # use polars_core::prelude::*;
2368 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2369 /// df.sort(
2370 /// ["sepal_width"],
2371 /// SortMultipleOptions::new()
2372 /// .with_order_descending(descending)
2373 /// )
2374 /// }
2375 /// ```
2376    /// Sort by multiple columns, specifying the order for each column:
2377 /// ```
2378 /// # use polars_core::prelude::*;
2379 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2380 /// df.sort(
2381 /// ["sepal_width", "sepal_length"],
2382 /// SortMultipleOptions::new()
2383 /// .with_order_descending_multi([false, true])
2384 /// )
2385 /// }
2386 /// ```
2387 /// See [`SortMultipleOptions`] for more options.
2388 ///
2389 /// Also see [`DataFrame::sort_in_place`].
2390 pub fn sort(
2391 &self,
2392 by: impl IntoVec<PlSmallStr>,
2393 sort_options: SortMultipleOptions,
2394 ) -> PolarsResult<Self> {
2395 let mut df = self.clone();
2396 df.sort_in_place(by, sort_options)?;
2397 Ok(df)
2398 }
2399
2400 /// Replace a column with a [`Series`].
2401 ///
2402 /// # Example
2403 ///
2404 /// ```rust
2405 /// # use polars_core::prelude::*;
2406 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2407 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2408 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2409 ///
2410 /// assert!(df.replace("Nation", s.clone()).is_err());
2411 /// assert!(df.replace("Country", s).is_ok());
2412 /// # Ok::<(), PolarsError>(())
2413 /// ```
2414 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2415 self.apply(column, |_| new_col.into_series())
2416 }
2417
2418 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2419    /// is that the `column` argument determines the name of the column and not the name
2420 /// of the `Series` passed to this method.
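    ///
    /// # Example
    ///
    /// A minimal sketch (the column and `Series` names are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df: DataFrame = df!("a" => [1, 2, 3])?;
    /// // The Series is named "ignored", but it is added under the name "b".
    /// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
    /// assert_eq!(df.shape(), (3, 2));
    /// assert!(df.column("b").is_ok());
    /// # Ok::<(), PolarsError>(())
    /// ```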
2421 pub fn replace_or_add<S: IntoSeries>(
2422 &mut self,
2423 column: PlSmallStr,
2424 new_col: S,
2425 ) -> PolarsResult<&mut Self> {
2426 let mut new_col = new_col.into_series();
2427 new_col.rename(column);
2428 self.with_column(new_col)
2429 }
2430
2431 /// Replace column at index `idx` with a [`Series`].
2432 ///
2433 /// # Example
2434 ///
2435    /// ```ignore
2436 /// # use polars_core::prelude::*;
2437 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2438 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2439 /// let mut df = DataFrame::new(vec![s0, s1])?;
2440 ///
2441 /// // Add 32 to get lowercase ascii values
2442 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2443 /// # Ok::<(), PolarsError>(())
2444 /// ```
2445 pub fn replace_column<C: IntoColumn>(
2446 &mut self,
2447 index: usize,
2448 new_column: C,
2449 ) -> PolarsResult<&mut Self> {
2450 polars_ensure!(
2451 index < self.width(),
2452 ShapeMismatch:
2453 "unable to replace at index {}, the DataFrame has only {} columns",
2454 index, self.width(),
2455 );
2456 let mut new_column = new_column.into_column();
2457 polars_ensure!(
2458 new_column.len() == self.height(),
2459 ShapeMismatch:
2460 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2461 new_column.len(), self.height(),
2462 );
2463 let old_col = &mut self.columns[index];
2464 mem::swap(old_col, &mut new_column);
2465 self.clear_schema();
2466 Ok(self)
2467 }
2468
2469 /// Apply a closure to a column. This is the recommended way to do in place modification.
2470 ///
2471 /// # Example
2472 ///
2473 /// ```rust
2474 /// # use polars_core::prelude::*;
2475 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2476 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2477 /// let mut df = DataFrame::new(vec![s0, s1])?;
2478 ///
2479 /// fn str_to_len(str_val: &Column) -> Column {
2480 /// str_val.str()
2481 /// .unwrap()
2482 /// .into_iter()
2483 /// .map(|opt_name: Option<&str>| {
2484 /// opt_name.map(|name: &str| name.len() as u32)
2485 /// })
2486 /// .collect::<UInt32Chunked>()
2487 /// .into_column()
2488 /// }
2489 ///
2490 /// // Replace the names column by the length of the names.
2491 /// df.apply("names", str_to_len);
2492 /// # Ok::<(), PolarsError>(())
2493 /// ```
2494 /// Results in:
2495 ///
2496 /// ```text
2497 /// +--------+-------+
2498    /// | foo    | names |
2499    /// | ---    | ---   |
2500 /// | str | u32 |
2501 /// +========+=======+
2502 /// | "ham" | 4 |
2503 /// +--------+-------+
2504 /// | "spam" | 6 |
2505 /// +--------+-------+
2506 /// | "egg" | 3 |
2507 /// +--------+-------+
2508 /// ```
2509 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2510 where
2511 F: FnOnce(&Column) -> C,
2512 C: IntoColumn,
2513 {
2514 let idx = self.check_name_to_idx(name)?;
2515 self.apply_at_idx(idx, f)?;
2516 Ok(self)
2517 }
2518
2519 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2520 /// modification.
2521 ///
2522 /// # Example
2523 ///
2524 /// ```rust
2525 /// # use polars_core::prelude::*;
2526 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2527 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2528 /// let mut df = DataFrame::new(vec![s0, s1])?;
2529 ///
2530 /// // Add 32 to get lowercase ascii values
2531 /// df.apply_at_idx(1, |s| s + 32);
2532 /// # Ok::<(), PolarsError>(())
2533 /// ```
2534 /// Results in:
2535 ///
2536 /// ```text
2537 /// +--------+-------+
2538 /// | foo | ascii |
2539 /// | --- | --- |
2540 /// | str | i32 |
2541 /// +========+=======+
2542 /// | "ham" | 102 |
2543 /// +--------+-------+
2544 /// | "spam" | 111 |
2545 /// +--------+-------+
2546 /// | "egg" | 111 |
2547 /// +--------+-------+
2548 /// ```
2549 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2550 where
2551 F: FnOnce(&Column) -> C,
2552 C: IntoColumn,
2553 {
2554 let df_height = self.height();
2555 let width = self.width();
2556 let col = self.columns.get_mut(idx).ok_or_else(|| {
2557 polars_err!(
2558 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2559 idx, width
2560 )
2561 })?;
2562 let name = col.name().clone();
2563 let dtype_before = col.dtype().clone();
2564 let new_col = f(col).into_column();
2565 match new_col.len() {
2566 1 => {
2567 let new_col = new_col.new_from_index(0, df_height);
2568 let _ = mem::replace(col, new_col);
2569 },
2570 len if (len == df_height) => {
2571 let _ = mem::replace(col, new_col);
2572 },
2573 len => polars_bail!(
2574 ShapeMismatch:
2575 "resulting Series has length {} while the DataFrame has height {}",
2576 len, df_height
2577 ),
2578 }
2579
2580 // make sure the name remains the same after applying the closure
2581 unsafe {
2582 let col = self.columns.get_unchecked_mut(idx);
2583 col.rename(name);
2584
2585 if col.dtype() != &dtype_before {
2586 self.clear_schema();
2587 }
2588 }
2589 Ok(self)
2590 }
2591
2592 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2593 /// modification.
2594 ///
2595 /// # Example
2596 ///
2597    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2598 ///
2599 /// ```rust
2600 /// # use polars_core::prelude::*;
2601 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2602 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2603 /// let mut df = DataFrame::new(vec![s0, s1])?;
2604 ///
2605 /// let idx = vec![0, 1, 4];
2606 ///
2607 /// df.try_apply("foo", |c| {
2608 /// c.str()?
2609 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2610 /// });
2611 /// # Ok::<(), PolarsError>(())
2612 /// ```
2613 /// Results in:
2614 ///
2615 /// ```text
2616 /// +---------------------+--------+
2617 /// | foo | values |
2618 /// | --- | --- |
2619 /// | str | i32 |
2620 /// +=====================+========+
2621 /// | "ham-is-modified" | 1 |
2622 /// +---------------------+--------+
2623 /// | "spam-is-modified" | 2 |
2624 /// +---------------------+--------+
2625 /// | "egg" | 3 |
2626 /// +---------------------+--------+
2627 /// | "bacon" | 4 |
2628 /// +---------------------+--------+
2629 /// | "quack-is-modified" | 5 |
2630 /// +---------------------+--------+
2631 /// ```
2632 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2633 where
2634 F: FnOnce(&Column) -> PolarsResult<C>,
2635 C: IntoColumn,
2636 {
2637 let width = self.width();
2638 let col = self.columns.get_mut(idx).ok_or_else(|| {
2639 polars_err!(
2640 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2641 idx, width
2642 )
2643 })?;
2644 let name = col.name().clone();
2645
2646 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2647
2648 // make sure the name remains the same after applying the closure
2649 unsafe {
2650 let col = self.columns.get_unchecked_mut(idx);
2651 col.rename(name);
2652 }
2653 Ok(self)
2654 }
2655
2656 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2657 /// modification.
2658 ///
2659 /// # Example
2660 ///
2661    /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2662 ///
2663 /// ```rust
2664 /// # use polars_core::prelude::*;
2665 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2666 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2667 /// let mut df = DataFrame::new(vec![s0, s1])?;
2668 ///
2669 /// // create a mask
2670 /// let values = df.column("values")?.as_materialized_series();
2671 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2672 ///
2673 /// df.try_apply("foo", |c| {
2674 /// c.str()?
2675 /// .set(&mask, Some("not_within_bounds"))
2676 /// });
2677 /// # Ok::<(), PolarsError>(())
2678 /// ```
2679 /// Results in:
2680 ///
2681 /// ```text
2682 /// +---------------------+--------+
2683 /// | foo | values |
2684 /// | --- | --- |
2685 /// | str | i32 |
2686 /// +=====================+========+
2687 /// | "not_within_bounds" | 1 |
2688 /// +---------------------+--------+
2689 /// | "spam" | 2 |
2690 /// +---------------------+--------+
2691 /// | "egg" | 3 |
2692 /// +---------------------+--------+
2693 /// | "bacon" | 4 |
2694 /// +---------------------+--------+
2695 /// | "not_within_bounds" | 5 |
2696 /// +---------------------+--------+
2697 /// ```
2698 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2699 where
2700 F: FnOnce(&Series) -> PolarsResult<C>,
2701 C: IntoColumn,
2702 {
2703 let idx = self.try_get_column_index(column)?;
2704 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2705 }
2706
2707 /// Slice the [`DataFrame`] along the rows.
2708 ///
2709 /// # Example
2710 ///
2711 /// ```rust
2712 /// # use polars_core::prelude::*;
2713 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2714 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2715 /// let sl: DataFrame = df.slice(2, 3);
2716 ///
2717 /// assert_eq!(sl.shape(), (3, 2));
2718 /// println!("{}", sl);
2719 /// # Ok::<(), PolarsError>(())
2720 /// ```
2721 /// Output:
2722 /// ```text
2723 /// shape: (3, 2)
2724 /// +-------+-------+
2725 /// | Fruit | Color |
2726 /// | --- | --- |
2727 /// | str | str |
2728 /// +=======+=======+
2729 /// | Grape | White |
2730 /// +-------+-------+
2731 /// | Fig | White |
2732 /// +-------+-------+
2733 /// | Fig | Red |
2734 /// +-------+-------+
2735 /// ```
2736 #[must_use]
2737 pub fn slice(&self, offset: i64, length: usize) -> Self {
2738 if offset == 0 && length == self.height() {
2739 return self.clone();
2740 }
2741 if length == 0 {
2742 return self.clear();
2743 }
2744 let col = self
2745 .columns
2746 .iter()
2747 .map(|s| s.slice(offset, length))
2748 .collect::<Vec<_>>();
2749
2750 let height = if let Some(fst) = col.first() {
2751 fst.len()
2752 } else {
2753 let (_, length) = slice_offsets(offset, length, self.height());
2754 length
2755 };
2756
2757 unsafe { DataFrame::new_no_checks(height, col) }
2758 }
2759
2760 /// Split [`DataFrame`] at the given `offset`.
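    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3, 4])?;
    /// let (head, tail) = df.split_at(1);
    /// assert_eq!(head.shape(), (1, 1));
    /// assert_eq!(tail.shape(), (3, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```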
2761 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2762 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2763
2764 let (idx, _) = slice_offsets(offset, 0, self.height());
2765
2766 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2767 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2768 (a, b)
2769 }
2770
2771 #[must_use]
2772 pub fn clear(&self) -> Self {
2773 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2774 unsafe { DataFrame::new_no_checks(0, col) }
2775 }
2776
2777 #[must_use]
2778 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2779 if offset == 0 && length == self.height() {
2780 return self.clone();
2781 }
2782 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2783 unsafe { DataFrame::new_no_checks(length, columns) }
2784 }
2785
2786 #[must_use]
2787 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2788 if offset == 0 && length == self.height() {
2789 return self.clone();
2790 }
2791 // @scalar-opt
2792 let columns = self._apply_columns(&|s| {
2793 let mut out = s.slice(offset, length);
2794 out.shrink_to_fit();
2795 out
2796 });
2797 unsafe { DataFrame::new_no_checks(length, columns) }
2798 }
2799
2800 /// Get the head of the [`DataFrame`].
2801 ///
2802 /// # Example
2803 ///
2804 /// ```rust
2805 /// # use polars_core::prelude::*;
2806 /// let countries: DataFrame =
2807 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2808 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2809 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2810 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2811 /// assert_eq!(countries.shape(), (5, 4));
2812 ///
2813 /// println!("{}", countries.head(Some(3)));
2814 /// # Ok::<(), PolarsError>(())
2815 /// ```
2816 ///
2817 /// Output:
2818 ///
2819 /// ```text
2820 /// shape: (3, 4)
2821 /// +--------------------+---------------+---------------+------------+
2822 /// | Rank by GDP (2021) | Continent | Country | Capital |
2823 /// | --- | --- | --- | --- |
2824 /// | i32 | str | str | str |
2825 /// +====================+===============+===============+============+
2826 /// | 1 | North America | United States | Washington |
2827 /// +--------------------+---------------+---------------+------------+
2828 /// | 2 | Asia | China | Beijing |
2829 /// +--------------------+---------------+---------------+------------+
2830 /// | 3 | Asia | Japan | Tokyo |
2831 /// +--------------------+---------------+---------------+------------+
2832 /// ```
2833 #[must_use]
2834 pub fn head(&self, length: Option<usize>) -> Self {
2835 let col = self
2836 .columns
2837 .iter()
2838 .map(|c| c.head(length))
2839 .collect::<Vec<_>>();
2840
2841 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2842 let height = usize::min(height, self.height());
2843 unsafe { DataFrame::new_no_checks(height, col) }
2844 }
2845
2846 /// Get the tail of the [`DataFrame`].
2847 ///
2848 /// # Example
2849 ///
2850 /// ```rust
2851 /// # use polars_core::prelude::*;
2852 /// let countries: DataFrame =
2853 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2854    ///             "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2855 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2856 /// assert_eq!(countries.shape(), (5, 3));
2857 ///
2858 /// println!("{}", countries.tail(Some(2)));
2859 /// # Ok::<(), PolarsError>(())
2860 /// ```
2861 ///
2862 /// Output:
2863 ///
2864 /// ```text
2865 /// shape: (2, 3)
2866 /// +-------------+--------------------+---------+
2867    /// | Rank (2021) | Apple Price (€/kg) | Country |
2868 /// | --- | --- | --- |
2869 /// | i32 | f64 | str |
2870 /// +=============+====================+=========+
2871    /// | 108         | 0.65               | Syria   |
2872    /// +-------------+--------------------+---------+
2873    /// | 109         | 0.52               | Turkey  |
2874 /// +-------------+--------------------+---------+
2875 /// ```
2876 #[must_use]
2877 pub fn tail(&self, length: Option<usize>) -> Self {
2878 let col = self
2879 .columns
2880 .iter()
2881 .map(|c| c.tail(length))
2882 .collect::<Vec<_>>();
2883
2884 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2885 let height = usize::min(height, self.height());
2886 unsafe { DataFrame::new_no_checks(height, col) }
2887 }
2888
2889 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2890 ///
2891 /// # Panics
2892 ///
2893 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2894 ///
2895 /// This responsibility is left to the caller as we don't want to take mutable references here,
2896 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2897 /// as well.
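    ///
    /// # Example
    ///
    /// A minimal sketch that sums the lengths of the yielded record batches:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3])?;
    /// let n_rows: usize = df
    ///     .iter_chunks(CompatLevel::newest(), false)
    ///     .map(|rb| rb.len())
    ///     .sum();
    /// assert_eq!(n_rows, 3);
    /// # Ok::<(), PolarsError>(())
    /// ```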
2898 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2899 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2900        // If any of the columns is a binview (String/Binary) and the requested `compat_level`
2901        // forces a conversion, we allow parallelism, as we must allocate new arrow strings/binaries.
2902 let must_convert = compat_level.0 == 0;
2903 let parallel = parallel
2904 && must_convert
2905 && self.columns.len() > 1
2906 && self
2907 .columns
2908 .iter()
2909 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2910
2911 RecordBatchIter {
2912 columns: &self.columns,
2913 schema: Arc::new(
2914 self.columns
2915 .iter()
2916 .map(|c| c.field().to_arrow(compat_level))
2917 .collect(),
2918 ),
2919 idx: 0,
2920 n_chunks: self.first_col_n_chunks(),
2921 compat_level,
2922 parallel,
2923 }
2924 }
2925
2926 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
2927 ///
2928 /// # Panics
2929 ///
2930 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2931 ///
2932 /// This responsibility is left to the caller as we don't want to take mutable references here,
2933 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2934 /// as well.
2935 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2936 debug_assert!(!self.should_rechunk());
2937 PhysRecordBatchIter {
2938 schema: Arc::new(
2939 self.get_columns()
2940 .iter()
2941 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2942 .collect(),
2943 ),
2944 arr_iters: self
2945 .materialized_column_iter()
2946 .map(|s| s.chunks().iter())
2947 .collect(),
2948 }
2949 }
2950
2951    /// Get a [`DataFrame`] with all columns reversed, i.e. with the rows in reversed order.
2952 #[must_use]
2953 pub fn reverse(&self) -> Self {
2954 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2955 unsafe { DataFrame::new_no_checks(self.height(), col) }
2956 }
2957
2958 /// Shift the values by a given period and fill the parts that will be empty due to this operation
2959    /// with `None` values.
2960 ///
2961 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
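    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("x" => [1, 2, 3])?;
    /// let shifted = df.shift(1);
    /// // The first row is now filled with a null value.
    /// assert_eq!(shifted.column("x")?.null_count(), 1);
    /// # Ok::<(), PolarsError>(())
    /// ```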
2962 #[must_use]
2963 pub fn shift(&self, periods: i64) -> Self {
2964 let col = self._apply_columns_par(&|s| s.shift(periods));
2965 unsafe { DataFrame::new_no_checks(self.height(), col) }
2966 }
2967
2968 /// Replace None values with one of the following strategies:
2969 /// * Forward fill (replace None with the previous value)
2970 /// * Backward fill (replace None with the next value)
2971 /// * Mean fill (replace None with the mean of the whole array)
2972 /// * Min fill (replace None with the minimum of the whole array)
2973 /// * Max fill (replace None with the maximum of the whole array)
2974 ///
2975 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
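    ///
    /// # Example
    ///
    /// A minimal sketch in the style of the other examples; the concrete strategy is left to the caller:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame, strategy: FillNullStrategy) -> PolarsResult<DataFrame> {
    ///     df.fill_null(strategy)
    /// }
    /// ```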
2976 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2977 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2978
2979 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2980 }
2981
2982    /// Pipe different functions/closures that work on a DataFrame together.
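    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: DataFrame) -> PolarsResult<usize> {
    ///     // Thread the frame through a closure and return its height.
    ///     df.pipe(|frame| Ok(frame.height()))
    /// }
    /// ```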
2983 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2984 where
2985 F: Fn(DataFrame) -> PolarsResult<B>,
2986 {
2987 f(self)
2988 }
2989
2990    /// Pipe different functions/closures that work on a DataFrame together.
2991 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2992 where
2993 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2994 {
2995 f(self)
2996 }
2997
2998    /// Pipe different functions/closures that work on a DataFrame together, passing additional arguments.
2999 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3000 where
3001 F: Fn(DataFrame, Args) -> PolarsResult<B>,
3002 {
3003 f(self, args)
3004 }
3005
3006 /// Drop duplicate rows from a [`DataFrame`].
3007    /// *This fails when there is a column of type List in the DataFrame.*
3008 ///
3009 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3010 ///
3011 /// # Example
3012 ///
3013 /// ```no_run
3014 /// # use polars_core::prelude::*;
3015 /// let df = df! {
3016 /// "flt" => [1., 1., 2., 2., 3., 3.],
3017 /// "int" => [1, 1, 2, 2, 3, 3, ],
3018 /// "str" => ["a", "a", "b", "b", "c", "c"]
3019 /// }?;
3020 ///
3021 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3022 /// # Ok::<(), PolarsError>(())
3023 /// ```
3024 /// Returns
3025 ///
3026 /// ```text
3027 /// +-----+-----+-----+
3028 /// | flt | int | str |
3029 /// | --- | --- | --- |
3030 /// | f64 | i32 | str |
3031 /// +=====+=====+=====+
3032 /// | 1 | 1 | "a" |
3033 /// +-----+-----+-----+
3034 /// | 2 | 2 | "b" |
3035 /// +-----+-----+-----+
3036 /// | 3 | 3 | "c" |
3037 /// +-----+-----+-----+
3038 /// ```
3039 #[cfg(feature = "algorithm_group_by")]
3040 pub fn unique_stable(
3041 &self,
3042 subset: Option<&[String]>,
3043 keep: UniqueKeepStrategy,
3044 slice: Option<(i64, usize)>,
3045 ) -> PolarsResult<DataFrame> {
3046 self.unique_impl(
3047 true,
3048 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3049 keep,
3050 slice,
3051 )
3052 }
3053
3054 /// Unstable distinct. See [`DataFrame::unique_stable`].
3055 #[cfg(feature = "algorithm_group_by")]
3056 pub fn unique<I, S>(
3057 &self,
3058 subset: Option<&[String]>,
3059 keep: UniqueKeepStrategy,
3060 slice: Option<(i64, usize)>,
3061 ) -> PolarsResult<DataFrame> {
3062 self.unique_impl(
3063 false,
3064 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3065 keep,
3066 slice,
3067 )
3068 }
3069
3070 #[cfg(feature = "algorithm_group_by")]
3071 pub fn unique_impl(
3072 &self,
3073 maintain_order: bool,
3074 subset: Option<Vec<PlSmallStr>>,
3075 keep: UniqueKeepStrategy,
3076 slice: Option<(i64, usize)>,
3077 ) -> PolarsResult<Self> {
3078 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3079 let mut df = self.clone();
3080        // `take` on multiple chunks is expensive, so rechunk first.
3081 df.as_single_chunk_par();
3082
3083 let columns = match (keep, maintain_order) {
3084 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3085 let gb = df.group_by_stable(names)?;
3086 let groups = gb.get_groups();
3087 let (offset, len) = slice.unwrap_or((0, groups.len()));
3088 let groups = groups.slice(offset, len);
3089 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3090 },
3091 (UniqueKeepStrategy::Last, true) => {
3092                // We maintain order by the last occurrence, so the stable groups cannot be used
3093                // directly, as they are keyed by the first occurrence of each group.
3094 let gb = df.group_by_stable(names)?;
3095 let groups = gb.get_groups();
3096
3097 let last_idx: NoNull<IdxCa> = groups
3098 .iter()
3099 .map(|g| match g {
3100 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3101 GroupsIndicator::Slice([first, len]) => first + len - 1,
3102 })
3103 .collect();
3104
3105 let mut last_idx = last_idx.into_inner().sort(false);
3106
3107 if let Some((offset, len)) = slice {
3108 last_idx = last_idx.slice(offset, len);
3109 }
3110
3111 let last_idx = NoNull::new(last_idx);
3112 let out = unsafe { df.take_unchecked(&last_idx) };
3113 return Ok(out);
3114 },
3115 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3116 let gb = df.group_by(names)?;
3117 let groups = gb.get_groups();
3118 let (offset, len) = slice.unwrap_or((0, groups.len()));
3119 let groups = groups.slice(offset, len);
3120 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3121 },
3122 (UniqueKeepStrategy::Last, false) => {
3123 let gb = df.group_by(names)?;
3124 let groups = gb.get_groups();
3125 let (offset, len) = slice.unwrap_or((0, groups.len()));
3126 let groups = groups.slice(offset, len);
3127 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3128 },
3129 (UniqueKeepStrategy::None, _) => {
3130 let df_part = df.select(names)?;
3131 let mask = df_part.is_unique()?;
3132 let mut filtered = df.filter(&mask)?;
3133
3134 if let Some((offset, len)) = slice {
3135 filtered = filtered.slice(offset, len);
3136 }
3137 return Ok(filtered);
3138 },
3139 };
3140 let height = Self::infer_height(&columns);
3141 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3142 }
3143
3144 /// Get a mask of all the unique rows in the [`DataFrame`].
3145 ///
3146 /// # Example
3147 ///
3148 /// ```no_run
3149 /// # use polars_core::prelude::*;
3150 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3151 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3152 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3153 ///
3154 /// assert!(ca.all());
3155 /// # Ok::<(), PolarsError>(())
3156 /// ```
3157 #[cfg(feature = "algorithm_group_by")]
3158 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3159 let gb = self.group_by(self.get_column_names_owned())?;
3160 let groups = gb.get_groups();
3161 Ok(is_unique_helper(
3162 groups,
3163 self.height() as IdxSize,
3164 true,
3165 false,
3166 ))
3167 }
3168
3169 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3170 ///
3171 /// # Example
3172 ///
3173 /// ```no_run
3174 /// # use polars_core::prelude::*;
3175 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3176 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3177 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3178 ///
3179 /// assert!(!ca.all());
3180 /// # Ok::<(), PolarsError>(())
3181 /// ```
3182 #[cfg(feature = "algorithm_group_by")]
3183 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3184 let gb = self.group_by(self.get_column_names_owned())?;
3185 let groups = gb.get_groups();
3186 Ok(is_unique_helper(
3187 groups,
3188 self.height() as IdxSize,
3189 false,
3190 true,
3191 ))
3192 }
3193
3194 /// Create a new [`DataFrame`] that shows the null counts per column.
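    ///
    /// # Example
    ///
    /// A minimal sketch (the column names are illustrative):
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
    /// // One row containing the null count of each column (both zero here).
    /// assert_eq!(df.null_count().shape(), (1, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```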
3195 #[must_use]
3196 pub fn null_count(&self) -> Self {
3197 let cols = self
3198 .columns
3199 .iter()
3200 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3201 .collect();
3202 unsafe { Self::new_no_checks(1, cols) }
3203 }
3204
3205 /// Hash and combine the row values
3206 #[cfg(feature = "row_hash")]
3207 pub fn hash_rows(
3208 &mut self,
3209 hasher_builder: Option<PlSeedableRandomStateQuality>,
3210 ) -> PolarsResult<UInt64Chunked> {
3211 let dfs = split_df(self, POOL.current_num_threads(), false);
3212 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3213
3214 let mut iter = cas.into_iter();
3215 let mut acc_ca = iter.next().unwrap();
3216 for ca in iter {
3217 acc_ca.append(&ca)?;
3218 }
3219 Ok(acc_ca.rechunk().into_owned())
3220 }
3221
3222 /// Get the supertype of the columns in this DataFrame
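    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df: DataFrame = df!("ints" => [1, 2], "floats" => [1.0, 2.0])?;
    /// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```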
3223 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3224 self.columns
3225 .iter()
3226 .map(|s| Ok(s.dtype().clone()))
3227 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3228 }
3229
3230 /// Take by index values given by the slice `idx`.
3231 /// # Warning
3232    /// Be careful with allowing threads when calling this in a large hot loop:
3233    /// every thread split may land on the rayon stack and lead to a stack overflow.
3234 #[doc(hidden)]
3235 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3236 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3237 }
3238
3239 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3240    /// if the index values in `idx` are sorted. This will maintain the sorted flags.
3241 ///
3242 /// # Warning
3243    /// Be careful with allowing threads when calling this in a large hot loop:
3244    /// every thread split may land on the rayon stack and lead to a stack overflow.
3245 #[doc(hidden)]
3246 pub unsafe fn _take_unchecked_slice_sorted(
3247 &self,
3248 idx: &[IdxSize],
3249 allow_threads: bool,
3250 sorted: IsSorted,
3251 ) -> Self {
3252 #[cfg(debug_assertions)]
3253 {
3254 if idx.len() > 2 {
3255 match sorted {
3256 IsSorted::Ascending => {
3257 assert!(idx[0] <= idx[idx.len() - 1]);
3258 },
3259 IsSorted::Descending => {
3260 assert!(idx[0] >= idx[idx.len() - 1]);
3261 },
3262 _ => {},
3263 }
3264 }
3265 }
3266 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3267 ca.set_sorted_flag(sorted);
3268 self.take_unchecked_impl(&ca, allow_threads)
3269 }
3270
3271 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3272 #[doc(hidden)]
3273 pub fn _partition_by_impl(
3274 &self,
3275 cols: &[PlSmallStr],
3276 stable: bool,
3277 include_key: bool,
3278 parallel: bool,
3279 ) -> PolarsResult<Vec<DataFrame>> {
3280 let selected_keys = self.select_columns(cols.iter().cloned())?;
3281 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3282 let groups = groups.take_groups();
3283
3284 // drop key columns prior to calculation if requested
3285 let df = if include_key {
3286 self.clone()
3287 } else {
3288 self.drop_many(cols.iter().cloned())
3289 };
3290
3291 if parallel {
3292 // don't parallelize this
3293 // there is a lot of parallelization in take and this may easily SO
3294 POOL.install(|| {
3295 match groups.as_ref() {
3296 GroupsType::Idx(idx) => {
3297 // Rechunk as the gather may rechunk for every group #17562.
3298 let mut df = df.clone();
3299 df.as_single_chunk_par();
3300 Ok(idx
3301 .into_par_iter()
3302 .map(|(_, group)| {
3303 // groups are in bounds
3304 unsafe {
3305 df._take_unchecked_slice_sorted(
3306 group,
3307 false,
3308 IsSorted::Ascending,
3309 )
3310 }
3311 })
3312 .collect())
3313 },
3314 GroupsType::Slice { groups, .. } => Ok(groups
3315 .into_par_iter()
3316 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3317 .collect()),
3318 }
3319 })
3320 } else {
3321 match groups.as_ref() {
3322 GroupsType::Idx(idx) => {
3323 // Rechunk as the gather may rechunk for every group #17562.
3324 let mut df = df;
3325 df.as_single_chunk();
3326 Ok(idx
3327 .into_iter()
3328 .map(|(_, group)| {
3329 // groups are in bounds
3330 unsafe {
3331 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3332 }
3333 })
3334 .collect())
3335 },
3336 GroupsType::Slice { groups, .. } => Ok(groups
3337 .iter()
3338 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3339 .collect()),
3340 }
3341 }
3342 }
3343
3344 /// Split into multiple DataFrames partitioned by groups
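    ///
    /// # Example
    ///
    /// A minimal sketch in the style of the other examples (the key column name is illustrative):
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<Vec<DataFrame>> {
    ///     // One frame per distinct value in "group", keeping the key column.
    ///     df.partition_by(["group"], true)
    /// }
    /// ```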
3345 #[cfg(feature = "partition_by")]
3346 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3347 where
3348 I: IntoIterator<Item = S>,
3349 S: Into<PlSmallStr>,
3350 {
3351 let cols = cols
3352 .into_iter()
3353 .map(Into::into)
3354 .collect::<Vec<PlSmallStr>>();
3355 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3356 }
3357
3358 /// Split into multiple DataFrames partitioned by groups
3359    /// The order of the groups is maintained.
3360 #[cfg(feature = "partition_by")]
3361 pub fn partition_by_stable<I, S>(
3362 &self,
3363 cols: I,
3364 include_key: bool,
3365 ) -> PolarsResult<Vec<DataFrame>>
3366 where
3367 I: IntoIterator<Item = S>,
3368 S: Into<PlSmallStr>,
3369 {
3370 let cols = cols
3371 .into_iter()
3372 .map(Into::into)
3373 .collect::<Vec<PlSmallStr>>();
3374 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3375 }
3376
3377 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3378 /// inserted as columns.
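    ///
    /// # Example
    ///
    /// A minimal sketch (the struct column name and separator are illustrative):
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     // Expand the fields of the struct column "meta" into "meta.<field>" columns.
    ///     df.unnest(["meta"], Some("."))
    /// }
    /// ```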
3379 #[cfg(feature = "dtype-struct")]
3380 pub fn unnest<I: IntoVec<PlSmallStr>>(
3381 &self,
3382 cols: I,
3383 separator: Option<&str>,
3384 ) -> PolarsResult<DataFrame> {
3385 let cols = cols.into_vec();
3386 self.unnest_impl(cols.into_iter().collect(), separator)
3387 }
3388
3389 #[cfg(feature = "dtype-struct")]
3390 fn unnest_impl(
3391 &self,
3392 cols: PlHashSet<PlSmallStr>,
3393 separator: Option<&str>,
3394 ) -> PolarsResult<DataFrame> {
3395 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3396 let mut count = 0;
3397 for s in &self.columns {
3398 if cols.contains(s.name()) {
3399 let ca = s.struct_()?.clone();
3400 new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3401 if let Some(separator) = &separator {
3402 f.rename(polars_utils::format_pl_smallstr!(
3403 "{}{}{}",
3404 s.name(),
3405 separator,
3406 f.name()
3407 ));
3408 }
3409 Column::from(f)
3410 }));
3411 count += 1;
3412 } else {
3413 new_cols.push(s.clone())
3414 }
3415 }
3416 if count != cols.len() {
3417 // one or more columns not found
3418 // the code below will return an error with the missing name
3419 let schema = self.schema();
3420 for col in cols {
3421 let _ = schema
3422 .get(col.as_str())
3423 .ok_or_else(|| polars_err!(col_not_found = col))?;
3424 }
3425 }
3426 DataFrame::new(new_cols)
3427 }
3428
3429 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3430 cols.first().map_or(0, Column::len)
3431 }
3432
3433 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3434        // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3435        // `append_chunk` or something like it. It is just quite difficult to make that safe.
3436 let df = DataFrame::from(rb);
3437 polars_ensure!(
3438 self.schema() == df.schema(),
3439 SchemaMismatch: "cannot append record batch with different schema\n\n
3440 Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3441 );
3442 self.vstack_mut_owned_unchecked(df);
3443 Ok(())
3444 }
3445}
3446
3447pub struct RecordBatchIter<'a> {
3448 columns: &'a Vec<Column>,
3449 schema: ArrowSchemaRef,
3450 idx: usize,
3451 n_chunks: usize,
3452 compat_level: CompatLevel,
3453 parallel: bool,
3454}
3455
3456impl Iterator for RecordBatchIter<'_> {
3457 type Item = RecordBatch;
3458
3459 fn next(&mut self) -> Option<Self::Item> {
3460 if self.idx >= self.n_chunks {
3461 return None;
3462 }
3463
3464 // Create a batch of the columns with the same chunk no.
3465 let batch_cols: Vec<ArrayRef> = if self.parallel {
3466 let iter = self
3467 .columns
3468 .par_iter()
3469 .map(Column::as_materialized_series)
3470 .map(|s| s.to_arrow(self.idx, self.compat_level));
3471 POOL.install(|| iter.collect())
3472 } else {
3473 self.columns
3474 .iter()
3475 .map(Column::as_materialized_series)
3476 .map(|s| s.to_arrow(self.idx, self.compat_level))
3477 .collect()
3478 };
3479 self.idx += 1;
3480
3481 let length = batch_cols.first().map_or(0, |arr| arr.len());
3482 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3483 }
3484
3485 fn size_hint(&self) -> (usize, Option<usize>) {
3486 let n = self.n_chunks - self.idx;
3487 (n, Some(n))
3488 }
3489}
3490
3491pub struct PhysRecordBatchIter<'a> {
3492 schema: ArrowSchemaRef,
3493 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3494}
3495
3496impl Iterator for PhysRecordBatchIter<'_> {
3497 type Item = RecordBatch;
3498
3499 fn next(&mut self) -> Option<Self::Item> {
3500 let arrs = self
3501 .arr_iters
3502 .iter_mut()
3503 .map(|phys_iter| phys_iter.next().cloned())
3504 .collect::<Option<Vec<_>>>()?;
3505
3506 let length = arrs.first().map_or(0, |arr| arr.len());
3507 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3508 }
3509
3510 fn size_hint(&self) -> (usize, Option<usize>) {
3511 if let Some(iter) = self.arr_iters.first() {
3512 iter.size_hint()
3513 } else {
3514 (0, None)
3515 }
3516 }
3517}
3518
3519impl Default for DataFrame {
3520 fn default() -> Self {
3521 DataFrame::empty()
3522 }
3523}
3524
3525impl From<DataFrame> for Vec<Column> {
3526 fn from(df: DataFrame) -> Self {
3527 df.columns
3528 }
3529}
3530
3531// utility to test if we can vstack/extend the columns
3532fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3533 polars_ensure!(
3534 left.name() == right.name(),
3535 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3536 left.name(), right.name(),
3537 );
3538 Ok(())
3539}
3540
3541#[cfg(test)]
3542mod test {
3543 use super::*;
3544
3545 fn create_frame() -> DataFrame {
3546 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3547 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3548 DataFrame::new(vec![s0, s1]).unwrap()
3549 }
3550
3551 #[test]
3552 #[cfg_attr(miri, ignore)]
3553 fn test_recordbatch_iterator() {
3554 let df = df!(
3555 "foo" => [1, 2, 3, 4, 5]
3556 )
3557 .unwrap();
3558 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3559 assert_eq!(5, iter.next().unwrap().len());
3560 assert!(iter.next().is_none());
3561 }
3562
3563 #[test]
3564 #[cfg_attr(miri, ignore)]
3565 fn test_select() {
3566 let df = create_frame();
3567 assert_eq!(
3568 df.column("days")
3569 .unwrap()
3570 .as_series()
3571 .unwrap()
3572 .equal(1)
3573 .unwrap()
3574 .sum(),
3575 Some(1)
3576 );
3577 }
3578
3579 #[test]
3580 #[cfg_attr(miri, ignore)]
3581 fn test_filter_broadcast_on_string_col() {
3582 let col_name = "some_col";
3583 let v = vec!["test".to_string()];
3584 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3585 let mut df = DataFrame::new(vec![s0]).unwrap();
3586
3587 df = df
3588 .filter(
3589 &df.column(col_name)
3590 .unwrap()
3591 .as_materialized_series()
3592 .equal("")
3593 .unwrap(),
3594 )
3595 .unwrap();
3596 assert_eq!(
3597 df.column(col_name)
3598 .unwrap()
3599 .as_materialized_series()
3600 .n_chunks(),
3601 1
3602 );
3603 }
3604
3605 #[test]
3606 #[cfg_attr(miri, ignore)]
3607 fn test_filter_broadcast_on_list_col() {
3608 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3609 let ll: ListChunked = [&s1].iter().copied().collect();
3610
3611 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3612 let new = ll.filter(&mask).unwrap();
3613
3614 assert_eq!(new.chunks.len(), 1);
3615 assert_eq!(new.len(), 0);
3616 }
3617
3618 #[test]
3619 fn slice() {
3620 let df = create_frame();
3621 let sliced_df = df.slice(0, 2);
3622 assert_eq!(sliced_df.shape(), (2, 2));
3623 }
3624
3625 #[test]
3626 fn rechunk_false() {
3627 let df = create_frame();
3628 assert!(!df.should_rechunk())
3629 }
3630
3631 #[test]
3632 fn rechunk_true() -> PolarsResult<()> {
3633 let mut base = df!(
3634 "a" => [1, 2, 3],
3635 "b" => [1, 2, 3]
3636 )?;
3637
3638 // Create a series with multiple chunks
3639 let mut s = Series::new("foo".into(), 0..2);
3640 let s2 = Series::new("bar".into(), 0..1);
3641 s.append(&s2)?;
3642
3643 // Append series to frame
3644 let out = base.with_column(s)?;
3645
3646 // Now we should rechunk
3647 assert!(out.should_rechunk());
3648 Ok(())
3649 }
3650
3651 #[test]
3652 fn test_duplicate_column() {
3653 let mut df = df! {
3654 "foo" => [1, 2, 3]
3655 }
3656 .unwrap();
3657 // check if column is replaced
3658 assert!(
3659 df.with_column(Series::new("foo".into(), &[1, 2, 3]))
3660 .is_ok()
3661 );
3662 assert!(
3663 df.with_column(Series::new("bar".into(), &[1, 2, 3]))
3664 .is_ok()
3665 );
3666 assert!(df.column("bar").is_ok())
3667 }
3668
3669 #[test]
3670 #[cfg_attr(miri, ignore)]
3671 fn distinct() {
3672 let df = df! {
3673 "flt" => [1., 1., 2., 2., 3., 3.],
3674 "int" => [1, 1, 2, 2, 3, 3, ],
3675 "str" => ["a", "a", "b", "b", "c", "c"]
3676 }
3677 .unwrap();
3678 let df = df
3679 .unique_stable(None, UniqueKeepStrategy::First, None)
3680 .unwrap()
3681 .sort(["flt"], SortMultipleOptions::default())
3682 .unwrap();
3683 let valid = df! {
3684 "flt" => [1., 2., 3.],
3685 "int" => [1, 2, 3],
3686 "str" => ["a", "b", "c"]
3687 }
3688 .unwrap();
3689 assert!(df.equals(&valid));
3690 }
3691
3692 #[test]
3693 fn test_vstack() {
3694 // check that it does not accidentally rechunk
3695 let mut df = df! {
3696 "flt" => [1., 1., 2., 2., 3., 3.],
3697 "int" => [1, 1, 2, 2, 3, 3, ],
3698 "str" => ["a", "a", "b", "b", "c", "c"]
3699 }
3700 .unwrap();
3701
3702 df.vstack_mut(&df.slice(0, 3)).unwrap();
3703 assert_eq!(df.first_col_n_chunks(), 2)
3704 }
3705
3706 #[test]
3707 fn test_vstack_on_empty_dataframe() {
3708 let mut df = DataFrame::empty();
3709
3710 let df_data = df! {
3711 "flt" => [1., 1., 2., 2., 3., 3.],
3712 "int" => [1, 1, 2, 2, 3, 3, ],
3713 "str" => ["a", "a", "b", "b", "c", "c"]
3714 }
3715 .unwrap();
3716
3717 df.vstack_mut(&df_data).unwrap();
3718 assert_eq!(df.height, 6)
3719 }
3720
3721 #[test]
3722 fn test_replace_or_add() -> PolarsResult<()> {
3723 let mut df = df!(
3724 "a" => [1, 2, 3],
3725 "b" => [1, 2, 3]
3726 )?;
3727
3728 // check that the new column is "c" and not "bar".
3729 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3730
3731 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3732 Ok(())
3733 }
3734
3735 #[test]
3736 fn test_unique_keep_none_with_slice() {
3737 let df = df! {
3738 "x" => [1, 2, 3, 2, 1]
3739 }
3740 .unwrap();
3741 let out = df
3742 .unique_stable(
3743 Some(&["x".to_string()][..]),
3744 UniqueKeepStrategy::None,
3745 Some((0, 2)),
3746 )
3747 .unwrap();
3748 let expected = df! {
3749 "x" => [3]
3750 }
3751 .unwrap();
3752 assert!(out.equals(&expected));
3753 }
3754
3755 #[test]
3756 #[cfg(feature = "dtype-i8")]
3757 fn test_apply_result_schema() {
3758 let mut df = df! {
3759 "x" => [1, 2, 3, 2, 1]
3760 }
3761 .unwrap();
3762
3763 let schema_before = df.schema().clone();
3764 df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
3765 assert_ne!(&schema_before, df.schema());
3766 }
3767}