polars_core/frame/chunks.rs

1use arrow::record_batch::RecordBatch;
2use rayon::prelude::*;
3
4use crate::POOL;
5use crate::prelude::*;
6use crate::utils::{_split_offsets, accumulate_dataframes_vertical_unchecked, split_df_as_ref};
7
8impl From<RecordBatch> for DataFrame {
9    fn from(rb: RecordBatch) -> DataFrame {
10        let height = rb.height();
11        let (schema, arrays) = rb.into_schema_and_arrays();
12
13        let columns: Vec<Column> = arrays
14            .into_iter()
15            .zip(schema.iter())
16            .map(|(arr, (name, field))| {
17                // SAFETY: Record Batch has the invariant that the schema datatype matches the
18                // columns.
19                unsafe {
20                    Series::_try_from_arrow_unchecked_with_md(
21                        name.clone(),
22                        vec![arr],
23                        field.dtype(),
24                        field.metadata.as_deref(),
25                    )
26                }
27                .unwrap()
28                .into_column()
29            })
30            .collect();
31
32        // SAFETY: RecordBatch has the same invariants for names and heights as DataFrame.
33        unsafe { DataFrame::new_no_checks(height, columns) }
34    }
35}
36
37impl DataFrame {
38    pub fn split_chunks(&mut self) -> impl Iterator<Item = DataFrame> + '_ {
39        self.align_chunks_par();
40
41        let first_series_col_idx = self
42            .columns
43            .iter()
44            .position(|col| col.as_series().is_some());
45        let df_height = self.height();
46        let mut prev_height = 0;
47        (0..self.first_col_n_chunks()).map(move |i| unsafe {
48            // There might still be scalar/partitioned columns after aligning,
49            // so we follow the size of the chunked column, if any.
50            let chunk_size = first_series_col_idx
51                .map(|c| self.get_columns()[c].as_series().unwrap().chunks()[i].len())
52                .unwrap_or(df_height);
53            let columns = self
54                .get_columns()
55                .iter()
56                .map(|col| match col {
57                    Column::Series(s) => Column::from(s.select_chunk(i)),
58                    Column::Scalar(_) => col.slice(prev_height as i64, chunk_size),
59                })
60                .collect::<Vec<_>>();
61
62            prev_height += chunk_size;
63
64            DataFrame::new_no_checks(chunk_size, columns)
65        })
66    }
67
68    pub fn split_chunks_by_n(self, n: usize, parallel: bool) -> Vec<DataFrame> {
69        let split = _split_offsets(self.height(), n);
70
71        let split_fn = |(offset, len)| self.slice(offset as i64, len);
72
73        if parallel {
74            // Parallel so that null_counts run in parallel
75            POOL.install(|| split.into_par_iter().map(split_fn).collect())
76        } else {
77            split.into_iter().map(split_fn).collect()
78        }
79    }
80}
81
82/// Split DataFrame into chunks in preparation for writing. The chunks have a
83/// maximum number of rows per chunk to ensure reasonable memory efficiency when
84/// reading the resulting file, and a minimum size per chunk to ensure
85/// reasonable performance when writing.
86pub fn chunk_df_for_writing(
87    df: &mut DataFrame,
88    row_group_size: usize,
89) -> PolarsResult<std::borrow::Cow<'_, DataFrame>> {
90    // ensures all chunks are aligned.
91    df.align_chunks_par();
92
93    // Accumulate many small chunks to the row group size.
94    // See: #16403
95    if !df.get_columns().is_empty()
96        && df.get_columns()[0]
97            .as_materialized_series()
98            .chunk_lengths()
99            .take(5)
100            .all(|len| len < row_group_size)
101    {
102        fn finish(scratch: &mut Vec<DataFrame>, new_chunks: &mut Vec<DataFrame>) {
103            let mut new = accumulate_dataframes_vertical_unchecked(scratch.drain(..));
104            new.as_single_chunk_par();
105            new_chunks.push(new);
106        }
107
108        let mut new_chunks = Vec::with_capacity(df.first_col_n_chunks()); // upper limit;
109        let mut scratch = vec![];
110        let mut remaining = row_group_size;
111
112        for df in df.split_chunks() {
113            remaining = remaining.saturating_sub(df.height());
114            scratch.push(df);
115
116            if remaining == 0 {
117                remaining = row_group_size;
118                finish(&mut scratch, &mut new_chunks);
119            }
120        }
121        if !scratch.is_empty() {
122            finish(&mut scratch, &mut new_chunks);
123        }
124        return Ok(std::borrow::Cow::Owned(
125            accumulate_dataframes_vertical_unchecked(new_chunks),
126        ));
127    }
128
129    let n_splits = df.height() / row_group_size;
130    let result = if n_splits > 0 {
131        let mut splits = split_df_as_ref(df, n_splits, false);
132
133        for df in splits.iter_mut() {
134            // If the chunks are small enough, writing many small chunks
135            // leads to slow writing performance, so in that case we
136            // merge them.
137            let n_chunks = df.first_col_n_chunks();
138            if n_chunks > 1 && (df.estimated_size() / n_chunks < 128 * 1024) {
139                df.as_single_chunk_par();
140            }
141        }
142
143        std::borrow::Cow::Owned(accumulate_dataframes_vertical_unchecked(splits))
144    } else {
145        std::borrow::Cow::Borrowed(df)
146    };
147    Ok(result)
148}