polars_core/frame/
dataframe.rs

1use std::sync::{Arc, OnceLock};
2
3use polars_error::PolarsResult;
4
5use super::broadcast::{broadcast_columns, infer_broadcast_height};
6use super::validation::validate_columns_slice;
7use crate::frame::column::Column;
8use crate::schema::{Schema, SchemaRef};
9
/// A contiguous growable collection of [`Column`]s that have the same length.
///
/// ## Use declarations
///
/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
///
/// ```rust
/// use polars_core::prelude::*; // if the crate polars-core is used directly
/// // use polars::prelude::*;      if the crate polars is used
/// ```
///
/// # Initialization
/// ## Default
///
/// A `DataFrame` can be initialized empty:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = DataFrame::empty();
/// assert_eq!(df.shape(), (0, 0));
/// ```
///
/// ## Wrapping a `Vec<Column>`
///
/// A `DataFrame` is built upon a `Vec<Column>` where the [`Column`]s have the same length.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
///
/// let df: PolarsResult<DataFrame> = DataFrame::new_infer_height(vec![s1, s2]);
/// ```
///
/// ## Using a macro
///
/// The [`df!`] macro is a convenient method:
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
///                                       "Color" => ["Red", "Yellow", "Green"]);
/// ```
///
/// ## Using a CSV file
///
/// See `polars_io::csv::CsvReader`.
///
/// # Indexing
/// ## By a number
///
/// `Index<usize>` is implemented for `DataFrame`; the index selects a column by position.
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
///
/// ## By a column name
///
/// ```rust
/// # use polars_core::prelude::*;
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
///              "Color" => ["Red", "Yellow", "Green"])?;
///
/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
/// # Ok::<(), PolarsError>(())
/// ```
#[derive(Clone)]
pub struct DataFrame {
    /// Number of rows. Stored separately from the columns, so it is also defined for a frame
    /// that has no columns at all.
    height: usize,
    /// All columns must have length equal to `self.height`.
    columns: Vec<Column>,
    /// Cached schema, filled lazily on first request. Must be cleared if column names / dtypes
    /// in `self.columns` change.
    cached_schema: OnceLock<SchemaRef>,
}
92
93impl Default for DataFrame {
94    fn default() -> Self {
95        DataFrame::empty()
96    }
97}
98
99impl DataFrame {
100    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
101    ///
102    /// # Example
103    ///
104    /// ```rust
105    /// use polars_core::prelude::DataFrame;
106    /// static EMPTY: DataFrame = DataFrame::empty();
107    /// ```
108    pub const fn empty() -> Self {
109        DataFrame::empty_with_height(0)
110    }
111
112    pub const fn empty_with_height(height: usize) -> Self {
113        DataFrame {
114            height,
115            columns: vec![],
116            cached_schema: OnceLock::new(),
117        }
118    }
119
120    pub fn new(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
121        validate_columns_slice(height, &columns)
122            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
123
124        Ok(unsafe { DataFrame::_new_unchecked_impl(height, columns) })
125    }
126
127    /// Height is sourced from first column.
128    pub fn new_infer_height(columns: Vec<Column>) -> PolarsResult<Self> {
129        DataFrame::new(columns.first().map_or(0, |c| c.len()), columns)
130    }
131
132    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
133    /// [`Column`]s.
134    ///
135    /// # Safety
136    /// [`Column`]s must have unique names and matching lengths.
137    pub unsafe fn new_unchecked(height: usize, columns: Vec<Column>) -> DataFrame {
138        if cfg!(debug_assertions) {
139            validate_columns_slice(height, &columns).unwrap();
140        }
141
142        unsafe { DataFrame::_new_unchecked_impl(height, columns) }
143    }
144
145    /// Height is sourced from first column. Does not check for matching height / duplicate names.
146    ///
147    /// # Safety
148    /// [`Column`]s must have unique names and matching lengths.
149    pub unsafe fn new_unchecked_infer_height(columns: Vec<Column>) -> DataFrame {
150        DataFrame::new_unchecked(columns.first().map_or(0, |c| c.len()), columns)
151    }
152
153    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
154    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
155    /// constructed with this method is generally highly unsafe and should not be long-lived.
156    #[expect(clippy::missing_safety_doc)]
157    pub const unsafe fn _new_unchecked_impl(height: usize, columns: Vec<Column>) -> DataFrame {
158        DataFrame {
159            height,
160            columns,
161            cached_schema: OnceLock::new(),
162        }
163    }
164
165    /// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
166    /// length and not equal to `self.height()`.
167    pub fn new_with_broadcast(height: usize, mut columns: Vec<Column>) -> PolarsResult<Self> {
168        broadcast_columns(height, &mut columns)?;
169        DataFrame::new(height, columns)
170    }
171
172    /// Infers height as the first non-unit length column or 1 if not found.
173    pub fn new_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
174        DataFrame::new_with_broadcast(infer_broadcast_height(&columns), columns)
175    }
176
177    /// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
178    /// length and not equal to `self.height()`.
179    ///
180    /// # Safety
181    /// [`Column`]s must have unique names.
182    pub unsafe fn new_unchecked_with_broadcast(
183        height: usize,
184        mut columns: Vec<Column>,
185    ) -> PolarsResult<Self> {
186        broadcast_columns(height, &mut columns)?;
187        Ok(unsafe { DataFrame::new_unchecked(height, columns) })
188    }
189
190    /// # Safety
191    /// [`Column`]s must have unique names.
192    pub unsafe fn new_unchecked_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
193        DataFrame::new_unchecked_with_broadcast(infer_broadcast_height(&columns), columns)
194    }
195
196    /// Create a `DataFrame` 0 height and columns as per the `schema`.
197    pub fn empty_with_schema(schema: &Schema) -> Self {
198        let cols = schema
199            .iter()
200            .map(|(name, dtype)| Column::new_empty(name.clone(), dtype))
201            .collect();
202
203        unsafe { DataFrame::_new_unchecked_impl(0, cols) }
204    }
205
206    /// Create an empty `DataFrame` with empty columns as per the `schema`.
207    pub fn empty_with_arc_schema(schema: SchemaRef) -> Self {
208        let mut df = DataFrame::empty_with_schema(&schema);
209        unsafe { df.set_schema(schema) };
210        df
211    }
212
213    /// Set the height (i.e. number of rows) of this [`DataFrame`].
214    ///
215    /// # Safety
216    ///
217    /// This needs to be equal to the length of all the columns, or `self.width()` must be 0.
218    #[inline]
219    pub unsafe fn set_height(&mut self, height: usize) -> &mut Self {
220        self.height = height;
221        self
222    }
223
224    /// Get the height of the [`DataFrame`] which is the number of rows.
225    #[inline]
226    pub fn height(&self) -> usize {
227        self.height
228    }
229
230    /// Get the number of columns in this [`DataFrame`].
231    #[inline]
232    pub fn width(&self) -> usize {
233        self.columns.len()
234    }
235
236    /// Get (height, width) of the [`DataFrame`].
237    ///
238    /// # Example
239    ///
240    /// ```rust
241    /// # use polars_core::prelude::*;
242    /// let df0: DataFrame = DataFrame::empty();
243    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
244    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
245    ///                          "2" => [1, 2, 3, 4, 5])?;
246    ///
247    /// assert_eq!(df0.shape(), (0 ,0));
248    /// assert_eq!(df1.shape(), (5, 1));
249    /// assert_eq!(df2.shape(), (5, 2));
250    /// # Ok::<(), PolarsError>(())
251    /// ```
252    #[inline]
253    pub fn shape(&self) -> (usize, usize) {
254        (self.height(), self.width())
255    }
256
257    /// 0 width or height.
258    #[inline]
259    pub fn shape_has_zero(&self) -> bool {
260        matches!(self.shape(), (0, _) | (_, 0))
261    }
262
263    #[inline]
264    pub fn columns(&self) -> &[Column] {
265        self.columns.as_slice()
266    }
267
268    #[inline]
269    pub fn into_columns(self) -> Vec<Column> {
270        self.columns
271    }
272
273    /// # Safety
274    ///
275    /// The caller must ensure the length of all [`Column`]s remains equal to `self.height`, or
276    /// that [`DataFrame::set_height`] is called afterwards with the new `height`.
277    #[inline]
278    pub unsafe fn columns_mut(&mut self) -> &mut Vec<Column> {
279        self.clear_schema();
280        &mut self.columns
281    }
282
283    /// # Safety
284    /// Adheres to all safety requirements of [`DataFrame::columns_mut`], and that the list of column
285    /// names remains unchanged.
286    #[inline]
287    pub unsafe fn columns_mut_retain_schema(&mut self) -> &mut Vec<Column> {
288        &mut self.columns
289    }
290
291    /// Get the schema of this [`DataFrame`].
292    ///
293    /// # Panics
294    /// Panics if there are duplicate column names.
295    pub fn schema(&self) -> &SchemaRef {
296        let out = self.cached_schema.get_or_init(|| {
297            Arc::new(
298                Schema::from_iter_check_duplicates(
299                    self.columns
300                        .iter()
301                        .map(|x| (x.name().clone(), x.dtype().clone())),
302                )
303                .unwrap(),
304            )
305        });
306
307        assert_eq!(out.len(), self.width());
308
309        out
310    }
311
312    #[inline]
313    pub fn cached_schema(&self) -> Option<&SchemaRef> {
314        self.cached_schema.get()
315    }
316
317    /// Set the cached schema
318    ///
319    /// # Safety
320    /// Schema must match the columns in `self`.
321    #[inline]
322    pub unsafe fn set_schema(&mut self, schema: SchemaRef) -> &mut Self {
323        self.cached_schema = schema.into();
324        self
325    }
326
327    /// Set the cached schema
328    ///
329    /// # Safety
330    /// Schema must match the columns in `self`.
331    #[inline]
332    pub unsafe fn with_schema(mut self, schema: SchemaRef) -> Self {
333        self.cached_schema = schema.into();
334        self
335    }
336
337    /// Set the cached schema if `schema` is `Some()`.
338    ///
339    /// # Safety
340    /// Schema must match the columns in `self`.
341    #[inline]
342    pub unsafe fn set_opt_schema(&mut self, schema: Option<SchemaRef>) -> &mut Self {
343        if let Some(schema) = schema {
344            unsafe { self.set_schema(schema) };
345        }
346
347        self
348    }
349
350    /// Clones the cached schema from `from` to `self.cached_schema` if there is one.
351    ///
352    /// # Safety
353    /// Schema must match the columns in `self`.
354    #[inline]
355    pub unsafe fn set_schema_from(&mut self, from: &DataFrame) -> &mut Self {
356        self.set_opt_schema(from.cached_schema().cloned());
357        self
358    }
359
360    /// Clones the cached schema from `from` to `self.cached_schema` if there is one.
361    ///
362    /// # Safety
363    /// Schema must match the columns in `self`.
364    #[inline]
365    pub unsafe fn with_schema_from(mut self, from: &DataFrame) -> Self {
366        self.set_opt_schema(from.cached_schema().cloned());
367        self
368    }
369
370    #[inline]
371    fn clear_schema(&mut self) -> &mut Self {
372        self.cached_schema = OnceLock::new();
373        self
374    }
375}