Skip to main content

polars_core/frame/
dataframe.rs

1use std::sync::{Arc, OnceLock};
2
3use polars_error::PolarsResult;
4
5use super::broadcast::{broadcast_columns, infer_broadcast_height};
6use super::validation::validate_columns_slice;
7use crate::frame::column::Column;
8use crate::schema::{Schema, SchemaRef};
9
10/// A contiguous growable collection of [`Column`]s that have the same length.
11///
12/// ## Use declarations
13///
14/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
15///
16/// ```rust
17/// use polars_core::prelude::*; // if the crate polars-core is used directly
18/// // use polars::prelude::*;      if the crate polars is used
19/// ```
20///
21/// # Initialization
22/// ## Default
23///
24/// A `DataFrame` can be initialized empty:
25///
26/// ```rust
27/// # use polars_core::prelude::*;
28/// let df = DataFrame::empty();
29/// assert_eq!(df.shape(), (0, 0));
30/// ```
31///
32/// ## Constructing from a `Vec<Column>`
33///
34/// A `DataFrame` is backed by a `Vec<Column>` where the `Column`s have the same length.
35/// ```rust
36/// # use polars_core::prelude::*;
37/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
38/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
39///
40/// let df: PolarsResult<DataFrame> = DataFrame::new_infer_height(vec![s1, s2]);
41/// ```
42///
43/// ## Using a macro
44///
45/// The [`df!`] macro is a convenient method:
46///
47/// ```rust
48/// # use polars_core::prelude::*;
49/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
50///                                       "Color" => ["Red", "Yellow", "Green"]);
51/// ```
52///
53/// ## Using a CSV file
54///
55/// See the `polars_io::csv::CsvReader`.
56///
57/// # Indexing
58/// ## By a number
59///
60/// The `Index<usize>` is implemented for the `DataFrame`.
61///
62/// ```rust
63/// # use polars_core::prelude::*;
64/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
65///              "Color" => ["Red", "Yellow", "Green"])?;
66///
67/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
68/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
69/// # Ok::<(), PolarsError>(())
70/// ```
71///
72/// ## By a `Series` name
73///
74/// ```rust
75/// # use polars_core::prelude::*;
76/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
77///              "Color" => ["Red", "Yellow", "Green"])?;
78///
79/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
80/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
81/// # Ok::<(), PolarsError>(())
82/// ```
83#[derive(Clone)]
84pub struct DataFrame {
85    height: usize,
86    /// All columns must have length equal to `self.height`.
87    columns: Vec<Column>,
88    /// Cached schema. Must be cleared if column names / dtypes in `self.columns` change.
89    cached_schema: OnceLock<SchemaRef>,
90}
91
92impl Default for DataFrame {
93    fn default() -> Self {
94        DataFrame::empty()
95    }
96}
97
98impl DataFrame {
99    /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
100    ///
101    /// # Example
102    ///
103    /// ```rust
104    /// use polars_core::prelude::DataFrame;
105    /// static EMPTY: DataFrame = DataFrame::empty();
106    /// ```
107    pub const fn empty() -> Self {
108        DataFrame::empty_with_height(0)
109    }
110
111    pub const fn empty_with_height(height: usize) -> Self {
112        DataFrame {
113            height,
114            columns: vec![],
115            cached_schema: OnceLock::new(),
116        }
117    }
118
119    pub fn new(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
120        validate_columns_slice(height, &columns)
121            .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
122
123        Ok(unsafe { DataFrame::_new_unchecked_impl(height, columns) })
124    }
125
126    /// Height is sourced from first column.
127    pub fn new_infer_height(columns: Vec<Column>) -> PolarsResult<Self> {
128        DataFrame::new(columns.first().map_or(0, |c| c.len()), columns)
129    }
130
131    /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
132    /// [`Column`]s.
133    ///
134    /// # Safety
135    /// [`Column`]s must have unique names and matching lengths.
136    pub unsafe fn new_unchecked(height: usize, columns: Vec<Column>) -> DataFrame {
137        if cfg!(debug_assertions) {
138            validate_columns_slice(height, &columns).unwrap();
139        }
140
141        unsafe { DataFrame::_new_unchecked_impl(height, columns) }
142    }
143
144    /// Height is sourced from first column. Does not check for matching height / duplicate names.
145    ///
146    /// # Safety
147    /// [`Column`]s must have unique names and matching lengths.
148    pub unsafe fn new_unchecked_infer_height(columns: Vec<Column>) -> DataFrame {
149        DataFrame::new_unchecked(columns.first().map_or(0, |c| c.len()), columns)
150    }
151
152    /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
153    /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
154    /// constructed with this method is generally highly unsafe and should not be long-lived.
155    #[expect(clippy::missing_safety_doc)]
156    pub const unsafe fn _new_unchecked_impl(height: usize, columns: Vec<Column>) -> DataFrame {
157        DataFrame {
158            height,
159            columns,
160            cached_schema: OnceLock::new(),
161        }
162    }
163
164    /// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
165    /// length and not equal to `self.height()`.
166    pub fn new_with_broadcast(height: usize, mut columns: Vec<Column>) -> PolarsResult<Self> {
167        broadcast_columns(height, &mut columns)?;
168        DataFrame::new(height, columns)
169    }
170
171    /// Infers height as the first non-unit length column or 1 if not found.
172    pub fn new_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
173        DataFrame::new_with_broadcast(infer_broadcast_height(&columns), columns)
174    }
175
176    /// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
177    /// length and not equal to `self.height()`.
178    ///
179    /// # Safety
180    /// [`Column`]s must have unique names.
181    pub unsafe fn new_unchecked_with_broadcast(
182        height: usize,
183        mut columns: Vec<Column>,
184    ) -> PolarsResult<Self> {
185        broadcast_columns(height, &mut columns)?;
186        Ok(unsafe { DataFrame::new_unchecked(height, columns) })
187    }
188
189    /// # Safety
190    /// [`Column`]s must have unique names.
191    pub unsafe fn new_unchecked_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
192        DataFrame::new_unchecked_with_broadcast(infer_broadcast_height(&columns), columns)
193    }
194
195    /// Create a `DataFrame` 0 height and columns as per the `schema`.
196    pub fn empty_with_schema(schema: &Schema) -> Self {
197        let cols = schema
198            .iter()
199            .map(|(name, dtype)| Column::new_empty(name.clone(), dtype))
200            .collect();
201
202        unsafe { DataFrame::_new_unchecked_impl(0, cols) }
203    }
204
205    /// Create an empty `DataFrame` with empty columns as per the `schema`.
206    pub fn empty_with_arc_schema(schema: SchemaRef) -> Self {
207        let mut df = DataFrame::empty_with_schema(&schema);
208        unsafe { df.set_schema(schema) };
209        df
210    }
211
212    /// Set the height (i.e. number of rows) of this [`DataFrame`].
213    ///
214    /// # Safety
215    ///
216    /// This needs to be equal to the length of all the columns, or `self.width()` must be 0.
217    #[inline]
218    pub unsafe fn set_height(&mut self, height: usize) -> &mut Self {
219        self.height = height;
220        self
221    }
222
223    /// Get the height of the [`DataFrame`] which is the number of rows.
224    #[inline]
225    pub fn height(&self) -> usize {
226        self.height
227    }
228
229    /// Get the number of columns in this [`DataFrame`].
230    #[inline]
231    pub fn width(&self) -> usize {
232        self.columns.len()
233    }
234
235    /// Get (height, width) of the [`DataFrame`].
236    ///
237    /// # Example
238    ///
239    /// ```rust
240    /// # use polars_core::prelude::*;
241    /// let df0: DataFrame = DataFrame::empty();
242    /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
243    /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
244    ///                          "2" => [1, 2, 3, 4, 5])?;
245    ///
246    /// assert_eq!(df0.shape(), (0 ,0));
247    /// assert_eq!(df1.shape(), (5, 1));
248    /// assert_eq!(df2.shape(), (5, 2));
249    /// # Ok::<(), PolarsError>(())
250    /// ```
251    #[inline]
252    pub fn shape(&self) -> (usize, usize) {
253        (self.height(), self.width())
254    }
255
256    /// 0 width or height.
257    #[inline]
258    pub fn shape_has_zero(&self) -> bool {
259        matches!(self.shape(), (0, _) | (_, 0))
260    }
261
262    #[inline]
263    pub fn columns(&self) -> &[Column] {
264        self.columns.as_slice()
265    }
266
267    #[inline]
268    pub fn into_columns(self) -> Vec<Column> {
269        self.columns
270    }
271
272    /// # Safety
273    ///
274    /// The caller must ensure the length of all [`Column`]s remains equal to `self.height`, or
275    /// that [`DataFrame::set_height`] is called afterwards with the new `height`.
276    #[inline]
277    pub unsafe fn columns_mut(&mut self) -> &mut Vec<Column> {
278        self.clear_schema();
279        &mut self.columns
280    }
281
282    /// # Safety
283    /// Adheres to all safety requirements of [`DataFrame::columns_mut`], and that the list of column
284    /// names remains unchanged.
285    #[inline]
286    pub unsafe fn columns_mut_retain_schema(&mut self) -> &mut Vec<Column> {
287        &mut self.columns
288    }
289
290    /// Get the schema of this [`DataFrame`].
291    ///
292    /// # Panics
293    /// Panics if there are duplicate column names.
294    pub fn schema(&self) -> &SchemaRef {
295        let out = self.cached_schema.get_or_init(|| {
296            Arc::new(
297                Schema::from_iter_check_duplicates(
298                    self.columns
299                        .iter()
300                        .map(|x| (x.name().clone(), x.dtype().clone())),
301                )
302                .unwrap(),
303            )
304        });
305
306        assert_eq!(out.len(), self.width());
307
308        out
309    }
310
311    #[inline]
312    pub fn cached_schema(&self) -> Option<&SchemaRef> {
313        self.cached_schema.get()
314    }
315
316    /// Set the cached schema
317    ///
318    /// # Safety
319    /// Schema must match the columns in `self`.
320    #[inline]
321    pub unsafe fn set_schema(&mut self, schema: SchemaRef) -> &mut Self {
322        self.cached_schema = schema.into();
323        self
324    }
325
326    /// Set the cached schema
327    ///
328    /// # Safety
329    /// Schema must match the columns in `self`.
330    #[inline]
331    pub unsafe fn with_schema(mut self, schema: SchemaRef) -> Self {
332        self.cached_schema = schema.into();
333        self
334    }
335
336    /// Set the cached schema if `schema` is `Some()`.
337    ///
338    /// # Safety
339    /// Schema must match the columns in `self`.
340    #[inline]
341    pub unsafe fn set_opt_schema(&mut self, schema: Option<SchemaRef>) -> &mut Self {
342        if let Some(schema) = schema {
343            unsafe { self.set_schema(schema) };
344        }
345
346        self
347    }
348
349    /// Clones the cached schema from `from` to `self.cached_schema` if there is one.
350    ///
351    /// # Safety
352    /// Schema must match the columns in `self`.
353    #[inline]
354    pub unsafe fn set_schema_from(&mut self, from: &DataFrame) -> &mut Self {
355        self.set_opt_schema(from.cached_schema().cloned());
356        self
357    }
358
359    /// Clones the cached schema from `from` to `self.cached_schema` if there is one.
360    ///
361    /// # Safety
362    /// Schema must match the columns in `self`.
363    #[inline]
364    pub unsafe fn with_schema_from(mut self, from: &DataFrame) -> Self {
365        self.set_opt_schema(from.cached_schema().cloned());
366        self
367    }
368
369    #[inline]
370    fn clear_schema(&mut self) -> &mut Self {
371        self.cached_schema = OnceLock::new();
372        self
373    }
374}