polars_io/parquet/read/options.rs

1use polars_core::schema::SchemaRef;
2#[cfg(feature = "serde")]
3use serde::{Deserialize, Serialize};
4
/// Per-scan options controlling how a Parquet file is read.
///
/// Defaults (see the `Default` impl below): no schema override, automatic
/// parallelism, low-memory mode off, statistics enabled.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct ParquetOptions {
    /// Optional schema to use for the scan; `None` by default.
    // NOTE(review): presumably overrides/pins the inferred file schema — confirm at call sites.
    pub schema: Option<SchemaRef>,
    /// Which unit of work to parallelize over; see [`ParallelStrategy`].
    pub parallel: ParallelStrategy,
    /// Trade performance for lower memory usage.
    // NOTE(review): exact memory/speed trade-off depends on the reader implementation — confirm.
    pub low_memory: bool,
    /// Make use of Parquet statistics during the scan (defaults to `true`).
    // NOTE(review): presumably enables statistics-based row-group skipping — confirm.
    pub use_statistics: bool,
}
14
15impl Default for ParquetOptions {
16    fn default() -> Self {
17        Self {
18            schema: None,
19            parallel: ParallelStrategy::default(),
20            low_memory: false,
21            use_statistics: true,
22        }
23    }
24}
25
/// The unit of work over which a Parquet scan is parallelized.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Default, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum ParallelStrategy {
    /// Don't parallelize
    None,
    /// Parallelize over the columns
    Columns,
    /// Parallelize over the row groups
    RowGroups,
    /// First evaluates the pushed-down predicates in parallel and determines a mask of which rows
    /// to read. Then, it parallelizes over both the columns and the row groups while filtering out
    /// rows that do not need to be read. This can provide significant speedups for large files
    /// (i.e. many row-groups) with a predicate that filters clustered rows or filters heavily. In
    /// other cases, this may slow down the scan compared to other strategies.
    ///
    /// If no predicate is given, this falls back to [`ParallelStrategy::Auto`].
    Prefiltered,
    /// Automatically determine over which unit to parallelize
    /// This will choose the most occurring unit.
    #[default]
    Auto,
}