Struct polars_lazy::frame::LazyFrame
source · pub struct LazyFrame {
pub logical_plan: DslPlan,
/* private fields */
}
Expand description
Lazy abstraction over an eager DataFrame
.
It really is an abstraction over a logical plan. The methods of this struct will incrementally
modify a logical plan until output is requested (via collect
).
Fields§
§logical_plan: DslPlan
Implementations§
source§impl LazyFrame
impl LazyFrame
pub fn scan_from_python_function( schema: Schema, scan_fn: PyObject, pyarrow: bool, ) -> Self
python
only.source§impl LazyFrame
impl LazyFrame
pub fn set_cached_arena(&self, lp_arena: Arena<IR>, expr_arena: Arena<AExpr>)
pub fn schema_with_arenas( &mut self, lp_arena: &mut Arena<IR>, expr_arena: &mut Arena<AExpr>, ) -> PolarsResult<SchemaRef>
sourcepub fn collect_schema(&mut self) -> PolarsResult<SchemaRef>
pub fn collect_schema(&mut self) -> PolarsResult<SchemaRef>
Get a handle to the schema — a map from column names to data types — of the current
LazyFrame
computation.
Returns an Err
if the logical plan has already encountered an error (i.e., if
self.collect()
would fail), Ok
otherwise.
source§impl LazyFrame
impl LazyFrame
pub fn collect_concurrently(self) -> PolarsResult<InProcessQuery>
source§impl LazyFrame
impl LazyFrame
sourcepub fn get_current_optimizations(&self) -> OptFlags
pub fn get_current_optimizations(&self) -> OptFlags
Get current optimizations.
sourcepub fn with_optimizations(self, opt_state: OptFlags) -> Self
pub fn with_optimizations(self, opt_state: OptFlags) -> Self
Set allowed optimizations.
sourcepub fn without_optimizations(self) -> Self
pub fn without_optimizations(self) -> Self
Turn off all optimizations.
sourcepub fn with_projection_pushdown(self, toggle: bool) -> Self
pub fn with_projection_pushdown(self, toggle: bool) -> Self
Toggle projection pushdown optimization.
sourcepub fn with_cluster_with_columns(self, toggle: bool) -> Self
pub fn with_cluster_with_columns(self, toggle: bool) -> Self
Toggle cluster with columns optimization.
sourcepub fn with_predicate_pushdown(self, toggle: bool) -> Self
pub fn with_predicate_pushdown(self, toggle: bool) -> Self
Toggle predicate pushdown optimization.
sourcepub fn with_type_coercion(self, toggle: bool) -> Self
pub fn with_type_coercion(self, toggle: bool) -> Self
Toggle type coercion optimization.
sourcepub fn with_simplify_expr(self, toggle: bool) -> Self
pub fn with_simplify_expr(self, toggle: bool) -> Self
Toggle expression simplification optimization on or off.
sourcepub fn with_comm_subplan_elim(self, toggle: bool) -> Self
Available on crate feature cse
only.
pub fn with_comm_subplan_elim(self, toggle: bool) -> Self
cse
only.Toggle common subplan elimination optimization on or off
sourcepub fn with_comm_subexpr_elim(self, toggle: bool) -> Self
Available on crate feature cse
only.
pub fn with_comm_subexpr_elim(self, toggle: bool) -> Self
cse
only.Toggle common subexpression elimination optimization on or off
sourcepub fn with_slice_pushdown(self, toggle: bool) -> Self
pub fn with_slice_pushdown(self, toggle: bool) -> Self
Toggle slice pushdown optimization.
sourcepub fn with_streaming(self, toggle: bool) -> Self
Available on crate feature streaming
only.
pub fn with_streaming(self, toggle: bool) -> Self
streaming
only.Run nodes that are capable of doing so on the streaming engine.
pub fn with_new_streaming(self, toggle: bool) -> Self
new_streaming
only.sourcepub fn with_row_estimate(self, toggle: bool) -> Self
pub fn with_row_estimate(self, toggle: bool) -> Self
Try to estimate the number of rows so that joins can determine which side to keep in memory.
sourcepub fn _with_eager(self, toggle: bool) -> Self
pub fn _with_eager(self, toggle: bool) -> Self
Run every node eagerly. This turns off multi-node optimizations.
sourcepub fn describe_plan(&self) -> PolarsResult<String>
pub fn describe_plan(&self) -> PolarsResult<String>
Return a String describing the naive (un-optimized) logical plan.
sourcepub fn describe_plan_tree(&self) -> PolarsResult<String>
pub fn describe_plan_tree(&self) -> PolarsResult<String>
Return a String describing the naive (un-optimized) logical plan in tree format.
sourcepub fn describe_optimized_plan(&self) -> PolarsResult<String>
pub fn describe_optimized_plan(&self) -> PolarsResult<String>
Return a String describing the optimized logical plan.
Returns Err
if optimizing the logical plan fails.
sourcepub fn describe_optimized_plan_tree(&self) -> PolarsResult<String>
pub fn describe_optimized_plan_tree(&self) -> PolarsResult<String>
Return a String describing the optimized logical plan in tree format.
Returns Err
if optimizing the logical plan fails.
sourcepub fn explain(&self, optimized: bool) -> PolarsResult<String>
pub fn explain(&self, optimized: bool) -> PolarsResult<String>
Return a String describing the logical plan.
If optimized
is true
, explains the optimized plan. If optimized
is false,
explains the naive, un-optimized plan.
sourcepub fn sort(
self,
by: impl IntoVec<PlSmallStr>,
sort_options: SortMultipleOptions,
) -> Self
pub fn sort( self, by: impl IntoVec<PlSmallStr>, sort_options: SortMultipleOptions, ) -> Self
Add a sort operation to the logical plan.
Sorts the LazyFrame by the column name specified using the provided options.
§Example
Sort DataFrame by ‘sepal_width’ column:
fn sort_by_a(df: DataFrame) -> LazyFrame {
df.lazy().sort(["sepal_width"], Default::default())
}
Sort by a single column with specific order:
fn sort_with_specific_order(df: DataFrame, descending: bool) -> LazyFrame {
df.lazy().sort(
["sepal_width"],
SortMultipleOptions::new()
.with_order_descending(descending)
)
}
Sort by multiple columns, specifying the order for each column:
fn sort_by_multiple_columns_with_specific_order(df: DataFrame) -> LazyFrame {
df.lazy().sort(
["sepal_width", "sepal_length"],
SortMultipleOptions::new()
.with_order_descending_multi([false, true])
)
}
See SortMultipleOptions
for more options.
sourcepub fn sort_by_exprs<E: AsRef<[Expr]>>(
self,
by_exprs: E,
sort_options: SortMultipleOptions,
) -> Self
pub fn sort_by_exprs<E: AsRef<[Expr]>>( self, by_exprs: E, sort_options: SortMultipleOptions, ) -> Self
Add a sort operation to the logical plan.
Sorts the LazyFrame by the provided list of expressions, which will be turned into concrete columns before sorting.
See SortMultipleOptions
for more options.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
/// Sort DataFrame by 'sepal_width' column
fn example(df: DataFrame) -> LazyFrame {
df.lazy()
.sort_by_exprs(vec![col("sepal_width")], Default::default())
}
pub fn top_k<E: AsRef<[Expr]>>( self, k: IdxSize, by_exprs: E, sort_options: SortMultipleOptions, ) -> Self
pub fn bottom_k<E: AsRef<[Expr]>>( self, k: IdxSize, by_exprs: E, sort_options: SortMultipleOptions, ) -> Self
sourcepub fn reverse(self) -> Self
pub fn reverse(self) -> Self
Reverse the DataFrame
from top to bottom.
Row i
becomes row number_of_rows - i - 1
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn example(df: DataFrame) -> LazyFrame {
df.lazy()
.reverse()
}
sourcepub fn rename<I, J, T, S>(self, existing: I, new: J) -> Self
pub fn rename<I, J, T, S>(self, existing: I, new: J) -> Self
Rename columns in the DataFrame.
existing
and new
are iterables of the same length containing the old and
corresponding new column names. Renaming happens to all existing
columns
simultaneously, not iteratively. (In particular, all columns in existing
must
already exist in the LazyFrame
when rename
is called.)
sourcepub fn drop<I, T>(self, columns: I) -> Self
pub fn drop<I, T>(self, columns: I) -> Self
Removes columns from the DataFrame. Note that it’s better to only select the columns you need and let the projection pushdown optimize away the unneeded columns.
Any given columns that are not in the schema will give a PolarsError::ColumnNotFound
error while materializing the LazyFrame
.
sourcepub fn drop_no_validate<I, T>(self, columns: I) -> Self
pub fn drop_no_validate<I, T>(self, columns: I) -> Self
Removes columns from the DataFrame. Note that it’s better to only select the columns you need and let the projection pushdown optimize away the unneeded columns.
If a column name does not exist in the schema, it will quietly be ignored.
sourcepub fn shift<E: Into<Expr>>(self, n: E) -> Self
pub fn shift<E: Into<Expr>>(self, n: E) -> Self
Shift the values by a given period and fill the parts that will be empty due to this operation
with Nones
.
See the method on Series for more info on the shift
operation.
sourcepub fn shift_and_fill<E: Into<Expr>, IE: Into<Expr>>(
self,
n: E,
fill_value: IE,
) -> Self
pub fn shift_and_fill<E: Into<Expr>, IE: Into<Expr>>( self, n: E, fill_value: IE, ) -> Self
Shift the values by a given period and fill the parts that will be empty due to this operation
with the result of the fill_value
expression.
See the method on Series for more info on the shift
operation.
sourcepub fn fill_null<E: Into<Expr>>(self, fill_value: E) -> LazyFrame
pub fn fill_null<E: Into<Expr>>(self, fill_value: E) -> LazyFrame
Fill None values in the DataFrame with an expression.
sourcepub fn fill_nan<E: Into<Expr>>(self, fill_value: E) -> LazyFrame
pub fn fill_nan<E: Into<Expr>>(self, fill_value: E) -> LazyFrame
Fill NaN values in the DataFrame with an expression.
sourcepub fn cache(self) -> Self
pub fn cache(self) -> Self
Caches the result into a new LazyFrame.
This should be used to prevent computations running multiple times.
sourcepub fn cast(self, dtypes: PlHashMap<&str, DataType>, strict: bool) -> Self
pub fn cast(self, dtypes: PlHashMap<&str, DataType>, strict: bool) -> Self
Cast named frame columns, resulting in a new LazyFrame with updated dtypes
sourcepub fn cast_all(self, dtype: DataType, strict: bool) -> Self
pub fn cast_all(self, dtype: DataType, strict: bool) -> Self
Cast all frame columns to the given dtype, resulting in a new LazyFrame
sourcepub fn fetch(self, n_rows: usize) -> PolarsResult<DataFrame>
pub fn fetch(self, n_rows: usize) -> PolarsResult<DataFrame>
Fetch is like a collect operation, but it overwrites the number of rows read by every scan operation. This is a utility that helps debug a query on a smaller number of rows.
Note that the fetch does not guarantee the final number of rows in the DataFrame. Filter, join operations and a lower number of rows available in the scanned file influence the final number of rows.
pub fn optimize( self, lp_arena: &mut Arena<IR>, expr_arena: &mut Arena<AExpr>, ) -> PolarsResult<Node>
pub fn to_alp_optimized(self) -> PolarsResult<IRPlan>
pub fn to_alp(self) -> PolarsResult<IRPlan>
pub fn _collect_post_opt<P>(self, post_opt: P) -> PolarsResult<DataFrame>
sourcepub fn collect(self) -> PolarsResult<DataFrame>
pub fn collect(self) -> PolarsResult<DataFrame>
Execute all the lazy operations and collect them into a DataFrame
.
The query is optimized prior to execution.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn example(df: DataFrame) -> PolarsResult<DataFrame> {
df.lazy()
.group_by([col("foo")])
.agg([col("bar").sum(), col("ham").mean().alias("avg_ham")])
.collect()
}
sourcepub fn profile(self) -> PolarsResult<(DataFrame, DataFrame)>
pub fn profile(self) -> PolarsResult<(DataFrame, DataFrame)>
Profile a LazyFrame.
This will run the query and return a tuple containing the materialized DataFrame and a DataFrame that contains profiling information of each node that is executed.
The units of the timings are microseconds.
sourcepub fn sink_parquet(
self,
path: impl AsRef<Path>,
options: ParquetWriteOptions,
) -> PolarsResult<()>
Available on crate feature parquet
only.
pub fn sink_parquet( self, path: impl AsRef<Path>, options: ParquetWriteOptions, ) -> PolarsResult<()>
parquet
only.Stream a query result into a parquet file. This is useful if the final result doesn’t fit into memory. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn sink_parquet_cloud(
self,
uri: String,
cloud_options: Option<CloudOptions>,
parquet_options: ParquetWriteOptions,
) -> PolarsResult<()>
Available on crate features cloud_write
and parquet
only.
pub fn sink_parquet_cloud( self, uri: String, cloud_options: Option<CloudOptions>, parquet_options: ParquetWriteOptions, ) -> PolarsResult<()>
cloud_write
and parquet
only.Stream a query result into a parquet file on an ObjectStore-compatible cloud service. This is useful if the final result doesn’t fit into memory, and where you do not want to write to a local file but to a location in the cloud. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn sink_ipc(
self,
path: impl AsRef<Path>,
options: IpcWriterOptions,
) -> PolarsResult<()>
Available on crate feature ipc
only.
pub fn sink_ipc( self, path: impl AsRef<Path>, options: IpcWriterOptions, ) -> PolarsResult<()>
ipc
only.Stream a query result into an ipc/arrow file. This is useful if the final result doesn’t fit into memory. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn sink_ipc_cloud(
self,
uri: String,
cloud_options: Option<CloudOptions>,
ipc_options: IpcWriterOptions,
) -> PolarsResult<()>
Available on crate features cloud_write
and ipc
only.
pub fn sink_ipc_cloud( self, uri: String, cloud_options: Option<CloudOptions>, ipc_options: IpcWriterOptions, ) -> PolarsResult<()>
cloud_write
and ipc
only.Stream a query result into an ipc/arrow file on an ObjectStore-compatible cloud service. This is useful if the final result doesn’t fit into memory, and where you do not want to write to a local file but to a location in the cloud. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn sink_csv(
self,
path: impl AsRef<Path>,
options: CsvWriterOptions,
) -> PolarsResult<()>
Available on crate feature csv
only.
pub fn sink_csv( self, path: impl AsRef<Path>, options: CsvWriterOptions, ) -> PolarsResult<()>
csv
only.Stream a query result into a CSV file. This is useful if the final result doesn’t fit into memory. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn sink_json(
self,
path: impl AsRef<Path>,
options: JsonWriterOptions,
) -> PolarsResult<()>
Available on crate feature json
only.
pub fn sink_json( self, path: impl AsRef<Path>, options: JsonWriterOptions, ) -> PolarsResult<()>
json
only.Stream a query result into a json file. This is useful if the final result doesn’t fit into memory. This method will return an error if the query cannot be completely done in a streaming fashion.
sourcepub fn filter(self, predicate: Expr) -> Self
pub fn filter(self, predicate: Expr) -> Self
Filter by some predicate expression.
The expression must yield boolean values.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn example(df: DataFrame) -> LazyFrame {
df.lazy()
.filter(col("sepal_width").is_not_null())
.select([col("sepal_width"), col("sepal_length")])
}
sourcepub fn select<E: AsRef<[Expr]>>(self, exprs: E) -> Self
pub fn select<E: AsRef<[Expr]>>(self, exprs: E) -> Self
Select (and optionally rename, with alias
) columns from the query.
Columns can be selected with col
;
If you want to select all columns use col(PlSmallStr::from_static("*"))
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
/// This function selects column "foo" and column "bar".
/// Column "bar" is renamed to "ham".
fn example(df: DataFrame) -> LazyFrame {
df.lazy()
.select([col("foo"),
col("bar").alias("ham")])
}
/// This function selects all columns except "foo"
fn exclude_a_column(df: DataFrame) -> LazyFrame {
df.lazy()
.select([col(PlSmallStr::from_static("*")).exclude(["foo"])])
}
pub fn select_seq<E: AsRef<[Expr]>>(self, exprs: E) -> Self
sourcepub fn group_by<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
self,
by: E,
) -> LazyGroupBy
pub fn group_by<E: AsRef<[IE]>, IE: Into<Expr> + Clone>( self, by: E, ) -> LazyGroupBy
Performs a “group-by” on a LazyFrame
, producing a LazyGroupBy
, which can subsequently be aggregated.
Takes a list of expressions to group on.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
use arrow::legacy::prelude::QuantileInterpolOptions;
fn example(df: DataFrame) -> LazyFrame {
df.lazy()
.group_by([col("date")])
.agg([
col("rain").min().alias("min_rain"),
col("rain").sum().alias("sum_rain"),
col("rain").quantile(lit(0.5), QuantileInterpolOptions::Nearest).alias("median_rain"),
])
}
sourcepub fn rolling<E: AsRef<[Expr]>>(
self,
index_column: Expr,
group_by: E,
options: RollingGroupOptions,
) -> LazyGroupBy
Available on crate feature dynamic_group_by
only.
pub fn rolling<E: AsRef<[Expr]>>( self, index_column: Expr, group_by: E, options: RollingGroupOptions, ) -> LazyGroupBy
dynamic_group_by
only.Create rolling groups based on a time column.
Also works for index values of type UInt32, UInt64, Int32, or Int64.
Unlike group_by_dynamic
, the windows are now determined by the
individual values and are not of constant intervals. For constant intervals use
group_by_dynamic
sourcepub fn group_by_dynamic<E: AsRef<[Expr]>>(
self,
index_column: Expr,
group_by: E,
options: DynamicGroupOptions,
) -> LazyGroupBy
Available on crate feature dynamic_group_by
only.
pub fn group_by_dynamic<E: AsRef<[Expr]>>( self, index_column: Expr, group_by: E, options: DynamicGroupOptions, ) -> LazyGroupBy
dynamic_group_by
only.Group based on a time value (or index value of type Int32, Int64).
Time windows are calculated and rows are assigned to windows. Unlike a normal group_by, a row can be a member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame.
A window is defined by:
- every: interval of the window
- period: length of the window
- offset: offset of the window
The group_by
argument should be empty []
if you don’t want to combine this
with an ordinary group_by on these keys.
sourcepub fn group_by_stable<E: AsRef<[IE]>, IE: Into<Expr> + Clone>(
self,
by: E,
) -> LazyGroupBy
pub fn group_by_stable<E: AsRef<[IE]>, IE: Into<Expr> + Clone>( self, by: E, ) -> LazyGroupBy
Similar to group_by
, but order of the DataFrame is maintained.
sourcepub fn anti_join<E: Into<Expr>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
) -> LazyFrame
Available on crate feature semi_anti_join
only.
pub fn anti_join<E: Into<Expr>>( self, other: LazyFrame, left_on: E, right_on: E, ) -> LazyFrame
semi_anti_join
only.Left anti join this query with another lazy query.
Matches on the values of the expressions left_on
and right_on
. For more
flexible join logic, see join
or
join_builder
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn anti_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.anti_join(other, col("foo"), col("bar").cast(DataType::String))
}
sourcepub fn cross_join(
self,
other: LazyFrame,
suffix: Option<PlSmallStr>,
) -> LazyFrame
Available on crate feature cross_join
only.
pub fn cross_join( self, other: LazyFrame, suffix: Option<PlSmallStr>, ) -> LazyFrame
cross_join
only.Creates the Cartesian product from both frames, preserving the order of the left keys.
sourcepub fn left_join<E: Into<Expr>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
) -> LazyFrame
pub fn left_join<E: Into<Expr>>( self, other: LazyFrame, left_on: E, right_on: E, ) -> LazyFrame
Left outer join this query with another lazy query.
Matches on the values of the expressions left_on
and right_on
. For more
flexible join logic, see join
or
join_builder
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn left_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.left_join(other, col("foo"), col("bar"))
}
sourcepub fn inner_join<E: Into<Expr>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
) -> LazyFrame
pub fn inner_join<E: Into<Expr>>( self, other: LazyFrame, left_on: E, right_on: E, ) -> LazyFrame
Inner join this query with another lazy query.
Matches on the values of the expressions left_on
and right_on
. For more
flexible join logic, see join
or
join_builder
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn inner_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.inner_join(other, col("foo"), col("bar").cast(DataType::String))
}
sourcepub fn full_join<E: Into<Expr>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
) -> LazyFrame
pub fn full_join<E: Into<Expr>>( self, other: LazyFrame, left_on: E, right_on: E, ) -> LazyFrame
Full outer join this query with another lazy query.
Matches on the values of the expressions left_on
and right_on
. For more
flexible join logic, see join
or
join_builder
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn full_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.full_join(other, col("foo"), col("bar"))
}
sourcepub fn semi_join<E: Into<Expr>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
) -> LazyFrame
Available on crate feature semi_anti_join
only.
pub fn semi_join<E: Into<Expr>>( self, other: LazyFrame, left_on: E, right_on: E, ) -> LazyFrame
semi_anti_join
only.Left semi join this query with another lazy query.
Matches on the values of the expressions left_on
and right_on
. For more
flexible join logic, see join
or
join_builder
.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn semi_join_dataframes(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.semi_join(other, col("foo"), col("bar").cast(DataType::String))
}
sourcepub fn join<E: AsRef<[Expr]>>(
self,
other: LazyFrame,
left_on: E,
right_on: E,
args: JoinArgs,
) -> LazyFrame
pub fn join<E: AsRef<[Expr]>>( self, other: LazyFrame, left_on: E, right_on: E, args: JoinArgs, ) -> LazyFrame
Generic function to join two LazyFrames.
join
can join on multiple columns, given as two lists of expressions, and with a
JoinType
specified by how
. Non-joined column names in the right DataFrame
that already exist in this DataFrame are suffixed with "_right"
. For control
over how columns are renamed and parallelization options, use
join_builder
.
Any provided args.slice
parameter is not considered, but set by the internal optimizer.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn example(ldf: LazyFrame, other: LazyFrame) -> LazyFrame {
ldf
.join(other, [col("foo"), col("bar")], [col("foo"), col("bar")], JoinArgs::new(JoinType::Inner))
}
sourcepub fn join_builder(self) -> JoinBuilder
pub fn join_builder(self) -> JoinBuilder
Consume self
and return a JoinBuilder
to customize a join on this LazyFrame.
After the JoinBuilder
has been created and set up, calling
finish()
on it will give back the LazyFrame
representing the join
operation.
sourcepub fn with_column(self, expr: Expr) -> LazyFrame
pub fn with_column(self, expr: Expr) -> LazyFrame
Add or replace a column, given as an expression, to a DataFrame.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn add_column(df: DataFrame) -> LazyFrame {
df.lazy()
.with_column(
when(col("sepal_length").lt(lit(5.0)))
.then(lit(10))
.otherwise(lit(1))
.alias("new_column_name"),
)
}
sourcepub fn with_columns<E: AsRef<[Expr]>>(self, exprs: E) -> LazyFrame
pub fn with_columns<E: AsRef<[Expr]>>(self, exprs: E) -> LazyFrame
Add or replace multiple columns, given as expressions, to a DataFrame.
§Example
use polars_core::prelude::*;
use polars_lazy::prelude::*;
fn add_columns(df: DataFrame) -> LazyFrame {
df.lazy()
.with_columns(
vec![lit(10).alias("foo"), lit(100).alias("bar")]
)
}
sourcepub fn with_columns_seq<E: AsRef<[Expr]>>(self, exprs: E) -> LazyFrame
pub fn with_columns_seq<E: AsRef<[Expr]>>(self, exprs: E) -> LazyFrame
Add or replace multiple columns to a DataFrame, but evaluate them sequentially.
pub fn with_context<C: AsRef<[LazyFrame]>>(self, contexts: C) -> LazyFrame
sourcepub fn max(self) -> Self
pub fn max(self) -> Self
Aggregate all the columns as their maximum values.
Aggregated columns will have the same names as the original columns.
sourcepub fn min(self) -> Self
pub fn min(self) -> Self
Aggregate all the columns as their minimum values.
Aggregated columns will have the same names as the original columns.
sourcepub fn sum(self) -> Self
pub fn sum(self) -> Self
Aggregate all the columns as their sum values.
Aggregated columns will have the same names as the original columns.
- Boolean columns will sum to a
u32
containing the number of true
s. - For integer columns, the ordinary checks for overflow are performed:
if running in
debug
mode, overflows will panic, whereas in release
mode overflows will silently wrap. - String columns will sum to None.
sourcepub fn mean(self) -> Self
pub fn mean(self) -> Self
Aggregate all the columns as their mean values.
- Boolean and integer columns are converted to
f64
before computing the mean. - String columns will have a mean of None.
sourcepub fn median(self) -> Self
pub fn median(self) -> Self
Aggregate all the columns as their median values.
- Boolean and integer results are converted to
f64
. However, they are still susceptible to overflow before this conversion occurs. - String columns will have a median of None.
sourcepub fn quantile(self, quantile: Expr, interpol: QuantileInterpolOptions) -> Self
pub fn quantile(self, quantile: Expr, interpol: QuantileInterpolOptions) -> Self
Aggregate all the columns as their quantile values.
sourcepub fn std(self, ddof: u8) -> Self
pub fn std(self, ddof: u8) -> Self
Aggregate all the columns as their standard deviation values.
ddof
is the “Delta Degrees of Freedom”; N - ddof
will be the denominator when
computing the variance, where N
is the number of rows.
In standard statistical practice,
ddof=1
provides an unbiased estimator of the variance of a hypothetical infinite population. ddof=0
provides a maximum likelihood estimate of the variance for normally distributed variables. The standard deviation computed in this function is the square root of the estimated variance, so even with ddof=1
, it will not be an unbiased estimate of the standard deviation per se.
Source: Numpy
sourcepub fn var(self, ddof: u8) -> Self
pub fn var(self, ddof: u8) -> Self
Aggregate all the columns as their variance values.
ddof
is the “Delta Degrees of Freedom”; N - ddof
will be the denominator when
computing the variance, where N
is the number of rows.
In standard statistical practice,
ddof=1
provides an unbiased estimator of the variance of a hypothetical infinite population. ddof=0
provides a maximum likelihood estimate of the variance for normally distributed variables.
Source: Numpy
sourcepub fn explode<E: AsRef<[IE]>, IE: Into<Selector> + Clone>(
self,
columns: E,
) -> LazyFrame
pub fn explode<E: AsRef<[IE]>, IE: Into<Selector> + Clone>( self, columns: E, ) -> LazyFrame
Apply explode operation. See eager explode.
sourcepub fn null_count(self) -> LazyFrame
pub fn null_count(self) -> LazyFrame
Aggregate all the columns as the sum of their null value count.
sourcepub fn unique_stable(
self,
subset: Option<Vec<PlSmallStr>>,
keep_strategy: UniqueKeepStrategy,
) -> LazyFrame
pub fn unique_stable( self, subset: Option<Vec<PlSmallStr>>, keep_strategy: UniqueKeepStrategy, ) -> LazyFrame
Drop non-unique rows and maintain the order of kept rows.
subset
is an optional Vec
of column names to consider for uniqueness; if
None
, all columns are considered.
pub fn unique_stable_generic<E, IE>( self, subset: Option<E>, keep_strategy: UniqueKeepStrategy, ) -> LazyFrame
sourcepub fn unique(
self,
subset: Option<Vec<String>>,
keep_strategy: UniqueKeepStrategy,
) -> LazyFrame
pub fn unique( self, subset: Option<Vec<String>>, keep_strategy: UniqueKeepStrategy, ) -> LazyFrame
Drop non-unique rows without maintaining the order of kept rows.
The order of the kept rows may change; to maintain the original row order, use
unique_stable
.
subset
is an optional Vec
of column names to consider for uniqueness; if None,
all columns are considered.
pub fn unique_generic<E: AsRef<[IE]>, IE: Into<Selector> + Clone>( self, subset: Option<E>, keep_strategy: UniqueKeepStrategy, ) -> LazyFrame
sourcepub fn drop_nulls(self, subset: Option<Vec<Expr>>) -> LazyFrame
pub fn drop_nulls(self, subset: Option<Vec<Expr>>) -> LazyFrame
Drop rows containing None.
subset
is an optional Vec
of column names to consider for nulls; if None, all
columns are considered.
sourcepub fn slice(self, offset: i64, len: IdxSize) -> LazyFrame
pub fn slice(self, offset: i64, len: IdxSize) -> LazyFrame
Slice the DataFrame using an offset (starting row) and a length.
If offset
is negative, it is counted from the end of the DataFrame. For
instance, lf.slice(-5, 3)
gets three rows, starting at the row fifth from the
end.
If offset
and len
are such that the slice extends beyond the end of the
DataFrame, the portion between offset
and the end will be returned. In this
case, the number of rows in the returned DataFrame will be less than len
.
sourcepub fn tail(self, n: IdxSize) -> LazyFrame
pub fn tail(self, n: IdxSize) -> LazyFrame
Get the last n
rows.
Equivalent to self.slice(-(n as i64), n)
.
sourcepub fn unpivot(self, args: UnpivotArgsDSL) -> LazyFrame
Available on crate feature pivot
only.
pub fn unpivot(self, args: UnpivotArgsDSL) -> LazyFrame
pivot
only.Unpivot the DataFrame from wide to long format.
See UnpivotArgsIR
for information on how to unpivot a DataFrame.
sourcepub fn limit(self, n: IdxSize) -> LazyFrame
pub fn limit(self, n: IdxSize) -> LazyFrame
Limit the DataFrame to the first n
rows.
Note if you don’t want the rows to be scanned, use fetch
.
sourcepub fn map<F>(
self,
function: F,
optimizations: AllowedOptimizations,
schema: Option<Arc<dyn UdfSchema>>,
name: Option<&'static str>,
) -> LazyFrame
pub fn map<F>( self, function: F, optimizations: AllowedOptimizations, schema: Option<Arc<dyn UdfSchema>>, name: Option<&'static str>, ) -> LazyFrame
Apply a function/closure once the logical plan gets executed.
The function has access to the whole materialized DataFrame at the time it is called.
To apply specific functions to specific columns, use Expr::map
in conjunction
with LazyFrame::with_column
or with_columns
.
§Warning
This can blow up in your face if the schema is changed due to the operation. The optimizer relies on a correct schema.
You can toggle certain optimizations off.
pub fn map_python( self, function: PythonFunction, optimizations: AllowedOptimizations, schema: Option<SchemaRef>, validate_output: bool, ) -> LazyFrame
python
only.sourcepub fn with_row_index<S>(self, name: S, offset: Option<IdxSize>) -> LazyFramewhere
S: Into<PlSmallStr>,
pub fn with_row_index<S>(self, name: S, offset: Option<IdxSize>) -> LazyFramewhere
S: Into<PlSmallStr>,
Add a new column at index 0 that counts the rows.
name
is the name of the new column. offset
is where to start counting from; if
None
, it is set to 0
.
§Warning
This can have a negative effect on query performance. This may for instance block predicate pushdown optimization.
sourcepub fn unnest<E, IE>(self, cols: E) -> Self
Available on crate feature dtype-struct
only.
pub fn unnest<E, IE>(self, cols: E) -> Self
dtype-struct
only.Unnest the given Struct
columns: the fields of the Struct
type will be
inserted as columns.
pub fn merge_sorted<S>(
self,
other: LazyFrame,
key: S,
) -> PolarsResult<LazyFrame>where
S: Into<PlSmallStr>,
merge_sorted
only.source§impl LazyFrame
impl LazyFrame
pub fn anonymous_scan( function: Arc<dyn AnonymousScan>, args: ScanArgsAnonymous, ) -> PolarsResult<Self>
source§impl LazyFrame
impl LazyFrame
sourcepub fn scan_ipc(path: impl AsRef<Path>, args: ScanArgsIpc) -> PolarsResult<Self>
Available on crate feature ipc
only.
pub fn scan_ipc(path: impl AsRef<Path>, args: ScanArgsIpc) -> PolarsResult<Self>
ipc
only.Create a LazyFrame directly from an IPC scan.
pub fn scan_ipc_files( paths: Arc<Vec<PathBuf>>, args: ScanArgsIpc, ) -> PolarsResult<Self>
ipc
only.source§impl LazyFrame
impl LazyFrame
sourcepub fn scan_parquet(
path: impl AsRef<Path>,
args: ScanArgsParquet,
) -> PolarsResult<Self>
Available on crate feature parquet
only.
pub fn scan_parquet( path: impl AsRef<Path>, args: ScanArgsParquet, ) -> PolarsResult<Self>
parquet
only.Create a LazyFrame directly from a parquet scan.
sourcepub fn scan_parquet_files(
paths: Arc<Vec<PathBuf>>,
args: ScanArgsParquet,
) -> PolarsResult<Self>
Available on crate feature parquet
only.
pub fn scan_parquet_files( paths: Arc<Vec<PathBuf>>, args: ScanArgsParquet, ) -> PolarsResult<Self>
parquet
only.Create a LazyFrame directly from a parquet scan.
Trait Implementations§
source§impl From<LazyGroupBy> for LazyFrame
impl From<LazyGroupBy> for LazyFrame
source§fn from(lgb: LazyGroupBy) -> Self
fn from(lgb: LazyGroupBy) -> Self
Auto Trait Implementations§
impl !Freeze for LazyFrame
impl !RefUnwindSafe for LazyFrame
impl Send for LazyFrame
impl Sync for LazyFrame
impl Unpin for LazyFrame
impl !UnwindSafe for LazyFrame
Blanket Implementations§
source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
source§impl<T> CloneToUninit for Twhere
T: Clone,
impl<T> CloneToUninit for Twhere
T: Clone,
source§unsafe fn clone_to_uninit(&self, dst: *mut T)
unsafe fn clone_to_uninit(&self, dst: *mut T)
clone_to_uninit
)§impl<T> Instrument for T
impl<T> Instrument for T
§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
source§impl<T> IntoEither for T
impl<T> IntoEither for T
source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
self
into a Left
variant of Either<Self, Self>
if into_left
is true
.
Converts self
into a Right
variant of Either<Self, Self>
otherwise. Read moresource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
self
into a Left
variant of Either<Self, Self>
if into_left(&self)
returns true
.
Converts self
into a Right
variant of Either<Self, Self>
otherwise. Read more