polars/docs/
lazy.rs

1//!
2//! # Polars Lazy cookbook
3//!
4//! This page should serve as a cookbook to quickly get you started with Polars' query engine.
5//! The lazy API allows you to create complex, well-performing queries on top of Polars eager.
6//!
7//! ## Table of Contents
8//!
9//! * [Start a lazy computation](#start-a-lazy-computation)
10//! * [Filter](#filter)
11//! * [Sort](#sort)
12//! * [GroupBy](#groupby)
13//! * [Joins](#joins)
14//! * [Conditionally apply](#conditionally-apply)
15//! * [Black box function](#black-box-function)
16//!
17//! ## Start a lazy computation
18//!
19//! ```
20//! use polars::prelude::*;
21//! use polars::df;
22//!
23//! # fn example() -> PolarsResult<()> {
24//! let df = df![
25//!     "a" => [1, 2, 3],
26//!     "b" => [None, Some("a"), Some("b")]
27//! ]?;
28//! // from an eager DataFrame
29//! let lf: LazyFrame = df.lazy();
30//!
31//! // scan a csv file lazily
32//! let lf: LazyFrame = LazyCsvReader::new("some_path")
33//!     .with_has_header(true)
34//!     .finish()?;
35//!
36//! // scan a parquet file lazily
37//! let lf: LazyFrame = LazyFrame::scan_parquet("some_path", Default::default())?;
38//!
39//! # Ok(())
40//! # }
41//! ```
42//!
43//! ## Filter
44//! ```
45//! use polars::prelude::*;
46//! use polars::df;
47//!
48//! # fn example() -> PolarsResult<()> {
49//! let df = df![
50//!     "a" => [1, 2, 3],
51//!     "b" => [None, Some("a"), Some("b")]
52//! ]?;
53//!
54//! let filtered = df.lazy()
55//!     .filter(col("a").gt(lit(2)))
56//!     .collect()?;
57//!
58//! // filtered:
59//!
60//! // ╭─────┬─────╮
61//! // │ a   ┆ b   │
62//! // │ --- ┆ --- │
63//! // │ i64 ┆ str │
64//! // ╞═════╪═════╡
65//! // │ 3   ┆ "b" │
66//! // ╰─────┴─────╯
67//!
68//! # Ok(())
69//! # }
70//! ```
71//!
72//! ## Sort
73//! ```
74//! use polars::prelude::*;
75//! use polars::df;
76//!
77//! # fn example() -> PolarsResult<()> {
78//! let df = df![
79//!     "a" => [1, 2, 3],
80//!     "b" => ["a", "a", "b"]
81//! ]?;
82//! // sort this DataFrame by multiple columns
83//!
84//! let sorted = df.lazy()
85//!     .sort_by_exprs(vec![col("b"), col("a")], SortMultipleOptions::default())
86//!     .collect()?;
87//!
88//! // sorted:
89//!
90//! // ╭─────┬─────╮
91//! // │ a   ┆ b   │
92//! // │ --- ┆ --- │
93//! // │ i64 ┆ str │
94//! // ╞═════╪═════╡
95//! // │ 1   ┆ "a" │
96//! // │ 2   ┆ "a" │
97//! // │ 3   ┆ "b" │
98//! // ╰─────┴─────╯
99//!
100//! # Ok(())
101//! # }
102//! ```
103//!
104//! ## Groupby
105//!
106//! This example is from the Polars [user guide](https://docs.pola.rs/user-guide/concepts/expressions-and-contexts/#group_by-and-aggregations).
107//!
108//! ```
109//! use polars::prelude::*;
110//! # fn example() -> PolarsResult<()> {
111//!
112//!  let df = LazyCsvReader::new("reddit.csv")
113//!     .with_has_header(true)
114//!     .with_separator(b',')
115//!     .finish()?
116//!     .group_by([col("comment_karma")])
117//!     .agg([col("name").n_unique().alias("unique_names"), col("link_karma").max()])
118//!     // debugging aid: limit every scan to 100 rows (the result may contain fewer rows).
119//!     .fetch(100)?;
120//! # Ok(())
121//! # }
122//! ```
123//!
124//! ## Joins
125//!
126//! ```
127//! use polars::prelude::*;
128//! use polars::df;
129//! # fn example() -> PolarsResult<()> {
130//! let df_a = df![
131//!     "a" => [1, 2, 1, 1],
132//!     "b" => ["a", "b", "c", "c"],
133//!     "c" => [0, 1, 2, 3]
134//! ]?;
135//!
136//! let df_b = df![
137//!     "foo" => [1, 1, 1],
138//!     "bar" => ["a", "c", "c"],
139//!     "ham" => ["let", "var", "const"]
140//! ]?;
141//!
142//! let lf_a = df_a.clone().lazy();
143//! let lf_b = df_b.clone().lazy();
144//!
145//! let joined = lf_a.join(lf_b, vec![col("a")], vec![col("foo")], JoinArgs::new(JoinType::Full)).collect()?;
146//! // joined:
147//!
148//! // ╭─────┬─────┬─────┬──────┬─────────╮
149//! // │ b   ┆ c   ┆ a   ┆ bar  ┆ ham     │
150//! // │ --- ┆ --- ┆ --- ┆ ---  ┆ ---     │
151//! // │ str ┆ i64 ┆ i64 ┆ str  ┆ str     │
152//! // ╞═════╪═════╪═════╪══════╪═════════╡
153//! // │ "a" ┆ 0   ┆ 1   ┆ "a"  ┆ "let"   │
154//! // │ "a" ┆ 0   ┆ 1   ┆ "c"  ┆ "var"   │
155//! // │ "a" ┆ 0   ┆ 1   ┆ "c"  ┆ "const" │
156//! // │ "b" ┆ 1   ┆ 2   ┆ null ┆ null    │
157//! // │ "c" ┆ 2   ┆ 1   ┆ null ┆ null    │
158//! // │ "c" ┆ 3   ┆ 1   ┆ null ┆ null    │
159//! // ╰─────┴─────┴─────┴──────┴─────────╯
160//!
161//! // other join syntax options
162//! # let lf_a = df_a.clone().lazy();
163//! # let lf_b = df_b.clone().lazy();
164//! let inner = lf_a.inner_join(lf_b, col("a"), col("foo")).collect()?;
165//!
166//! # let lf_a = df_a.clone().lazy();
167//! # let lf_b = df_b.clone().lazy();
168//! let left = lf_a.left_join(lf_b, col("a"), col("foo")).collect()?;
169//!
170//! # let lf_a = df_a.clone().lazy();
171//! # let lf_b = df_b.clone().lazy();
172//! let outer = lf_a.full_join(lf_b, col("a"), col("foo")).collect()?;
173//!
174//! # let lf_a = df_a.clone().lazy();
175//! # let lf_b = df_b.clone().lazy();
176//! let joined_with_builder = lf_a.join_builder()
177//!     .with(lf_b)
178//!     .left_on(vec![col("a")])
179//!     .right_on(vec![col("foo")])
180//!     .how(JoinType::Inner)
181//!     .force_parallel(true)
182//!     .finish()
183//!     .collect()?;
184//!
185//! # Ok(())
186//! # }
187//! ```
188//!
189//! ## Conditionally apply
190//! If we want to create a new column based on some condition, we can use the [`when`]/[`then`]/[`otherwise`] expressions.
191//!
192//! * [`when`] - accepts a predicate expression
193//! * [`then`] - expression to use when `predicate == true`
194//! * [`otherwise`] - expression to use when `predicate == false`
195//!
196//! [`when`]: polars_lazy::dsl::Then::when
197//! [`then`]: polars_lazy::dsl::When::then
198//! [`otherwise`]: polars_lazy::dsl::Then::otherwise
199//!
200//! ```
201//! use polars::prelude::*;
202//! use polars::df;
203//! # fn example() -> PolarsResult<()> {
204//! let df = df![
205//!     "range" => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
206//!     "left" => (0..10).map(|_| Some("foo")).collect::<Vec<_>>(),
207//!     "right" => (0..10).map(|_| Some("bar")).collect::<Vec<_>>()
208//! ]?;
209//!
210//! let new = df.lazy()
211//!     .with_column(when(col("range").gt_eq(lit(5)))
212//!         .then(col("left"))
213//!         .otherwise(col("right")).alias("foo_or_bar")
214//!     ).collect()?;
215//!
216//! // new:
217//!
218//! // ╭───────┬───────┬───────┬────────────╮
219//! // │ range ┆ left  ┆ right ┆ foo_or_bar │
220//! // │ ---   ┆ ---   ┆ ---   ┆ ---        │
221//! // │ i64   ┆ str   ┆ str   ┆ str        │
222//! // ╞═══════╪═══════╪═══════╪════════════╡
223//! // │ 0     ┆ "foo" ┆ "bar" ┆ "bar"      │
224//! // │ 1     ┆ "foo" ┆ "bar" ┆ "bar"      │
225//! // │ 2     ┆ "foo" ┆ "bar" ┆ "bar"      │
226//! // │ 3     ┆ "foo" ┆ "bar" ┆ "bar"      │
227//! // │ …     ┆ …     ┆ …     ┆ …          │
228//! // │ 5     ┆ "foo" ┆ "bar" ┆ "foo"      │
229//! // │ 6     ┆ "foo" ┆ "bar" ┆ "foo"      │
230//! // │ 7     ┆ "foo" ┆ "bar" ┆ "foo"      │
231//! // │ 8     ┆ "foo" ┆ "bar" ┆ "foo"      │
232//! // │ 9     ┆ "foo" ┆ "bar" ┆ "foo"      │
233//! // ╰───────┴───────┴───────┴────────────╯
234//!
235//! # Ok(())
236//! # }
237//! ```
238//!
239//! ## Black box function
240//!
241//! The expression API should be expressive enough for most of what you want to achieve, but it can happen
242//! that you need to pass the values to an external function you do not control. The snippet below
243//! shows how we use the [`Struct`] datatype to be able to apply a function over multiple inputs.
244//!
245//! [`Struct`]: crate::datatypes::DataType::Struct
246//!
247//! ```ignore
248//! use polars::prelude::*;
249//! fn my_black_box_function(a: f32, b: f32) -> f32 {
250//!     // do something
251//!     a
252//! }
253//!
254//! fn apply_multiples() -> PolarsResult<DataFrame> {
255//!     df![
256//!         "a" => [1.0f32, 2.0, 3.0],
257//!         "b" => [3.0f32, 5.1, 0.3]
258//!     ]?
259//!     .lazy()
260//!     .select([as_struct(vec![col("a"), col("b")]).map(
261//!         |s| {
262//!             let ca = s.struct_()?;
263//!
264//!             let series_a = ca.field_by_name("a")?;
265//!             let series_b = ca.field_by_name("b")?;
266//!             let chunked_a = series_a.f32()?;
267//!             let chunked_b = series_b.f32()?;
268//!
269//!             let out: Float32Chunked = chunked_a
270//!                 .into_iter()
271//!                 .zip(chunked_b.into_iter())
272//!                 .map(|(opt_a, opt_b)| match (opt_a, opt_b) {
273//!                     (Some(a), Some(b)) => Some(my_black_box_function(a, b)),
274//!                     _ => None,
275//!                 })
276//!                 .collect();
277//!
278//!             Ok(Some(out.into_series()))
279//!         },
280//!         GetOutput::from_type(DataType::Float32),
281//!     )])
282//!     .collect()
283//! }
284//!
285//! ```
286//!
287//!