Expand description
§Polars Eager cookbook
This page should serve as a cookbook to quickly get you started with most fundamental operations
executed on a ChunkedArray, Series or DataFrame.
§Tree Of Contents
- Creation of data structures
- Arithmetic
- Comparisons
- Apply functions/ closures
- Filter
- Sort
- Joins
- GroupBy
- pivot
- Unpivot
- Explode
- IO
- Various
§Creation of data structures
§ChunkedArray
use polars::prelude::*;
// use iterators
let ca: UInt32Chunked = (0..10).map(Some).collect();
// from slices
let ca = UInt32Chunked::new("foo".into(), &[1, 2, 3]);
// use builders
let mut builder = PrimitiveChunkedBuilder::<UInt32Type>::new("foo".into(), 10);
for value in 0..10 {
builder.append_value(value);
}
let ca = builder.finish();§Series
use polars::prelude::*;
// use iterators
let s: Series = (0..10).map(Some).collect();
// from slices
let s = Series::new("foo".into(), &[1, 2, 3]);
// from a chunked-array
let ca = UInt32Chunked::new("foo".into(), &[Some(1), None, Some(3)]);
let s = ca.into_series();
// into a Column
let s = s.into_column();§DataFrame
use polars::prelude::*;
use polars::df;
// use macro
let df = df! [
"names" => ["a", "b", "c"],
"values" => [1, 2, 3],
"values_nulls" => [Some(1), None, Some(3)]
]?;
// from a Vec<Column>
let c1 = Column::new("names".into(), &["a", "b", "c"]);
let c2 = Column::new("values".into(), &[Some(1), None, Some(3)]);
let df = DataFrame::new(vec![c1, c2])?;§Arithmetic
Arithmetic can be done on both Series and ChunkedArray. The most notable difference is that
a Series coerces the data to match the underlying data types.
use polars::prelude::*;
let s_int = Series::new("a".into(), &[1, 2, 3]);
let s_flt = Series::new("b".into(), &[1.0, 2.0, 3.0]);
let added = &s_int + &s_flt;
let subtracted = &s_int - &s_flt;
let multiplied = &s_int * &s_flt;
let divided = &s_int / &s_flt;
let moduloed = &s_int % &s_flt;
// on chunked-arrays we first need to cast to same types
let ca_int = s_int.i32()?;
let ca_flt = s_flt.f32()?;
ca_int.cast(&DataType::Float32)?.f32()? * ca_flt;
ca_flt.cast(&DataType::Int32)?.i32()? * ca_int;
// we can also do arithmetic with numeric values
let multiplied = ca_int * 2.0;
let multiplied = s_flt * 2.0;
// or broadcast Series to match the operands type
let added = &s_int * &Series::new("broadcast_me".into(), &[10]);
Because Rust’s Orphan Rule doesn’t allow us to implement left side operations, we need to call such operations directly.
let series = Series::new("foo".into(), [1, 2, 3]);
// 1 / s
let divide_one_by_s = 1.div(&series);
// 1 - s
let subtract_one_by_s = 1.sub(&series);For ChunkedArray left hand side operations can be done with the apply_values method.
let ca = UInt32Chunked::new("foo".into(), &[1, 2, 3]);
// 1 / ca
let divide_one_by_ca = ca.apply_values(|rhs| 1 / rhs);§Comparisons
Series and ChunkedArray can be used in comparison operations to create boolean masks/predicates.
use polars::prelude::*;
let s = Series::new("a".into(), &[1, 2, 3]);
let ca = UInt32Chunked::new("b".into(), &[Some(3), None, Some(1)]);
// compare Series with numeric values
// ==
s.equal(2);
// !=
s.not_equal(2);
// >
s.gt(2);
// >=
s.gt_eq(2);
// <
s.lt(2);
// <=
s.lt_eq(2);
// compare Series with Series
// ==
s.equal(&s);
// !=
s.not_equal(&s);
// >
s.gt(&s);
// >=
s.gt_eq(&s);
// <
s.lt(&s);
// <=
s.lt_eq(&s);
// compare chunked-array with numeric values
// ==
ca.equal(2);
// !=
ca.not_equal(2);
// >
ca.gt(2);
// >=
ca.gt_eq(2);
// <
ca.lt(2);
// <=
ca.lt_eq(2);
// compare chunked-array with chunked-array
// ==
ca.equal(&ca);
// !=
ca.not_equal(&ca);
// >
ca.gt(&ca);
// >=
ca.gt_eq(&ca);
// <
ca.lt(&ca);
// <=
ca.lt_eq(&ca);
// use iterators
let a: BooleanChunked = ca.iter()
.map(|opt_value| {
match opt_value {
Some(value) => value < 10,
None => false
}}).collect();
§Apply functions/ closures
See all possible apply methods here.
§Series / ChunkedArrays
use polars::prelude::*;
use polars::prelude::arity::unary_elementwise_values;
// apply a closure over all values
let s = Series::new("foo".into(), &[Some(1), Some(2), None]);
s.i32()?.apply_values(|value| value * 20);
// count string lengths
let s = Series::new("foo".into(), &["foo", "bar", "foobar"]);
unary_elementwise_values::<StringType, UInt64Type, _>(s.str()?, |str_val| str_val.len() as u64);
§Multiple columns
use polars::prelude::*;
fn my_black_box_function(a: f32, b: f32) -> f32 {
// do something
a
}
fn apply_multiples(col_a: &Series, col_b: &Series) -> Float32Chunked {
match (col_a.dtype(), col_b.dtype()) {
(DataType::Float32, DataType::Float32) => {
// downcast to `ChunkedArray`
let a = col_a.f32().unwrap();
let b = col_b.f32().unwrap();
a.into_iter()
.zip(b.into_iter())
.map(|(opt_a, opt_b)| match (opt_a, opt_b) {
(Some(a), Some(b)) => Some(my_black_box_function(a, b)),
// if either value is `None` we propagate that null
_ => None,
})
.collect()
}
_ => panic!("unexpected dtypes"),
}
}§DataFrame
use polars::prelude::*;
use polars::df;
let mut df = df![
"letters" => ["a", "b", "c", "d"],
"numbers" => [1, 2, 3, 4]
]?;
// coerce numbers to floats
df.try_apply("number", |s: &Series| s.cast(&DataType::Float64))?;
// transform letters to uppercase letters
df.try_apply("letters", |s: &Series| {
Ok(s.str()?.to_uppercase())
});
§Filter
use polars::prelude::*;
// create a mask to filter out null values
let mask = df.column("sepal_width")?.is_not_null();
// select column
let s = df.column("sepal_length")?;
// apply filter on a Series
let filtered_series = s.filter(&mask);
// apply the filter on a DataFrame
let filtered_df = df.filter(&mask)?;
§Sort
use polars::prelude::*;
use polars::df;
let df = df![
"a" => [1, 2, 3],
"b" => ["a", "a", "b"]
]?;
// sort this DataFrame by multiple columns
// ordering of the columns
let descending = vec![true, false];
// columns to sort by
let by = [PlSmallStr::from_static("b"), PlSmallStr::from_static("a")];
// do the sort operation
let sorted = df.sort(
by,
SortMultipleOptions::default()
.with_order_descending_multi(descending)
.with_maintain_order(true)
)?;
// sorted:
// ╭─────┬─────╮
// │ a ┆ b │
// │ --- ┆ --- │
// │ i64 ┆ str │
// ╞═════╪═════╡
// │ 1 ┆ "a" │
// │ 2 ┆ "a" │
// │ 3 ┆ "b" │
// ╰─────┴─────╯
§Joins
use polars::prelude::*;
use polars::df;
// Create first df.
let temp = df!("days" => &[0, 1, 2, 3, 4],
"temp" => &[22.1, 19.9, 7., 2., 3.],
"other" => &[1, 2, 3, 4, 5]
)?;
// Create second df.
let rain = df!("days" => &[1, 2],
"rain" => &[0.1, 0.2],
"other" => &[1, 2, 3, 4, 5]
)?;
// join on a single column
temp.left_join(&rain, ["days"], ["days"]);
temp.inner_join(&rain, ["days"], ["days"]);
temp.full_join(&rain, ["days"], ["days"]);
// join on multiple columns
temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left), None);
§Groupby
Note that Polars lazy is a lot more powerful in and more performant in group_by operations. In lazy a myriad of aggregations can be combined from expressions.
See more in:
§GroupBy
use polars::prelude::*;
// group_by "groups" | sum "foo"
let out = df.group_by(["groups"])?
.select(["foo"])
.sum();
§Pivot
use polars::prelude::*;
use polars::df;
let df = df!("foo" => ["A", "A", "B", "B", "C"],
"N" => [1, 2, 2, 4, 2],
"bar" => ["k", "l", "m", "n", "0"]
)?;
// group_by "foo" | pivot "bar" column | aggregate "N"
let pivoted = pivot::pivot(
&df,
[PlSmallStr::from_static("foo")],
Some([PlSmallStr::from_static("bar")]),
Some([PlSmallStr::from_static("N")]),
false, Some(first()),
None
);
// pivoted:
// +-----+------+------+------+------+------+
// | foo | o | n | m | l | k |
// | --- | --- | --- | --- | --- | --- |
// | str | i32 | i32 | i32 | i32 | i32 |
// +=====+======+======+======+======+======+
// | "A" | null | null | null | 2 | 1 |
// +-----+------+------+------+------+------+
// | "B" | null | 4 | 2 | null | null |
// +-----+------+------+------+------+------+
// | "C" | 2 | null | null | null | null |
// +-----+------+------+------+------+------+!
§Unpivot
use polars::prelude::*;
use polars::df;
let df = df!["A" => &["a", "b", "a"],
"B" => &[1, 3, 5],
"C" => &[10, 11, 12],
"D" => &[2, 4, 6]
]?;
let unpivoted = df.unpivot(
[PlSmallStr::from_static("A"), PlSmallStr::from_static("B")],
[PlSmallStr::from_static("C"), PlSmallStr::from_static("D")],
).unwrap();
// unpivoted:
// +-----+-----+----------+-------+
// | A | B | variable | value |
// | --- | --- | --- | --- |
// | str | i32 | str | i32 |
// +=====+=====+==========+=======+
// | "a" | 1 | "C" | 10 |
// +-----+-----+----------+-------+
// | "b" | 3 | "C" | 11 |
// +-----+-----+----------+-------+
// | "a" | 5 | "C" | 12 |
// +-----+-----+----------+-------+
// | "a" | 1 | "D" | 2 |
// +-----+-----+----------+-------+
// | "b" | 3 | "D" | 4 |
// +-----+-----+----------+-------+
// | "a" | 5 | "D" | 6 |
// +-----+-----+----------+-------+
§Explode
use polars::prelude::*;
use polars::df;
let s0 = Series::new("a".into(), &[1i64, 2, 3]);
let s1 = Series::new("b".into(), &[1i64, 1, 1]);
let s2 = Series::new("c".into(), &[2i64, 2, 2]);
// construct a new ListChunked for a slice of Series.
let list = Column::new("foo".into(), &[s0, s1, s2]);
// construct a few more Series.
let s0 = Column::new("B".into(), [1, 2, 3]);
let s1 = Column::new("C".into(), [1, 1, 1]);
let df = DataFrame::new(vec![list, s0, s1])?;
let exploded = df.explode([PlSmallStr::from("foo")])?;
// exploded:
// +-----+-----+-----+
// | foo | B | C |
// | --- | --- | --- |
// | i64 | i32 | i32 |
// +=====+=====+=====+
// | 1 | 1 | 1 |
// +-----+-----+-----+
// | 2 | 1 | 1 |
// +-----+-----+-----+
// | 3 | 1 | 1 |
// +-----+-----+-----+
// | 1 | 2 | 1 |
// +-----+-----+-----+
// | 1 | 2 | 1 |
// +-----+-----+-----+
// | 1 | 2 | 1 |
// +-----+-----+-----+
// | 2 | 3 | 1 |
// +-----+-----+-----+
// | 2 | 3 | 1 |
// +-----+-----+-----+
// | 2 | 3 | 1 |
// +-----+-----+-----+
§IO
§Read CSV
use polars::prelude::*;
// read from path
let mut file = std::fs::File::open("iris.csv")?;
let df = CsvReader::new(file).finish()?;§Write CSV
use polars::prelude::*;
use std::fs::File;
// create a file
let mut file = File::create("example.csv").expect("could not create file");
// write DataFrame to file
CsvWriter::new(&mut file)
.include_header(true)
.with_separator(b',')
.finish(df);§Read IPC
use polars::prelude::*;
use std::fs::File;
// open file
let file = File::open("file.ipc").expect("file not found");
// read to DataFrame
let df = IpcReader::new(file)
.finish()?;§Write IPC
use polars::prelude::*;
use std::fs::File;
// create a file
let mut file = File::create("file.ipc").expect("could not create file");
// write DataFrame to file
IpcWriter::new(&mut file)
.finish(df)§Read Parquet
use polars::prelude::*;
use std::fs::File;
// open file
let file = File::open("some_file.parquet").unwrap();
// read to DataFrame
let df = ParquetReader::new(file).finish()?;§Write Parquet
use polars::prelude::*;
use std::fs::File;
// create a file
let file = File::create("example.parquet").expect("could not create file");
ParquetWriter::new(file)
.finish(df)§Various
§Replace NaN with Missing.
The floating point Not a Number: NaN is conceptually different
than missing data in Polars. In the snippet below we show how we can replace NaN values with
missing values, by setting them to None.
use polars::prelude::*;
use polars::df;
/// Replaces NaN with missing values.
fn fill_nan_with_nulls() -> PolarsResult<DataFrame> {
let nan = f64::NAN;
let mut df = df! {
"a" => [nan, 1.0, 2.0],
"b" => [nan, 1.0, 2.0]
}
.unwrap();
for idx in 0..df.width() {
df.try_apply_at_idx(idx, |series| {
let mask = series.is_nan()?;
let ca = series.f64()?;
ca.set(&mask, None)
})?;
}
Ok(df)
}§Extracting data
To iterate over the values of a Series, or to convert the Series into another structure
such as a Vec<T>, we must first downcast to a data type aware ChunkedArray<T>.
use polars::prelude::*;
use polars::df;
fn extract_data() -> PolarsResult<()> {
let df = df! [
"a" => [None, Some(1.0f32), Some(2.0)],
"str" => ["foo", "bar", "ham"]
]?;
// first extract ChunkedArray to get the inner type.
let ca = df.column("a")?.f32()?;
// Then convert to vec
let to_vec: Vec<Option<f32>> = Vec::from(ca);
// We can also do this with iterators
let ca = df.column("str")?.str()?;
let to_vec: Vec<Option<&str>> = ca.into_iter().collect();
let to_vec_no_options: Vec<&str> = ca.into_no_null_iter().collect();
Ok(())
}