polars.Expr.str.split#
- Expr.str.split(by, *, inclusive=False, literal=True, strict=True) → Expr[source]#
Split the string by a substring.
- Parameters:
- by
Substring to split by.
- inclusive
If True, include the split character/string in the results.
- literal
Treat `by` as a literal string, not as a regular expression.
- strict
Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
- Returns:
- Expr
Expression of data type List(String).
Examples
>>> df = pl.DataFrame({"s": ["foo bar", "foo_bar", "foo_bar_baz"]})
>>> df.with_columns(
...     pl.col("s").str.split(by="_").alias("split"),
...     pl.col("s").str.split(by="_", inclusive=True).alias("split_inclusive"),
... )
shape: (3, 3)
┌─────────────┬───────────────────────┬─────────────────────────┐
│ s           ┆ split                 ┆ split_inclusive         │
│ ---         ┆ ---                   ┆ ---                     │
│ str         ┆ list[str]             ┆ list[str]               │
╞═════════════╪═══════════════════════╪═════════════════════════╡
│ foo bar     ┆ ["foo bar"]           ┆ ["foo bar"]             │
│ foo_bar     ┆ ["foo", "bar"]        ┆ ["foo_", "bar"]         │
│ foo_bar_baz ┆ ["foo", "bar", "baz"] ┆ ["foo_", "bar_", "baz"] │
└─────────────┴───────────────────────┴─────────────────────────┘
>>> df = pl.DataFrame(
...     {"s": ["foo^bar", "foo_bar", "foo*bar*baz"], "by": ["_", "_", "*"]}
... )
>>> df.with_columns(
...     pl.col("s").str.split(by=pl.col("by")).alias("split"),
...     pl.col("s")
...     .str.split(by=pl.col("by"), inclusive=True)
...     .alias("split_inclusive"),
... )
shape: (3, 4)
┌─────────────┬─────┬───────────────────────┬─────────────────────────┐
│ s           ┆ by  ┆ split                 ┆ split_inclusive         │
│ ---         ┆ --- ┆ ---                   ┆ ---                     │
│ str         ┆ str ┆ list[str]             ┆ list[str]               │
╞═════════════╪═════╪═══════════════════════╪═════════════════════════╡
│ foo^bar     ┆ _   ┆ ["foo^bar"]           ┆ ["foo^bar"]             │
│ foo_bar     ┆ _   ┆ ["foo", "bar"]        ┆ ["foo_", "bar"]         │
│ foo*bar*baz ┆ *   ┆ ["foo", "bar", "baz"] ┆ ["foo*", "bar*", "baz"] │
└─────────────┴─────┴───────────────────────┴─────────────────────────┘
>>> df = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})
>>> df.with_columns(
...     pl.col("s").str.split(by=r"\d+", literal=False).alias("split_regex"),
...     pl.col("s")
...     .str.split(by=r"\d+", literal=False, inclusive=True)
...     .alias("split_regex_inclusive"),
... )
shape: (3, 3)
┌─────────────┬───────────────────────┬─────────────────────────┐
│ s           ┆ split_regex           ┆ split_regex_inclusive   │
│ ---         ┆ ---                   ┆ ---                     │
│ str         ┆ list[str]             ┆ list[str]               │
╞═════════════╪═══════════════════════╪═════════════════════════╡
│ foo1bar     ┆ ["foo", "bar"]        ┆ ["foo1", "bar"]         │
│ foo99bar    ┆ ["foo", "bar"]        ┆ ["foo99", "bar"]        │
│ foo1bar2baz ┆ ["foo", "bar", "baz"] ┆ ["foo1", "bar2", "baz"] │
└─────────────┴───────────────────────┴─────────────────────────┘
>>> df = pl.DataFrame(
...     {
...         "s": ["foo1bar", "foo bar", "foo-bar baz"],
...         "by": [r"\d", r"\s", r"-"],
...     }
... )
>>> df.with_columns(
...     pl.col("s")
...     .str.split(by=pl.col("by"), literal=False)
...     .alias("split_regex"),
...     pl.col("s")
...     .str.split(by=pl.col("by"), literal=False, inclusive=True)
...     .alias("split_regex_inclusive"),
... )
shape: (3, 4)
┌─────────────┬─────┬────────────────────┬───────────────────────┐
│ s           ┆ by  ┆ split_regex        ┆ split_regex_inclusive │
│ ---         ┆ --- ┆ ---                ┆ ---                   │
│ str         ┆ str ┆ list[str]          ┆ list[str]             │
╞═════════════╪═════╪════════════════════╪═══════════════════════╡
│ foo1bar     ┆ \d  ┆ ["foo", "bar"]     ┆ ["foo1", "bar"]       │
│ foo bar     ┆ \s  ┆ ["foo", "bar"]     ┆ ["foo ", "bar"]       │
│ foo-bar baz ┆ -   ┆ ["foo", "bar baz"] ┆ ["foo-", "bar baz"]   │
└─────────────┴─────┴────────────────────┴───────────────────────┘