Skip to main content

polars_time/chunkedarray/string/
mod.rs

1pub mod infer;
2use chrono::DateTime;
3mod patterns;
4mod strptime;
5pub use patterns::Pattern;
6#[cfg(feature = "dtype-time")]
7use polars_core::chunked_array::temporal::time_to_time64ns;
8use polars_core::prelude::arity::unary_elementwise;
9use polars_utils::cache::LruCachedFunc;
10
11use super::*;
12#[cfg(feature = "dtype-date")]
13use crate::chunkedarray::date::naive_date_to_date;
14use crate::prelude::string::strptime::StrpTimeState;
15
16#[cfg(feature = "dtype-time")]
17fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
18// (string, fmt) -> PolarsResult
19where
20    F: Fn(&str, &str) -> chrono::ParseResult<K>,
21{
22    patterns::TIME_H_M_S
23        .iter()
24        .chain(patterns::TIME_H_M_S)
25        .find(|fmt| convert(val, fmt).is_ok())
26        .copied()
27}
28
29fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
30// (string, fmt) -> PolarsResult
31where
32    F: Fn(&str, &str) -> chrono::ParseResult<K>,
33{
34    patterns::DATETIME_Y_M_D
35        .iter()
36        .chain(patterns::DATETIME_D_M_Y)
37        .find(|fmt| convert(val, fmt).is_ok())
38        .copied()
39}
40
41fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
42// (string, fmt) -> PolarsResult
43where
44    F: Fn(&str, &str) -> chrono::ParseResult<K>,
45{
46    patterns::DATE_Y_M_D
47        .iter()
48        .chain(patterns::DATE_D_M_Y)
49        .find(|fmt| convert(val, fmt).is_ok())
50        .copied()
51}
52
/// Infer a datetime format string for `val`.
///
/// The datetime patterns are first tried with `NaiveDateTime::parse_from_str`
/// and, if none matches, again with `NaiveDate::parse_from_str`. Returns a
/// `parse_fmt_idk` error for "datetime" when both attempts fail.
#[cfg(feature = "dtype-datetime")]
fn sniff_fmt_datetime(val: &str) -> PolarsResult<&'static str> {
    if let Some(fmt) = datetime_pattern(val, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    match datetime_pattern(val, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "datetime")),
    }
}
59
/// Infer a date format string for `val` using `NaiveDate::parse_from_str`.
///
/// Returns a `parse_fmt_idk` error for "date" when no known pattern matches.
#[cfg(feature = "dtype-date")]
fn sniff_fmt_date(val: &str) -> PolarsResult<&'static str> {
    match date_pattern(val, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
64
/// Infer a time format string for `val` using `NaiveTime::parse_from_str`.
///
/// Returns a `parse_fmt_idk` error for "time" when no known pattern matches.
#[cfg(feature = "dtype-time")]
fn sniff_fmt_time(val: &str) -> PolarsResult<&'static str> {
    let Some(fmt) = time_pattern(val, NaiveTime::parse_from_str) else {
        return Err(polars_err!(parse_fmt_idk = "time"));
    };
    Ok(fmt)
}
69
70pub trait StringMethods: AsString {
71    #[cfg(feature = "dtype-time")]
72    /// Parsing string values and return a [`TimeChunked`]
73    fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
74        let string_ca = self.as_string();
75        let fmt = match fmt {
76            Some(fmt) => fmt,
77            None => {
78                let Some(idx) = string_ca.first_non_null() else {
79                    return Ok(
80                        Int64Chunked::full_null(string_ca.name().clone(), string_ca.len())
81                            .into_time(),
82                    );
83                };
84                let val = string_ca.get(idx).expect("should not be null");
85                sniff_fmt_time(val)?
86            },
87        };
88        let use_cache = use_cache && string_ca.len() > 50;
89
90        let mut convert = LruCachedFunc::new(
91            |s| {
92                let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;
93                Some(time_to_time64ns(&naive_time))
94            },
95            (string_ca.len() as f64).sqrt() as usize,
96        );
97        let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
98        Ok(ca.with_name(string_ca.name().clone()).into_time())
99    }
100
101    #[cfg(feature = "dtype-date")]
102    /// Parsing string values and return a [`DateChunked`]
103    /// Different from `as_date` this function allows matches that not contain the whole string
104    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
105    fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
106        let string_ca = self.as_string();
107        let fmt = match fmt {
108            Some(fmt) => fmt,
109            None => {
110                let Some(idx) = string_ca.first_non_null() else {
111                    return Ok(
112                        Int32Chunked::full_null(string_ca.name().clone(), string_ca.len())
113                            .into_date(),
114                    );
115                };
116                let val = string_ca.get(idx).expect("should not be null");
117                sniff_fmt_date(val)?
118            },
119        };
120        let ca = unary_elementwise(string_ca, |opt_s| {
121            let mut s = opt_s?;
122            while !s.is_empty() {
123                match NaiveDate::parse_and_remainder(s, fmt) {
124                    Ok((nd, _)) => return Some(naive_date_to_date(nd)),
125                    Err(_) => {
126                        let mut it = s.chars();
127                        it.next();
128                        s = it.as_str();
129                    },
130                }
131            }
132
133            None
134        });
135        Ok(ca.with_name(string_ca.name().clone()).into_date())
136    }
137
138    #[cfg(feature = "dtype-datetime")]
139    /// Parsing string values and return a [`DatetimeChunked`]
140    /// Different from `as_datetime` this function allows matches that not contain the whole string
141    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
142    fn as_datetime_not_exact(
143        &self,
144        fmt: Option<&str>,
145        tu: TimeUnit,
146        tz_aware: bool,
147        tz: Option<&TimeZone>,
148        _ambiguous: &StringChunked,
149        // Ensure that the inferred time_zone matches the given time_zone.
150        ensure_matching_tz: bool,
151    ) -> PolarsResult<DatetimeChunked> {
152        let string_ca = self.as_string();
153        let had_format = fmt.is_some();
154        let fmt = match fmt {
155            Some(fmt) => fmt,
156            None => {
157                let Some(idx) = string_ca.first_non_null() else {
158                    return Ok(
159                        Int64Chunked::full_null(string_ca.name().clone(), string_ca.len())
160                            .into_datetime(tu, tz.cloned()),
161                    );
162                };
163                let val = string_ca.get(idx).expect("should not be null");
164                sniff_fmt_datetime(val)?
165            },
166        };
167
168        let func = match tu {
169            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
170            TimeUnit::Microseconds => datetime_to_timestamp_us,
171            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
172        };
173
174        let ca = unary_elementwise(string_ca, |opt_s| {
175            let mut s = opt_s?;
176            while !s.is_empty() {
177                let timestamp = if tz_aware {
178                    DateTime::parse_and_remainder(s, fmt)
179                        .ok()
180                        .map(|(dt, _r)| func(dt.naive_utc()))
181                } else {
182                    infer::parse_datetime_and_remainder(s, fmt).map(|(nd, _r)| func(nd))
183                };
184                match timestamp {
185                    Some(ts) => return Some(ts),
186                    None => {
187                        let mut it = s.chars();
188                        it.next();
189                        s = it.as_str();
190                    },
191                }
192            }
193            None
194        })
195        .with_name(string_ca.name().clone());
196
197        polars_ensure!(
198            !ensure_matching_tz || had_format || !(tz_aware && tz.is_none()),
199            to_datetime_tz_mismatch
200        );
201
202        match (tz_aware, tz) {
203            #[cfg(feature = "timezones")]
204            (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
205                &ca.into_datetime(tu, None),
206                Some(tz),
207                _ambiguous,
208                NonExistent::Raise,
209            ),
210            #[cfg(feature = "timezones")]
211            (true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),
212            _ => Ok(ca.into_datetime(tu, None)),
213        }
214    }
215
216    #[cfg(feature = "dtype-date")]
217    /// Parsing string values and return a [`DateChunked`]
218    fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
219        let string_ca = self.as_string();
220        let fmt = match fmt {
221            Some(fmt) => fmt,
222            None => return infer::to_date(string_ca),
223        };
224        let use_cache = use_cache && string_ca.len() > 50;
225        let fmt = strptime::compile_fmt(fmt)?;
226
227        // We can use the fast parser.
228        let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {
229            let mut strptime_cache = StrpTimeState::default();
230            let mut convert = LruCachedFunc::new(
231                |s: &str| {
232                    // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
233                    match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
234                        // Fallback to chrono.
235                        None => NaiveDate::parse_from_str(s, &fmt).ok(),
236                        Some(ndt) => Some(ndt.date()),
237                    }
238                    .map(naive_date_to_date)
239                },
240                (string_ca.len() as f64).sqrt() as usize,
241            );
242            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
243        } else {
244            let mut convert = LruCachedFunc::new(
245                |s| {
246                    let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;
247                    Some(naive_date_to_date(naive_date))
248                },
249                (string_ca.len() as f64).sqrt() as usize,
250            );
251            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
252        };
253
254        Ok(ca.with_name(string_ca.name().clone()).into_date())
255    }
256
257    #[cfg(feature = "dtype-datetime")]
258    /// Parsing string values and return a [`DatetimeChunked`].
259    fn as_datetime(
260        &self,
261        fmt: Option<&str>,
262        tu: TimeUnit,
263        use_cache: bool,
264        tz_aware: bool,
265        tz: Option<&TimeZone>,
266        ambiguous: &StringChunked,
267    ) -> PolarsResult<DatetimeChunked> {
268        let string_ca = self.as_string();
269        let fmt = match fmt {
270            Some(fmt) => fmt,
271            None => return infer::to_datetime(string_ca, tu, tz, ambiguous, true),
272        };
273        let fmt = strptime::compile_fmt(fmt)?;
274        let use_cache = use_cache && string_ca.len() > 50;
275
276        let func = match tu {
277            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
278            TimeUnit::Microseconds => datetime_to_timestamp_us,
279            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
280        };
281
282        if tz_aware {
283            #[cfg(feature = "timezones")]
284            {
285                let mut convert = LruCachedFunc::new(
286                    |s: &str| {
287                        let dt = DateTime::parse_from_str(s, &fmt).ok()?;
288                        Some(func(dt.naive_utc()))
289                    },
290                    (string_ca.len() as f64).sqrt() as usize,
291                );
292                Ok(
293                    unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
294                        .with_name(string_ca.name().clone())
295                        .into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC))),
296                )
297            }
298            #[cfg(not(feature = "timezones"))]
299            {
300                panic!("activate 'timezones' feature")
301            }
302        } else {
303            let transform = match tu {
304                TimeUnit::Nanoseconds => infer::transform_datetime_ns,
305                TimeUnit::Microseconds => infer::transform_datetime_us,
306                TimeUnit::Milliseconds => infer::transform_datetime_ms,
307            };
308            // We can use the fast parser.
309            let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
310                let mut strptime_cache = StrpTimeState::default();
311                let mut convert = LruCachedFunc::new(
312                    |s: &str| {
313                        // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
314                        match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }
315                        {
316                            None => transform(s, &fmt),
317                            Some(ndt) => Some(func(ndt)),
318                        }
319                    },
320                    (string_ca.len() as f64).sqrt() as usize,
321                );
322                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
323            } else {
324                let mut convert = LruCachedFunc::new(
325                    |s| transform(s, &fmt),
326                    (string_ca.len() as f64).sqrt() as usize,
327                );
328                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
329            };
330            let dt = ca
331                .with_name(string_ca.name().clone())
332                .into_datetime(tu, None);
333            match tz {
334                #[cfg(feature = "timezones")]
335                Some(tz) => polars_ops::prelude::replace_time_zone(
336                    &dt,
337                    Some(tz),
338                    ambiguous,
339                    NonExistent::Raise,
340                ),
341                _ => Ok(dt),
342            }
343        }
344    }
345}
346
347pub trait AsString {
348    fn as_string(&self) -> &StringChunked;
349}
350
351impl AsString for StringChunked {
352    fn as_string(&self) -> &StringChunked {
353        self
354    }
355}
356
357impl StringMethods for StringChunked {}