polars_time/chunkedarray/string/
mod.rs

1pub mod infer;
2use chrono::DateTime;
3mod patterns;
4mod strptime;
5use chrono::ParseError;
6use chrono::format::ParseErrorKind;
7pub use patterns::Pattern;
8#[cfg(feature = "dtype-time")]
9use polars_core::chunked_array::temporal::time_to_time64ns;
10use polars_core::prelude::arity::unary_elementwise;
11use polars_utils::cache::LruCachedFunc;
12
13use super::*;
14#[cfg(feature = "dtype-date")]
15use crate::chunkedarray::date::naive_date_to_date;
16use crate::prelude::string::strptime::StrpTimeState;
17
#[cfg(feature = "dtype-time")]
/// Returns the first known time format under which `val` parses.
///
/// `convert` is invoked as `convert(val, fmt)` for every candidate in
/// `patterns::TIME_H_M_S`, in order. The first format for which it returns
/// `Ok` is returned; `None` means no candidate succeeded.
fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // NOTE(review): this list used to be chained with itself, which only
    // re-tried formats that had already failed. If a second pattern list was
    // meant to be searched as well, chain it here.
    patterns::TIME_H_M_S
        .iter()
        .find(|fmt| convert(val, fmt).is_ok())
        .copied()
}
30
31fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
32// (string, fmt) -> PolarsResult
33where
34    F: Fn(&str, &str) -> chrono::ParseResult<K>,
35{
36    patterns::DATETIME_Y_M_D
37        .iter()
38        .chain(patterns::DATETIME_D_M_Y)
39        .find(|fmt| convert(val, fmt).is_ok())
40        .copied()
41}
42
43fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
44// (string, fmt) -> PolarsResult
45where
46    F: Fn(&str, &str) -> chrono::ParseResult<K>,
47{
48    patterns::DATE_Y_M_D
49        .iter()
50        .chain(patterns::DATE_D_M_Y)
51        .find(|fmt| convert(val, fmt).is_ok())
52        .copied()
53}
54
55struct ParseErrorByteCopy(ParseErrorKind);
56
57impl From<ParseError> for ParseErrorByteCopy {
58    fn from(e: ParseError) -> Self {
59        ParseErrorByteCopy(e.kind())
60    }
61}
62
63fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {
64    let idx = ca.first_non_null().ok_or_else(|| {
65        polars_err!(ComputeError:
66            "unable to determine date parsing format, all values are null",
67        )
68    })?;
69    Ok(ca.get(idx).expect("should not be null"))
70}
71
#[cfg(feature = "dtype-datetime")]
/// Guesses a datetime format from the first non-null value of `ca_string`.
///
/// The known datetime patterns are tried with `NaiveDateTime::parse_from_str`
/// first and, if none fits, again with `NaiveDate::parse_from_str`.
///
/// # Errors
/// Fails when every value is null or when no known pattern parses the value.
fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    if let Some(fmt) = datetime_pattern(first, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    match datetime_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "datetime")),
    }
}
79
#[cfg(feature = "dtype-date")]
/// Guesses a date format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when every value is null or when no known date pattern parses the
/// value with `NaiveDate::parse_from_str`.
fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match date_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
85
#[cfg(feature = "dtype-time")]
/// Guesses a time format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when every value is null or when no known time pattern parses the
/// value with `NaiveTime::parse_from_str`.
fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match time_pattern(first, NaiveTime::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "time")),
    }
}
91
92pub trait StringMethods: AsString {
93    #[cfg(feature = "dtype-time")]
94    /// Parsing string values and return a [`TimeChunked`]
95    fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
96        let string_ca = self.as_string();
97        let fmt = match fmt {
98            Some(fmt) => fmt,
99            None => sniff_fmt_time(string_ca)?,
100        };
101        let use_cache = use_cache && string_ca.len() > 50;
102
103        let mut convert = LruCachedFunc::new(
104            |s| {
105                let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;
106                Some(time_to_time64ns(&naive_time))
107            },
108            (string_ca.len() as f64).sqrt() as usize,
109        );
110        let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
111        Ok(ca.with_name(string_ca.name().clone()).into())
112    }
113
114    #[cfg(feature = "dtype-date")]
115    /// Parsing string values and return a [`DateChunked`]
116    /// Different from `as_date` this function allows matches that not contain the whole string
117    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
118    fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
119        let string_ca = self.as_string();
120        let fmt = match fmt {
121            Some(fmt) => fmt,
122            None => sniff_fmt_date(string_ca)?,
123        };
124        let ca = unary_elementwise(string_ca, |opt_s| {
125            let mut s = opt_s?;
126            let fmt_len = fmt.len();
127
128            for i in 1..(s.len().saturating_sub(fmt_len)) {
129                if s.is_empty() {
130                    return None;
131                }
132                match NaiveDate::parse_from_str(s, fmt).map(naive_date_to_date) {
133                    Ok(nd) => return Some(nd),
134                    Err(e) => match ParseErrorByteCopy::from(e).0 {
135                        ParseErrorKind::TooLong => {
136                            s = &s[..s.len() - 1];
137                        },
138                        _ => {
139                            s = &s[i..];
140                        },
141                    },
142                }
143            }
144            None
145        });
146        Ok(ca.with_name(string_ca.name().clone()).into())
147    }
148
149    #[cfg(feature = "dtype-datetime")]
150    /// Parsing string values and return a [`DatetimeChunked`]
151    /// Different from `as_datetime` this function allows matches that not contain the whole string
152    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
153    fn as_datetime_not_exact(
154        &self,
155        fmt: Option<&str>,
156        tu: TimeUnit,
157        tz_aware: bool,
158        tz: Option<&TimeZone>,
159        _ambiguous: &StringChunked,
160    ) -> PolarsResult<DatetimeChunked> {
161        let string_ca = self.as_string();
162        let fmt = match fmt {
163            Some(fmt) => fmt,
164            None => sniff_fmt_datetime(string_ca)?,
165        };
166
167        let func = match tu {
168            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
169            TimeUnit::Microseconds => datetime_to_timestamp_us,
170            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
171        };
172
173        let ca = unary_elementwise(string_ca, |opt_s| {
174            let mut s = opt_s?;
175            let fmt_len = fmt.len();
176
177            for i in 1..(s.len().saturating_sub(fmt_len)) {
178                if s.is_empty() {
179                    return None;
180                }
181                let timestamp = if tz_aware {
182                    DateTime::parse_from_str(s, fmt).map(|dt| func(dt.naive_utc()))
183                } else {
184                    NaiveDateTime::parse_from_str(s, fmt).map(func)
185                };
186                match timestamp {
187                    Ok(ts) => return Some(ts),
188                    Err(e) => {
189                        let e: ParseErrorByteCopy = e.into();
190                        match e.0 {
191                            ParseErrorKind::TooLong => {
192                                s = &s[..s.len() - 1];
193                            },
194                            _ => {
195                                s = &s[i..];
196                            },
197                        }
198                    },
199                }
200            }
201            None
202        })
203        .with_name(string_ca.name().clone());
204        match (tz_aware, tz) {
205            #[cfg(feature = "timezones")]
206            (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
207                &ca.into_datetime(tu, None),
208                Some(tz),
209                _ambiguous,
210                NonExistent::Raise,
211            ),
212            #[cfg(feature = "timezones")]
213            (true, tz) => Ok(ca.into_datetime(
214                tu,
215                tz.cloned().or_else(|| Some(PlSmallStr::from_static("UTC"))),
216            )),
217            _ => Ok(ca.into_datetime(tu, None)),
218        }
219    }
220
221    #[cfg(feature = "dtype-date")]
222    /// Parsing string values and return a [`DateChunked`]
223    fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
224        let string_ca = self.as_string();
225        let fmt = match fmt {
226            Some(fmt) => fmt,
227            None => return infer::to_date(string_ca),
228        };
229        let use_cache = use_cache && string_ca.len() > 50;
230        let fmt = strptime::compile_fmt(fmt)?;
231
232        // We can use the fast parser.
233        let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {
234            let mut strptime_cache = StrpTimeState::default();
235            let mut convert = LruCachedFunc::new(
236                |s: &str| {
237                    // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
238                    match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
239                        // Fallback to chrono.
240                        None => NaiveDate::parse_from_str(s, &fmt).ok(),
241                        Some(ndt) => Some(ndt.date()),
242                    }
243                    .map(naive_date_to_date)
244                },
245                (string_ca.len() as f64).sqrt() as usize,
246            );
247            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
248        } else {
249            let mut convert = LruCachedFunc::new(
250                |s| {
251                    let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;
252                    Some(naive_date_to_date(naive_date))
253                },
254                (string_ca.len() as f64).sqrt() as usize,
255            );
256            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
257        };
258
259        Ok(ca.with_name(string_ca.name().clone()).into())
260    }
261
262    #[cfg(feature = "dtype-datetime")]
263    /// Parsing string values and return a [`DatetimeChunked`].
264    fn as_datetime(
265        &self,
266        fmt: Option<&str>,
267        tu: TimeUnit,
268        use_cache: bool,
269        tz_aware: bool,
270        tz: Option<&TimeZone>,
271        ambiguous: &StringChunked,
272    ) -> PolarsResult<DatetimeChunked> {
273        let string_ca = self.as_string();
274        let fmt = match fmt {
275            Some(fmt) => fmt,
276            None => return infer::to_datetime(string_ca, tu, tz, ambiguous),
277        };
278        let fmt = strptime::compile_fmt(fmt)?;
279        let use_cache = use_cache && string_ca.len() > 50;
280
281        let func = match tu {
282            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
283            TimeUnit::Microseconds => datetime_to_timestamp_us,
284            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
285        };
286
287        if tz_aware {
288            #[cfg(feature = "timezones")]
289            {
290                let mut convert = LruCachedFunc::new(
291                    |s: &str| {
292                        let dt = DateTime::parse_from_str(s, &fmt).ok()?;
293                        Some(func(dt.naive_utc()))
294                    },
295                    (string_ca.len() as f64).sqrt() as usize,
296                );
297                Ok(
298                    unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
299                        .with_name(string_ca.name().clone())
300                        .into_datetime(
301                            tu,
302                            Some(
303                                tz.cloned()
304                                    .unwrap_or_else(|| PlSmallStr::from_static("UTC")),
305                            ),
306                        ),
307                )
308            }
309            #[cfg(not(feature = "timezones"))]
310            {
311                panic!("activate 'timezones' feature")
312            }
313        } else {
314            let transform = match tu {
315                TimeUnit::Nanoseconds => infer::transform_datetime_ns,
316                TimeUnit::Microseconds => infer::transform_datetime_us,
317                TimeUnit::Milliseconds => infer::transform_datetime_ms,
318            };
319            // We can use the fast parser.
320            let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
321                let mut strptime_cache = StrpTimeState::default();
322                let mut convert = LruCachedFunc::new(
323                    |s: &str| {
324                        // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
325                        match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }
326                        {
327                            None => transform(s, &fmt),
328                            Some(ndt) => Some(func(ndt)),
329                        }
330                    },
331                    (string_ca.len() as f64).sqrt() as usize,
332                );
333                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
334            } else {
335                let mut convert = LruCachedFunc::new(
336                    |s| transform(s, &fmt),
337                    (string_ca.len() as f64).sqrt() as usize,
338                );
339                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
340            };
341            let dt = ca
342                .with_name(string_ca.name().clone())
343                .into_datetime(tu, None);
344            match tz {
345                #[cfg(feature = "timezones")]
346                Some(tz) => polars_ops::prelude::replace_time_zone(
347                    &dt,
348                    Some(tz),
349                    ambiguous,
350                    NonExistent::Raise,
351                ),
352                _ => Ok(dt),
353            }
354        }
355    }
356}
357
358pub trait AsString {
359    fn as_string(&self) -> &StringChunked;
360}
361
362impl AsString for StringChunked {
363    fn as_string(&self) -> &StringChunked {
364        self
365    }
366}
367
368impl StringMethods for StringChunked {}