polars_time/chunkedarray/string/
infer.rs

1use arrow::array::PrimitiveArray;
2use chrono::format::ParseErrorKind;
3use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};
4use polars_core::prelude::*;
5
6use super::patterns::{self, Pattern};
7#[cfg(feature = "dtype-date")]
8use crate::chunkedarray::date::naive_date_to_date;
9use crate::chunkedarray::string::strptime;
10use crate::prelude::string::strptime::StrpTimeState;
11
12polars_utils::regex_cache::cached_regex! {
13    static DATETIME_DMY_RE = r#"(?x)
14        ^
15        ['"]?                        # optional quotes
16        (?:\d{1,2})                  # day
17        [-/\.]                       # separator
18        (?P<month>[01]?\d{1})        # month
19        [-/\.]                       # separator
20        (?:\d{4,})                   # year
21        (?:
22            [T\ ]                    # separator
23            (?:\d{1,2})              # hour
24            :?                       # separator
25            (?:\d{1,2})              # minute
26            (?:
27                :?                   # separator
28                (?:\d{1,2})          # second
29                (?:
30                    \.(?:\d{1,9})    # subsecond
31                )?
32            )?
33        )?
34        ['"]?                        # optional quotes
35        $
36        "#;
37
38    static DATETIME_YMD_RE = r#"(?x)
39            ^
40            ['"]?                      # optional quotes
41            (?:\d{4,})                 # year
42            [-/\.]                     # separator
43            (?P<month>[01]?\d{1})      # month
44            [-/\.]                     # separator
45            (?:\d{1,2})                # day
46            (?:
47                [T\ ]                  # separator
48                (?:\d{1,2})            # hour
49                :?                     # separator
50                (?:\d{1,2})            # minute
51                (?:
52                    :?                 # separator
53                    (?:\d{1,2})        # seconds
54                    (?:
55                        \.(?:\d{1,9})  # subsecond
56                    )?
57                )?
58            )?
59            ['"]?                      # optional quotes
60            $
61            "#;
62
63    static DATETIME_YMDZ_RE = r#"(?x)
64            ^
65            ['"]?                  # optional quotes
66            (?:\d{4,})             # year
67            [-/\.]                 # separator
68            (?P<month>[01]?\d{1})  # month
69            [-/\.]                 # separator
70            (?:\d{1,2})            # year
71            [T\ ]                  # separator
72            (?:\d{2})              # hour
73            :?                     # separator
74            (?:\d{2})              # minute
75            (?:
76                :?                 # separator
77                (?:\d{2})          # second
78                (?:
79                    \.(?:\d{1,9})  # subsecond
80                )?
81            )?
82            (?:
83                # offset (e.g. +01:00)
84                [+-](?:\d{2})
85                :?
86                (?:\d{2})
87                # or Zulu suffix
88                |Z
89            )
90            ['"]?                  # optional quotes
91            $
92            "#;
93}
94
95impl Pattern {
96    pub fn is_inferable(&self, val: &str) -> bool {
97        match self {
98            Pattern::DateDMY => true, // there are very few Date patterns, so it's cheaper
99            Pattern::DateYMD => true, // to just try them
100            Pattern::Time => true,
101            Pattern::DatetimeDMY => match DATETIME_DMY_RE.captures(val) {
102                Some(search) => (1..=12).contains(
103                    &search
104                        .name("month")
105                        .unwrap()
106                        .as_str()
107                        .parse::<u8>()
108                        .unwrap(),
109                ),
110                None => false,
111            },
112            Pattern::DatetimeYMD => match DATETIME_YMD_RE.captures(val) {
113                Some(search) => (1..=12).contains(
114                    &search
115                        .name("month")
116                        .unwrap()
117                        .as_str()
118                        .parse::<u8>()
119                        .unwrap(),
120                ),
121                None => false,
122            },
123            Pattern::DatetimeYMDZ => match DATETIME_YMDZ_RE.captures(val) {
124                Some(search) => (1..=12).contains(
125                    &search
126                        .name("month")
127                        .unwrap()
128                        .as_str()
129                        .parse::<u8>()
130                        .unwrap(),
131                ),
132                None => false,
133            },
134        }
135    }
136}
137
/// Byte-level parser that turns a raw string value into a physical temporal
/// value of type `T`.
pub trait StrpTimeParser<T> {
    /// Attempts to parse `val`, returning `None` when it cannot be parsed.
    ///
    /// `time_unit` selects the resolution of the produced value for
    /// implementations that need one; the `i32` (date) implementation in this
    /// file ignores it.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;
}
141
#[cfg(feature = "dtype-datetime")]
impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {
    /// Parses `val` into a timestamp expressed in `time_unit`.
    ///
    /// The cached format (`latest_fmt`) is tried first. If it does not match,
    /// every format in `self.patterns` is tried with its own computed length;
    /// the first one that parses becomes the new cached format. Returns
    /// `None` when nothing matches, or when `strptime::fmt_len` yields no
    /// length for the cached format.
    ///
    /// # Panics
    /// Panics if `time_unit` is `None`.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        let transform = match time_unit {
            Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,
            Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,
            Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,
            _ => unreachable!(), // time_unit has to be provided for datetime
        };

        // SAFETY: `self.fmt_len` was computed from `self.latest_fmt` (above, or
        // when the format was cached below), so format and length agree.
        // NOTE(review): presumably this is the invariant `StrpTimeState::parse`
        // relies on — confirm against its documentation.
        let cached = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = cached {
            return Some(transform(ndt));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for &fmt in self.patterns {
            // Every candidate gets its own length; reusing the cached format's
            // length would pair a format with a length that does not belong to it.
            let fmt_len = match strptime::fmt_len(fmt.as_bytes()) {
                Some(len) => len,
                None => continue,
            };
            // SAFETY: `fmt_len` was computed from `fmt` on the line above.
            let parsed = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = parsed {
                // Keep the cached format and its length in sync.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                // Convert with the requested unit, exactly like the fast path.
                return Some(transform(ndt));
            }
        }
        None
    }
}
179
#[cfg(feature = "dtype-date")]
impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {
    /// Parses `val` into a physical date value; `_time_unit` is ignored.
    ///
    /// The cached format (`latest_fmt`) is tried first. If it does not match,
    /// every format in `self.patterns` is tried with its own computed length;
    /// the first one that parses becomes the new cached format. Returns
    /// `None` when nothing matches, or when `strptime::fmt_len` yields no
    /// length for the cached format.
    fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }

        // SAFETY: `self.fmt_len` was computed from `self.latest_fmt` (above, or
        // when the format was cached below), so format and length agree.
        // NOTE(review): presumably this is the invariant `StrpTimeState::parse`
        // relies on — confirm against its documentation.
        let cached = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = cached {
            return Some(naive_date_to_date(ndt.date()));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for &fmt in self.patterns {
            // Every candidate gets its own length; reusing the cached format's
            // length would pair a format with a length that does not belong to it.
            let fmt_len = match strptime::fmt_len(fmt.as_bytes()) {
                Some(len) => len,
                None => continue,
            };
            // SAFETY: `fmt_len` was computed from `fmt` on the line above.
            let parsed = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = parsed {
                // Keep the cached format and its length in sync.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                return Some(naive_date_to_date(ndt.date()));
            }
        }
        None
    }
}
211
212#[derive(Clone)]
213pub struct DatetimeInfer<T: PolarsNumericType> {
214    pub pattern: Pattern,
215    patterns: &'static [&'static str],
216    latest_fmt: &'static str,
217    transform: fn(&str, &str) -> Option<T::Native>,
218    transform_bytes: StrpTimeState,
219    fmt_len: u16,
220    pub logical_type: DataType,
221}
222
/// Fallible conversion from `T` that additionally takes an optional
/// [`TimeUnit`].
pub trait TryFromWithUnit<T>: Sized {
    /// Error type associated with the conversion. Note that
    /// `try_from_with_unit` itself reports failures through `PolarsResult`.
    type Error;
    /// Builds `Self` from `pattern`; `unit` is the time unit to use, for
    /// implementations that need one.
    fn try_from_with_unit(pattern: T, unit: Option<TimeUnit>) -> PolarsResult<Self>;
}
227
#[cfg(feature = "dtype-datetime")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {
    type Error = PolarsError;

    /// Builds a datetime parser for `value` that produces timestamps in
    /// `time_unit`.
    ///
    /// Date patterns are widened to the matching datetime pattern family.
    ///
    /// # Errors
    /// Returns a `ComputeError` when `time_unit` is `None`; a datetime parser
    /// cannot be built without a resolution.
    fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Report the missing unit through the `Result` instead of panicking.
        let time_unit = match time_unit {
            Some(unit) => unit,
            None => polars_bail!(ComputeError: "time_unit must be provided for datetime"),
        };

        // Offset-carrying strings need the tz-aware parser; everything else
        // goes through the naive one for the requested unit.
        let transform = match (time_unit, value) {
            (TimeUnit::Milliseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ms,
            (TimeUnit::Milliseconds, _) => transform_datetime_ms,
            (TimeUnit::Microseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_us,
            (TimeUnit::Microseconds, _) => transform_datetime_us,
            (TimeUnit::Nanoseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ns,
            (TimeUnit::Nanoseconds, _) => transform_datetime_ns,
        };
        let (pattern, patterns) = match value {
            Pattern::DatetimeDMY | Pattern::DateDMY => {
                (Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)
            },
            Pattern::DatetimeYMD | Pattern::DateYMD => {
                (Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)
            },
            Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),
            Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns,
            // Start with the first format of the family.
            latest_fmt: patterns[0],
            transform,
            transform_bytes: StrpTimeState::default(),
            // 0 = length not computed yet; `parse_bytes` fills it in lazily.
            fmt_len: 0,
            logical_type: DataType::Datetime(time_unit, None),
        })
    }
}
265
#[cfg(feature = "dtype-date")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {
    type Error = PolarsError;

    /// Builds a date parser for `value`; `_time_unit` is ignored.
    ///
    /// # Errors
    /// Returns a `ComputeError` for every pattern other than
    /// `Pattern::DateDMY` and `Pattern::DateYMD`.
    fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Select the format family first so the parser is assembled in one place.
        let formats: &'static [&'static str] = match value {
            Pattern::DateDMY => patterns::DATE_D_M_Y,
            Pattern::DateYMD => patterns::DATE_Y_M_D,
            _ => polars_bail!(ComputeError: "could not convert pattern"),
        };

        Ok(DatetimeInfer {
            // `value` is one of the two date patterns at this point.
            pattern: value,
            patterns: formats,
            latest_fmt: formats[0],
            transform: transform_date,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Date,
        })
    }
}
294
295impl<T: PolarsNumericType> DatetimeInfer<T> {
296    pub fn parse(&mut self, val: &str) -> Option<T::Native> {
297        match (self.transform)(val, self.latest_fmt) {
298            Some(parsed) => Some(parsed),
299            // try other patterns
300            None => {
301                if !self.pattern.is_inferable(val) {
302                    return None;
303                }
304                for fmt in self.patterns {
305                    self.fmt_len = 0;
306                    if let Some(parsed) = (self.transform)(val, fmt) {
307                        self.latest_fmt = fmt;
308                        return Some(parsed);
309                    }
310                }
311                None
312            },
313        }
314    }
315}
316
317impl<T: PolarsNumericType> DatetimeInfer<T>
318where
319    ChunkedArray<T>: IntoSeries,
320{
321    fn coerce_string(&mut self, ca: &StringChunked) -> Series {
322        let chunks = ca.downcast_iter().map(|array| {
323            let iter = array
324                .into_iter()
325                .map(|opt_val| opt_val.and_then(|val| self.parse(val)));
326            PrimitiveArray::from_trusted_len_iter(iter)
327        });
328        ChunkedArray::from_chunk_iter(ca.name().clone(), chunks)
329            .into_series()
330            .cast(&self.logical_type)
331            .unwrap()
332            .with_name(ca.name().clone())
333    }
334}
335
/// Parses `val` as a date using `fmt` and converts it with
/// `naive_date_to_date`; returns `None` when parsing fails.
#[cfg(feature = "dtype-date")]
fn transform_date(val: &str, fmt: &str) -> Option<i32> {
    match NaiveDate::parse_from_str(val, fmt) {
        Ok(date) => Some(naive_date_to_date(date)),
        Err(_) => None,
    }
}
342
/// Parses `val` with `fmt` into a nanosecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// retried as a plain date and midnight is used as the time of day. Any other
/// failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    let naive = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            // Fall back to a date-only parse at 00:00:00.
            let date = NaiveDate::parse_from_str(val, fmt).ok()?;
            date.and_hms_opt(0, 0, 0).unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ns(naive))
}
355
356fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
357    let dt = DateTime::parse_from_str(val, fmt);
358    dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc()))
359}
360
/// Parses `val` with `fmt` into a microsecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// retried as a plain date and midnight is used as the time of day. Any other
/// failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    let naive = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            // Fall back to a date-only parse at 00:00:00.
            let date = NaiveDate::parse_from_str(val, fmt).ok()?;
            date.and_hms_opt(0, 0, 0).unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_us(naive))
}
373
374fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {
375    let dt = DateTime::parse_from_str(val, fmt);
376    dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc()))
377}
378
/// Parses `val` with `fmt` into a millisecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// retried as a plain date and midnight is used as the time of day. Any other
/// failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    let naive = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            // Fall back to a date-only parse at 00:00:00.
            let date = NaiveDate::parse_from_str(val, fmt).ok()?;
            date.and_hms_opt(0, 0, 0).unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ms(naive))
}
391
392fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
393    let dt = DateTime::parse_from_str(val, fmt);
394    dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc()))
395}
396
397pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
398    // Dates come first, because we see datetimes as superset of dates
399    infer_pattern_date_single(val)
400        .or_else(|| infer_pattern_time_single(val))
401        .or_else(|| infer_pattern_datetime_single(val))
402}
403
404fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
405    if patterns::DATETIME_D_M_Y.iter().any(|fmt| {
406        NaiveDateTime::parse_from_str(val, fmt).is_ok()
407            || NaiveDate::parse_from_str(val, fmt).is_ok()
408    }) {
409        Some(Pattern::DatetimeDMY)
410    } else if patterns::DATETIME_Y_M_D.iter().any(|fmt| {
411        NaiveDateTime::parse_from_str(val, fmt).is_ok()
412            || NaiveDate::parse_from_str(val, fmt).is_ok()
413    }) {
414        Some(Pattern::DatetimeYMD)
415    } else if patterns::DATETIME_Y_M_D_Z
416        .iter()
417        .any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
418    {
419        Some(Pattern::DatetimeYMDZ)
420    } else {
421        None
422    }
423}
424
425fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
426    if patterns::DATE_D_M_Y
427        .iter()
428        .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
429    {
430        Some(Pattern::DateDMY)
431    } else if patterns::DATE_Y_M_D
432        .iter()
433        .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
434    {
435        Some(Pattern::DateYMD)
436    } else {
437        None
438    }
439}
440
441fn infer_pattern_time_single(val: &str) -> Option<Pattern> {
442    patterns::TIME_H_M_S
443        .iter()
444        .any(|fmt| NaiveTime::parse_from_str(val, fmt).is_ok())
445        .then_some(Pattern::Time)
446}
447
/// Converts a string column to a datetime column by inferring the format
/// from the first value that matches a known datetime pattern.
///
/// * `tu` - time unit of the resulting column.
/// * `tz` - optional time zone to attach to the result.
/// * `_ambiguous` - passed to `replace_time_zone` when a time zone is applied
///   to values parsed without an offset (only with the `timezones` feature).
///
/// An all-null input yields an all-null datetime column. Returns an error
/// when no value matches any known datetime pattern.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn to_datetime(
    ca: &StringChunked,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
    _ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    match ca.first_non_null() {
        // Nothing to infer from: return nulls with the requested dtype.
        None => {
            Ok(Int64Chunked::full_null(ca.name().clone(), ca.len()).into_datetime(tu, tz.cloned()))
        },
        Some(idx) => {
            // Only look at values from the first non-null one onwards.
            let subset = ca.slice(idx as i64, ca.len());
            // The first value that matches any datetime pattern decides the
            // pattern family for the whole column.
            let pattern = subset
                .into_iter()
                .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
                .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
            let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;
            // Each arm evaluates to a nested result: the outer `?` handles the
            // `.datetime()` downcast, the inner result is the function's value.
            match pattern {
                // Offset-carrying strings: the requested zone (default "UTC")
                // is set directly on the parsed column.
                #[cfg(feature = "timezones")]
                Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| {
                    let mut ca = ca.clone();
                    // `tz` has already been validated.
                    ca.set_time_unit_and_time_zone(
                        tu,
                        tz.cloned()
                            .unwrap_or_else(|| PlSmallStr::from_static("UTC")),
                    )?;
                    Ok(ca)
                })?,
                // Strings without an offset: parse first, then apply the
                // requested zone (if any) via `replace_time_zone`.
                _ => infer.coerce_string(ca).datetime().map(|ca| {
                    let mut ca = ca.clone();
                    ca.set_time_unit(tu);
                    match tz {
                        #[cfg(feature = "timezones")]
                        Some(tz) => polars_ops::prelude::replace_time_zone(
                            &ca,
                            Some(tz),
                            _ambiguous,
                            NonExistent::Raise,
                        ),
                        // No zone requested, or the `timezones` feature is disabled.
                        _ => Ok(ca),
                    }
                })?,
            }
        },
    }
}
/// Converts a string column to a date column by inferring the format from
/// the first value that matches a known date pattern.
///
/// An all-null input yields an all-null date column.
///
/// # Errors
/// Returns an error when no value matches any known date pattern, when the
/// parser cannot be built for the inferred pattern, or when the parsed
/// series is not a date column.
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {
    match ca.first_non_null() {
        None => Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date()),
        Some(idx) => {
            // Only look at values from the first non-null one onwards.
            let subset = ca.slice(idx as i64, ca.len());
            let pattern = subset
                .into_iter()
                .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
                .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
            // Propagate a construction failure instead of panicking, as
            // `to_datetime` does.
            let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None)?;
            infer.coerce_string(ca).date().cloned()
        },
    }
}