polars_time/chunkedarray/string/
infer.rs

1use arrow::array::PrimitiveArray;
2use chrono::format::ParseErrorKind;
3use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};
4use polars_core::prelude::*;
5
6use super::patterns::{self, Pattern};
7#[cfg(feature = "dtype-date")]
8use crate::chunkedarray::date::naive_date_to_date;
9use crate::chunkedarray::string::strptime;
10use crate::prelude::string::strptime::StrpTimeState;
11
12polars_utils::regex_cache::cached_regex! {
13    static DATETIME_DMY_RE = r#"(?x)
14        ^
15        ['"]?                        # optional quotes
16        (?:\d{1,2})                  # day
17        [-/\.]                       # separator
18        (?P<month>[01]?\d{1})        # month
19        [-/\.]                       # separator
20        (?:\d{4,})                   # year
21        (?:
22            [T\ ]                    # separator
23            (?:\d{1,2})              # hour
24            :?                       # separator
25            (?:\d{1,2})              # minute
26            (?:
27                :?                   # separator
28                (?:\d{1,2})          # second
29                (?:
30                    \.(?:\d{1,9})    # subsecond
31                )?
32            )?
33        )?
34        ['"]?                        # optional quotes
35        $
36        "#;
37
38    static DATETIME_YMD_RE = r#"(?x)
39            ^
40            ['"]?                      # optional quotes
41            (?:\d{4,})                 # year
42            [-/\.]                     # separator
43            (?P<month>[01]?\d{1})      # month
44            [-/\.]                     # separator
45            (?:\d{1,2})                # day
46            (?:
47                [T\ ]                  # separator
48                (?:\d{1,2})            # hour
49                :?                     # separator
50                (?:\d{1,2})            # minute
51                (?:
52                    :?                 # separator
53                    (?:\d{1,2})        # seconds
54                    (?:
55                        \.(?:\d{1,9})  # subsecond
56                    )?
57                )?
58            )?
59            ['"]?                      # optional quotes
60            $
61            "#;
62
63    static DATETIME_YMDZ_RE = r#"(?x)
64            ^
65            ['"]?                  # optional quotes
66            (?:\d{4,})             # year
67            [-/\.]                 # separator
68            (?P<month>[01]?\d{1})  # month
69            [-/\.]                 # separator
70            (?:\d{1,2})            # year
71            [T\ ]                  # separator
72            (?:\d{2})              # hour
73            :?                     # separator
74            (?:\d{2})              # minute
75            (?:
76                :?                 # separator
77                (?:\d{2})          # second
78                (?:
79                    \.(?:\d{1,9})  # subsecond
80                )?
81            )?
82            (?:
83                # offset (e.g. +01:00, +0100, or +01)
84                [+-](?:\d{2})
85                (?::?\d{2})?
86                # or Zulu suffix
87                |Z
88            )
89            ['"]?                  # optional quotes
90            $
91            "#;
92}
93
94impl Pattern {
95    pub fn is_inferable(&self, val: &str) -> bool {
96        match self {
97            Pattern::DateDMY => true, // there are very few Date patterns, so it's cheaper
98            Pattern::DateYMD => true, // to just try them
99            Pattern::Time => true,
100            Pattern::DatetimeDMY => match DATETIME_DMY_RE.captures(val) {
101                Some(search) => (1..=12).contains(
102                    &search
103                        .name("month")
104                        .unwrap()
105                        .as_str()
106                        .parse::<u8>()
107                        .unwrap(),
108                ),
109                None => false,
110            },
111            Pattern::DatetimeYMD => match DATETIME_YMD_RE.captures(val) {
112                Some(search) => (1..=12).contains(
113                    &search
114                        .name("month")
115                        .unwrap()
116                        .as_str()
117                        .parse::<u8>()
118                        .unwrap(),
119                ),
120                None => false,
121            },
122            Pattern::DatetimeYMDZ => match DATETIME_YMDZ_RE.captures(val) {
123                Some(search) => (1..=12).contains(
124                    &search
125                        .name("month")
126                        .unwrap()
127                        .as_str()
128                        .parse::<u8>()
129                        .unwrap(),
130                ),
131                None => false,
132            },
133        }
134    }
135}
136
/// Parser that turns a raw byte slice into a physical temporal value of type `T`.
pub trait StrpTimeParser<T> {
    /// Parses `val`, returning `None` when it cannot be parsed.
    ///
    /// `time_unit` is an `Option` because not every implementation needs it:
    /// the datetime implementation in this file requires it, while the date
    /// implementation ignores it.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;
}
140
#[cfg(feature = "dtype-datetime")]
impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {
    /// Parses `val` into a timestamp expressed in `time_unit`.
    ///
    /// The most recently successful format is tried first. On failure every
    /// candidate format is tried in order; the first one that parses becomes
    /// the new `latest_fmt`. Returns `None` when the current format has no
    /// computable length or when no format parses `val`.
    ///
    /// # Panics
    ///
    /// Panics if `time_unit` is `None`; a time unit has to be provided for
    /// datetimes.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        let transform = match time_unit {
            Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,
            Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,
            Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,
            _ => unreachable!(), // time_unit has to be provided for datetime
        };

        // SAFETY: `self.fmt_len` is the length computed by `strptime::fmt_len`
        // for `self.latest_fmt`, which is one of the built-in format strings.
        // NOTE(review): presumably this is the contract `StrpTimeState::parse`
        // requires; confirm against its definition.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(transform(ndt));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for fmt in self.patterns {
            // The length belongs to the format being tried, not to the
            // previously successful one. Formats without a computable length
            // are skipped.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from this exact `fmt` just above.
            let candidate = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = candidate {
                // Keep the cached format and its length in sync.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                // Honour the requested time unit on the fallback path as well.
                return Some(transform(ndt));
            }
        }
        None
    }
}
178
#[cfg(feature = "dtype-date")]
impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {
    /// Parses `val` and converts its date part with `naive_date_to_date`.
    ///
    /// The most recently successful format is tried first. On failure every
    /// candidate format is tried in order; the first one that parses becomes
    /// the new `latest_fmt`. `_time_unit` is ignored. Returns `None` when the
    /// current format has no computable length or when no format parses `val`.
    fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }

        // SAFETY: `self.fmt_len` is the length computed by `strptime::fmt_len`
        // for `self.latest_fmt`, which is one of the built-in format strings.
        // NOTE(review): presumably this is the contract `StrpTimeState::parse`
        // requires; confirm against its definition.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(naive_date_to_date(ndt.date()));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for fmt in self.patterns {
            // The length belongs to the format being tried, not to the
            // previously successful one. Formats without a computable length
            // are skipped.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from this exact `fmt` just above.
            let candidate = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = candidate {
                // Keep the cached format and its length in sync.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                return Some(naive_date_to_date(ndt.date()));
            }
        }
        None
    }
}
210
/// Stateful parser that remembers which format string last succeeded, so that
/// consecutive values sharing a format are parsed without retrying every
/// candidate.
#[derive(Clone)]
pub struct DatetimeInfer<T: PolarsNumericType> {
    /// Pattern family this parser was built for.
    pub pattern: Pattern,
    /// Candidate format strings, tried in order when `latest_fmt` fails.
    patterns: &'static [&'static str],
    /// Format tried first; starts as the first candidate and is replaced by
    /// whichever candidate parses a value successfully.
    latest_fmt: &'static str,
    /// String parser, called as `transform(val, fmt)`.
    transform: fn(&str, &str) -> Option<T::Native>,
    /// State object used by the byte-level parsers (`parse_bytes`).
    transform_bytes: StrpTimeState,
    /// Cached result of `strptime::fmt_len`; `0` means "not computed yet".
    fmt_len: u16,
    /// Data type that `coerce_string` casts the parsed values to.
    pub logical_type: DataType,
}
221
/// Fallible conversion from `T` that additionally receives an optional time unit.
pub trait TryFromWithUnit<T>: Sized {
    /// Error type declared by the implementor. Note that `try_from_with_unit`
    /// itself returns a `PolarsResult`, not `Result<Self, Self::Error>`.
    type Error;
    /// Builds `Self` from `pattern`, using `unit` where the implementation needs it.
    fn try_from_with_unit(pattern: T, unit: Option<TimeUnit>) -> PolarsResult<Self>;
}
226
#[cfg(feature = "dtype-datetime")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {
    type Error = PolarsError;

    /// Builds a datetime parser for `value` that produces timestamps in
    /// `time_unit`.
    ///
    /// Date patterns are widened to the datetime pattern with the same field
    /// order.
    ///
    /// # Errors
    ///
    /// Returns a `ComputeError` when `time_unit` is `None`.
    fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // This function already returns a `PolarsResult`, so report the
        // missing unit through it instead of panicking.
        let time_unit = time_unit.ok_or_else(
            || polars_err!(ComputeError: "time_unit must be provided for datetime"),
        )?;

        // Offset-carrying patterns need the timezone-aware string parser.
        let transform = match (time_unit, value) {
            (TimeUnit::Milliseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ms,
            (TimeUnit::Milliseconds, _) => transform_datetime_ms,
            (TimeUnit::Microseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_us,
            (TimeUnit::Microseconds, _) => transform_datetime_us,
            (TimeUnit::Nanoseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ns,
            (TimeUnit::Nanoseconds, _) => transform_datetime_ns,
        };
        let (pattern, patterns) = match value {
            Pattern::DatetimeDMY | Pattern::DateDMY => {
                (Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)
            },
            Pattern::DatetimeYMD | Pattern::DateYMD => {
                (Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)
            },
            Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),
            Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns,
            latest_fmt: patterns[0],
            transform,
            transform_bytes: StrpTimeState::default(),
            // Computed lazily by `parse_bytes`.
            fmt_len: 0,
            logical_type: DataType::Datetime(time_unit, None),
        })
    }
}
264
#[cfg(feature = "dtype-date")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {
    type Error = PolarsError;

    /// Builds a date parser for one of the two date patterns; the time unit is
    /// ignored. Any other pattern yields a `ComputeError`.
    fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Only the pattern tag and its candidate formats differ between the
        // two supported variants.
        let (pattern, formats) = match value {
            Pattern::DateDMY => (Pattern::DateDMY, patterns::DATE_D_M_Y),
            Pattern::DateYMD => (Pattern::DateYMD, patterns::DATE_Y_M_D),
            _ => polars_bail!(ComputeError: "could not convert pattern"),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns: formats,
            latest_fmt: formats[0],
            transform: transform_date,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Date,
        })
    }
}
293
294impl<T: PolarsNumericType> DatetimeInfer<T> {
295    pub fn parse(&mut self, val: &str) -> Option<T::Native> {
296        match (self.transform)(val, self.latest_fmt) {
297            Some(parsed) => Some(parsed),
298            // try other patterns
299            None => {
300                if !self.pattern.is_inferable(val) {
301                    return None;
302                }
303                for fmt in self.patterns {
304                    self.fmt_len = 0;
305                    if let Some(parsed) = (self.transform)(val, fmt) {
306                        self.latest_fmt = fmt;
307                        return Some(parsed);
308                    }
309                }
310                None
311            },
312        }
313    }
314}
315
316impl<T: PolarsNumericType> DatetimeInfer<T> {
317    fn coerce_string(&mut self, ca: &StringChunked) -> Series {
318        let chunks = ca.downcast_iter().map(|array| {
319            let iter = array
320                .into_iter()
321                .map(|opt_val| opt_val.and_then(|val| self.parse(val)));
322            PrimitiveArray::from_trusted_len_iter(iter)
323        });
324        ChunkedArray::<T>::from_chunk_iter(ca.name().clone(), chunks)
325            .into_series()
326            .cast(&self.logical_type)
327            .unwrap()
328            .with_name(ca.name().clone())
329    }
330}
331
/// Parses `val` as a date using `fmt` and converts it with
/// `naive_date_to_date`; returns `None` when parsing fails.
#[cfg(feature = "dtype-date")]
fn transform_date(val: &str, fmt: &str) -> Option<i32> {
    match NaiveDate::parse_from_str(val, fmt) {
        Ok(date) => Some(naive_date_to_date(date)),
        Err(_) => None,
    }
}
338
/// Parses `val` with `fmt` into a nanosecond timestamp.
///
/// When chrono reports `NotEnough`, the value is re-parsed as a plain date
/// and midnight is used as the time.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    let parsed = NaiveDateTime::parse_from_str(val, fmt).or_else(|err| {
        if matches!(err.kind(), ParseErrorKind::NotEnough) {
            NaiveDate::parse_from_str(val, fmt).map(|nd| nd.and_hms_opt(0, 0, 0).unwrap())
        } else {
            Err(err)
        }
    });
    parsed.ok().map(datetime_to_timestamp_ns)
}
351
352fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
353    let dt = DateTime::parse_from_str(val, fmt);
354    dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc()))
355}
356
/// Parses `val` with `fmt` into a microsecond timestamp.
///
/// When chrono reports `NotEnough`, the value is re-parsed as a plain date
/// and midnight is used as the time.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    let parsed = NaiveDateTime::parse_from_str(val, fmt).or_else(|err| {
        if matches!(err.kind(), ParseErrorKind::NotEnough) {
            NaiveDate::parse_from_str(val, fmt).map(|nd| nd.and_hms_opt(0, 0, 0).unwrap())
        } else {
            Err(err)
        }
    });
    parsed.ok().map(datetime_to_timestamp_us)
}
369
370fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {
371    let dt = DateTime::parse_from_str(val, fmt);
372    dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc()))
373}
374
/// Parses `val` with `fmt` into a millisecond timestamp.
///
/// When chrono reports `NotEnough`, the value is re-parsed as a plain date
/// and midnight is used as the time.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    let parsed = NaiveDateTime::parse_from_str(val, fmt).or_else(|err| {
        if matches!(err.kind(), ParseErrorKind::NotEnough) {
            NaiveDate::parse_from_str(val, fmt).map(|nd| nd.and_hms_opt(0, 0, 0).unwrap())
        } else {
            Err(err)
        }
    });
    parsed.ok().map(datetime_to_timestamp_ms)
}
387
388fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
389    let dt = DateTime::parse_from_str(val, fmt);
390    dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc()))
391}
392
393pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
394    // Dates come first, because we see datetimes as superset of dates
395    infer_pattern_date_single(val)
396        .or_else(|| infer_pattern_time_single(val))
397        .or_else(|| infer_pattern_datetime_single(val))
398}
399
400fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
401    if patterns::DATETIME_D_M_Y.iter().any(|fmt| {
402        NaiveDateTime::parse_from_str(val, fmt).is_ok()
403            || NaiveDate::parse_from_str(val, fmt).is_ok()
404    }) {
405        Some(Pattern::DatetimeDMY)
406    } else if patterns::DATETIME_Y_M_D.iter().any(|fmt| {
407        NaiveDateTime::parse_from_str(val, fmt).is_ok()
408            || NaiveDate::parse_from_str(val, fmt).is_ok()
409    }) {
410        Some(Pattern::DatetimeYMD)
411    } else if patterns::DATETIME_Y_M_D_Z
412        .iter()
413        .any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
414    {
415        Some(Pattern::DatetimeYMDZ)
416    } else {
417        None
418    }
419}
420
421fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
422    if patterns::DATE_D_M_Y
423        .iter()
424        .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
425    {
426        Some(Pattern::DateDMY)
427    } else if patterns::DATE_Y_M_D
428        .iter()
429        .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
430    {
431        Some(Pattern::DateYMD)
432    } else {
433        None
434    }
435}
436
437fn infer_pattern_time_single(val: &str) -> Option<Pattern> {
438    patterns::TIME_H_M_S
439        .iter()
440        .any(|fmt| NaiveTime::parse_from_str(val, fmt).is_ok())
441        .then_some(Pattern::Time)
442}
443
/// Converts `ca` to datetimes without a caller-supplied format or time zone.
///
/// With `exact` the inference-based `to_datetime` is used, otherwise
/// `as_datetime_not_exact`. When `strict` is set and the conversion introduced
/// additional nulls, the failures are reported through
/// `handle_casting_failures`.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime_with_inferred_tz(
    ca: &StringChunked,
    tu: TimeUnit,
    strict: bool,
    exact: bool,
    ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    use super::StringMethods;

    let out = if exact {
        to_datetime(ca, tu, None, ambiguous, false)?
    } else {
        ca.as_datetime_not_exact(None, tu, false, None, ambiguous, false)?
    };

    // Nothing to report unless strict mode is on and new nulls appeared.
    if !strict || ca.null_count() == out.null_count() {
        return Ok(out);
    }

    let input = ca.clone().into_series();
    let output = out.clone().into_series();
    polars_core::utils::handle_casting_failures(&input, &output)?;

    Ok(out)
}
469
/// Converts a string column to datetimes by inferring the format from the data.
///
/// The pattern is inferred from the first value (starting at the first
/// non-null one) that matches any known datetime format. A column without
/// non-null values yields an all-null datetime column with the requested unit
/// and time zone.
///
/// # Errors
///
/// Returns an error when no value matches a known format, and propagates
/// errors from the time-zone handling.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime(
    ca: &StringChunked,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
    _ambiguous: &StringChunked,
    // Ensure that the inferred time_zone matches the given time_zone.
    ensure_matching_time_zone: bool,
) -> PolarsResult<DatetimeChunked> {
    match ca.first_non_null() {
        None => {
            Ok(Int64Chunked::full_null(ca.name().clone(), ca.len()).into_datetime(tu, tz.cloned()))
        },
        Some(idx) => {
            // Leading nulls carry no information, so inference starts at the
            // first non-null value.
            let subset = ca.slice(idx as i64, ca.len());
            let pattern = subset
                .into_iter()
                .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
                .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
            let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;
            // Each arm evaluates to the function's result: `map` wraps the
            // closure's `PolarsResult` in the one returned by `datetime()`,
            // and the trailing `?` unwraps only that outer layer.
            match pattern {
                // Values carry their own offset: they are parsed to UTC
                // instants and then labelled with `tz` (UTC when absent).
                #[cfg(feature = "timezones")]
                Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| {
                    polars_ensure!(
                        !ensure_matching_time_zone || tz.is_some(),
                        to_datetime_tz_mismatch
                    );

                    let mut ca = ca.clone();
                    // `tz` has already been validated.
                    ca.set_time_unit_and_time_zone(tu, tz.cloned().unwrap_or(TimeZone::UTC))?;
                    Ok(ca)
                })?,
                // Naive values: parse first, then attach the requested time
                // zone via `replace_time_zone` when one is given.
                _ => infer.coerce_string(ca).datetime().map(|ca| {
                    let mut ca = ca.clone();
                    ca.set_time_unit(tu);
                    match tz {
                        #[cfg(feature = "timezones")]
                        Some(tz) => polars_ops::prelude::replace_time_zone(
                            &ca,
                            Some(tz),
                            _ambiguous,
                            NonExistent::Raise,
                        ),
                        _ => Ok(ca),
                    }
                })?,
            }
        },
    }
}
/// Converts a string column to dates by inferring the format from the data.
///
/// The pattern is inferred from the first value (starting at the first
/// non-null one) that matches a known date format. A column without non-null
/// values yields an all-null date column.
///
/// # Errors
///
/// Returns an error when no value matches a known date format or when the
/// parser cannot be built for the inferred pattern.
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {
    match ca.first_non_null() {
        None => Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date()),
        Some(idx) => {
            // Leading nulls carry no information, so inference starts at the
            // first non-null value.
            let subset = ca.slice(idx as i64, ca.len());
            let pattern = subset
                .into_iter()
                .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
                .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
            // Propagate a construction failure instead of panicking.
            let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None)?;
            infer.coerce_string(ca).date().cloned()
        },
    }
}