polars_time/chunkedarray/string/
strptime.rs

1#![allow(unsafe_op_in_unsafe_fn)]
//! Much more opinionated, but also much faster strptime than the one given in Chrono.
3
4use chrono::{NaiveDate, NaiveDateTime};
5
6use crate::chunkedarray::{PolarsResult, polars_bail};
7
8polars_utils::regex_cache::cached_regex! {
9    static HOUR_PATTERN = r"%[_-]?[HkIl]";
10    static MINUTE_PATTERN = r"%[_-]?M";
11    static SECOND_PATTERN = r"%[_-]?S";
12    static TWELVE_HOUR_PATTERN = r"%[_-]?[Il]";
13    static MERIDIEM_PATTERN = r"%[_-]?[pP]";
14}
15
16#[inline]
17fn update_and_parse<T: atoi_simd::Parse>(
18    incr: usize,
19    offset: usize,
20    vals: &[u8],
21) -> Option<(T, usize)> {
22    // this maybe oob because we cannot entirely sure about fmt lengths
23    let new_offset = offset + incr;
24    let bytes = vals.get(offset..new_offset)?;
25    let (val, parsed) = atoi_simd::parse_any(bytes).ok()?;
26    if parsed != incr {
27        None
28    } else {
29        Some((val, new_offset))
30    }
31}
32
/// Parses a three-letter English month abbreviation (`Jan` .. `Dec`) at `offset` in `val`.
///
/// Returns the month number (1-12) and the offset just past the abbreviation. Returns `None`
/// when the bytes are not a known abbreviation or when fewer than three bytes are available
/// at `offset`.
#[inline]
fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let new_offset = offset.checked_add(3)?;
    // Checked slicing: a value that is too short is a parse failure, not a panic.
    let month = match val.get(offset..new_offset)? {
        b"Jan" => 1,
        b"Feb" => 2,
        b"Mar" => 3,
        b"Apr" => 4,
        b"May" => 5,
        b"Jun" => 6,
        b"Jul" => 7,
        b"Aug" => 8,
        b"Sep" => 9,
        b"Oct" => 10,
        b"Nov" => 11,
        b"Dec" => 12,
        _ => return None,
    };
    Some((month, new_offset))
}
/// Parses a full English month name (`January` .. `December`) at `offset` in `val`.
///
/// Returns the month number (1-12) and the offset just past the name. Returns `None` when
/// the bytes are not a month name or when `val` ends before the name is complete.
#[inline]
fn parse_month_full(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let min_offset = offset.checked_add(3)?;
    // The first three letters identify the month; the rest of the name must follow verbatim.
    let (month, rest): (u32, &str) = match val.get(offset..min_offset)? {
        b"Jan" => (1, "uary"),
        b"Feb" => (2, "ruary"),
        b"Mar" => (3, "ch"),
        b"Apr" => (4, "il"),
        b"May" => (5, ""),
        b"Jun" => (6, "e"),
        b"Jul" => (7, "y"),
        b"Aug" => (8, "ust"),
        b"Sep" => (9, "tember"),
        b"Oct" => (10, "ober"),
        b"Nov" => (11, "ember"),
        b"Dec" => (12, "ember"),
        _ => return None,
    };
    let new_offset = min_offset + rest.len();
    // Checked slicing: the value may be shorter than the full month name (e.g. "Sep 2020"),
    // which is a parse failure rather than a panic.
    let tail = val.get(min_offset..new_offset)?;
    if tail == rest.as_bytes() {
        Some((month, new_offset))
    } else {
        None
    }
}
137/// Tries to convert a chrono `fmt` to a `fmt` that the polars parser consumes.
138/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes
139/// year, day, month distinctively with %Y, %d, %m.
140pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {
141    // (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating
142    // if `fmt` is too long for NaiveDate. If that's implemented, then this check could
143    // be removed, and that error could be matched against in `transform_datetime_*s`
144    // See https://github.com/chronotope/chrono/issues/1075.
145    if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {
146        polars_bail!(ComputeError: "Invalid format string: \
147            Please either specify both hour and minute, or neither.");
148    }
149    if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {
150        polars_bail!(ComputeError: "Invalid format string: \
151            Found seconds directive, but no hours directive.");
152    }
153    if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {
154        polars_bail!(ComputeError: "Invalid format string: \
155            Please either specify both 12-hour directive and meridiem directive, or neither.");
156    }
157
158    Ok(fmt
159        .replace("%D", "%m/%d/%y")
160        .replace("%R", "%H:%M")
161        .replace("%T", "%H:%M:%S")
162        .replace("%X", "%H:%M:%S")
163        .replace("%F", "%Y-%m-%d"))
164}
165
166#[derive(Default, Clone)]
167pub(super) struct StrpTimeState {}
168
169impl StrpTimeState {
170    #[inline]
171    // # Safety
172    // Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.
173    pub(super) unsafe fn parse(
174        &mut self,
175        val: &[u8],
176        fmt: &[u8],
177        fmt_len_val: u16,
178    ) -> Option<NaiveDateTime> {
179        let mut offset = 0;
180        let mut negative = false;
181        if val.starts_with(b"-") && fmt.starts_with(b"%Y") {
182            offset = 1;
183            negative = true;
184        }
185        #[allow(non_snake_case)]
186        let has_B_code = fmt.windows(2).any(|w| w == b"%B");
187        // SAFETY: this still ensures get_unchecked won't be out of bounds as val will be at least as big as we expect.
188        // After consuming the full month name, we'll double check remaining len is exactly equal.
189        let is_too_short = has_B_code && val.len() - offset < (fmt_len_val as usize);
190        if (!has_B_code && val.len() - offset != (fmt_len_val as usize)) || is_too_short {
191            return None;
192        }
193
194        const ESCAPE: u8 = b'%';
195        let mut year: i32 = 1;
196        // minimal day/month is always 1
197        // otherwise chrono may panic.
198        let mut month: u32 = 1;
199        let mut day: u32 = 1;
200        let mut hour: u32 = 0;
201        let mut min: u32 = 0;
202        let mut sec: u32 = 0;
203        let mut nano: u32 = 0;
204
205        let mut fmt_iter = fmt.iter();
206
207        while let Some(fmt_b) = fmt_iter.next() {
208            debug_assert!(offset < val.len());
209            let b = *val.get_unchecked(offset);
210            if *fmt_b == ESCAPE {
211                // SAFETY: we must ensure we provide valid patterns
212                let next = fmt_iter.next();
213                debug_assert!(next.is_some());
214                match next.unwrap_unchecked() {
215                    b'Y' => {
216                        (year, offset) = update_and_parse(4, offset, val)?;
217                        if negative {
218                            year *= -1
219                        }
220                    },
221                    b'm' => {
222                        (month, offset) = update_and_parse(2, offset, val)?;
223                        if month > 12 {
224                            return None;
225                        }
226                    },
227                    b'b' => {
228                        (month, offset) = parse_month_abbrev(val, offset)?;
229                    },
230                    b'B' => {
231                        (month, offset) = parse_month_full(val, offset)?;
232                        // After variable sized month is consumed, verify remaining is exact len
233                        let new_fmt_len = fmt_len(fmt_iter.as_slice())?;
234                        let remaining_val_len = val.len() - offset;
235                        if remaining_val_len != (new_fmt_len as usize) {
236                            return None;
237                        }
238                    },
239                    b'd' => {
240                        (day, offset) = update_and_parse(2, offset, val)?;
241                    },
242                    b'H' => {
243                        (hour, offset) = update_and_parse(2, offset, val)?;
244                    },
245                    b'M' => {
246                        (min, offset) = update_and_parse(2, offset, val)?;
247                    },
248                    b'S' => {
249                        (sec, offset) = update_and_parse(2, offset, val)?;
250                    },
251                    b'y' => {
252                        let new_offset = offset + 2;
253                        let bytes = val.get_unchecked(offset..new_offset);
254
255                        let (decade, parsed) = atoi_simd::parse_any::<i32>(bytes).ok()?;
256                        if parsed == 0 {
257                            return None;
258                        }
259
260                        if decade < 70 {
261                            year = 2000 + decade;
262                        } else {
263                            year = 1900 + decade;
264                        }
265                        offset = new_offset;
266                    },
267                    b'9' => {
268                        (nano, offset) = update_and_parse(9, offset, val)?;
269                        break;
270                    },
271                    b'6' => {
272                        (nano, offset) = update_and_parse(6, offset, val)?;
273                        nano *= 1000;
274                        break;
275                    },
276                    b'3' => {
277                        (nano, offset) = update_and_parse(3, offset, val)?;
278                        nano *= 1_000_000;
279                        break;
280                    },
281                    _ => return None,
282                }
283            }
284            // consume
285            else if b == *fmt_b {
286                offset += 1;
287            } else {
288                return None;
289            }
290        }
291        // all values processed
292        if offset == val.len() {
293            NaiveDate::from_ymd_opt(year, month, day)
294                .and_then(|nd| nd.and_hms_nano_opt(hour, min, sec, nano))
295        }
296        // remaining values did not match pattern
297        else {
298            None
299        }
300    }
301}
302
303pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {
304    let mut iter = fmt.iter();
305    let mut cnt = 0;
306
307    while let Some(&val) = iter.next() {
308        match val {
309            b'%' => match iter.next() {
310                Some(&next_val) => match next_val {
311                    b'Y' => cnt += 4,
312                    b'y' => cnt += 2,
313                    b'd' => cnt += 2,
314                    b'm' => cnt += 2,
315                    b'b' => cnt += 3,
316                    b'B' => cnt += 3, // This is minimum size for full month
317                    b'H' => cnt += 2,
318                    b'M' => cnt += 2,
319                    b'S' => cnt += 2,
320                    b'9' => {
321                        cnt += 9;
322                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
323                            return Some(cnt);
324                        } else {
325                            return None;
326                        }
327                    },
328                    b'6' => {
329                        cnt += 6;
330                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
331                            return Some(cnt);
332                        } else {
333                            return None;
334                        }
335                    },
336                    b'3' => {
337                        cnt += 3;
338                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
339                            return Some(cnt);
340                        } else {
341                            return None;
342                        }
343                    },
344                    _ => return None,
345                },
346                None => return None,
347            },
348            _ => {
349                cnt += 1;
350            },
351        }
352    }
353    Some(cnt)
354}
355
#[cfg(test)]
mod test {
    use super::*;

    /// Builds the expected datetime of a test case; panics if the components are invalid.
    fn datetime(ymd: (i32, u32, u32), hms: (u32, u32, u32), nano: u32) -> NaiveDateTime {
        NaiveDate::from_ymd_opt(ymd.0, ymd.1, ymd.2)
            .unwrap()
            .and_hms_nano_opt(hms.0, hms.1, hms.2, nano)
            .unwrap()
    }

    #[test]
    fn test_parsing() {
        // (value, format, expected `fmt_len` of the format, expected parse result)
        let patterns = [
            (
                "2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((2021, 1, 1), (0, 0, 0), 0)),
            ),
            (
                "2021-01-01 07:45:12",
                "%Y-%m-%d %H:%M:%S",
                19,
                Some(datetime((2021, 1, 1), (7, 45, 12), 0)),
            ),
            (
                "2019-04-18T02:45:55.555000000",
                "%Y-%m-%dT%H:%M:%S.%9f",
                29,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            (
                "2019-04-18T02:45:55.555000",
                "%Y-%m-%dT%H:%M:%S.%6f",
                26,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            (
                "2019-04-18T02:45:55.555",
                "%Y-%m-%dT%H:%M:%S.%3f",
                23,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            // Full month name: `fmt_len` only counts the minimum width of `%B`.
            (
                "15 January 2021",
                "%d %B %Y",
                11,
                Some(datetime((2021, 1, 15), (0, 0, 0), 0)),
            ),
            // Abbreviated month name and two-digit year.
            (
                "18-Apr-19",
                "%d-%b-%y",
                9,
                Some(datetime((2019, 4, 18), (0, 0, 0), 0)),
            ),
            // Month out of range.
            ("2021-13-01", "%Y-%m-%d", 10, None),
            // Value shorter than the format requires.
            ("2021-1-1", "%Y-%m-%d", 10, None),
        ];

        for (val, fmt, len, expected) in patterns {
            assert_eq!(fmt_len(fmt.as_bytes()).unwrap(), len);
            // SAFETY: every format above is a valid chrono format and `len` was just checked
            // against `fmt_len`.
            unsafe {
                assert_eq!(
                    StrpTimeState::default().parse(val.as_bytes(), fmt.as_bytes(), len),
                    expected
                )
            };
        }
    }
}