Skip to main content

polars_time/chunkedarray/string/
strptime.rs

1#![allow(unsafe_op_in_unsafe_fn)]
//! Much more opinionated, but also much faster strptime than the one given in Chrono.
3
4use chrono::{NaiveDate, NaiveDateTime};
5
6use crate::chunkedarray::{PolarsResult, polars_bail};
7
// Patterns used by `compile_fmt` to detect which time-of-day directives a format string
// contains. Each one matches `%`, an optional `_` or `-` modifier, then the directive letter.
polars_utils::regex_cache::cached_regex! {
    // Any hour directive: `%H`, `%k`, `%I` or `%l`.
    static HOUR_PATTERN = r"%[_-]?[HkIl]";
    // The minute directive `%M`.
    static MINUTE_PATTERN = r"%[_-]?M";
    // The seconds directive `%S`.
    static SECOND_PATTERN = r"%[_-]?S";
    // The 12-hour directives `%I` and `%l` (a subset of `HOUR_PATTERN`).
    static TWELVE_HOUR_PATTERN = r"%[_-]?[Il]";
    // The meridiem directives `%p` and `%P`.
    static MERIDIEM_PATTERN = r"%[_-]?[pP]";
}
15
16#[inline]
17fn update_and_parse<T: atoi_simd::Parse>(
18    incr: usize,
19    offset: usize,
20    vals: &[u8],
21) -> Option<(T, usize)> {
22    // this maybe oob because we cannot entirely sure about fmt lengths
23    let new_offset = offset + incr;
24    let bytes = vals.get(offset..new_offset)?;
25    let (val, parsed) = atoi_simd::parse_prefix::<T, true, false>(bytes).ok()?;
26    if parsed != incr {
27        None
28    } else {
29        Some((val, new_offset))
30    }
31}
32
/// Parses a three-letter English month abbreviation (`Jan` .. `Dec`) at `offset` in `val`.
///
/// Returns the month number (1-12) and the offset just past the abbreviation.
/// Returns `None` when the three bytes are not a known abbreviation or when fewer than
/// three bytes remain in `val` (instead of panicking on an out-of-bounds slice).
#[inline]
fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let new_offset = offset.checked_add(3)?;
    // Checked slicing: a truncated input is a parse failure, not a panic.
    let month = match val.get(offset..new_offset)? {
        b"Jan" => 1,
        b"Feb" => 2,
        b"Mar" => 3,
        b"Apr" => 4,
        b"May" => 5,
        b"Jun" => 6,
        b"Jul" => 7,
        b"Aug" => 8,
        b"Sep" => 9,
        b"Oct" => 10,
        b"Nov" => 11,
        b"Dec" => 12,
        _ => return None,
    };
    Some((month, new_offset))
}
/// Parses a full English month name (`January` .. `December`) at `offset` in `val`.
///
/// Returns the month number (1-12) and the offset just past the name. Bytes following the
/// name are not inspected.
/// Returns `None` when no month name starts at `offset`, including when `val` ends before
/// the name is complete (instead of panicking on an out-of-bounds slice).
#[inline]
fn parse_month_full(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let min_offset = offset.checked_add(3)?;
    // The first three letters identify the month; the rest of the name must then follow.
    let (month, suffix) = match val.get(offset..min_offset)? {
        b"Jan" => (1, "uary"),
        b"Feb" => (2, "ruary"),
        b"Mar" => (3, "ch"),
        b"Apr" => (4, "il"),
        b"May" => (5, ""),
        b"Jun" => (6, "e"),
        b"Jul" => (7, "y"),
        b"Aug" => (8, "ust"),
        b"Sep" => (9, "tember"),
        b"Oct" => (10, "ober"),
        b"Nov" => (11, "ember"),
        b"Dec" => (12, "ember"),
        _ => return None,
    };
    let new_offset = min_offset + suffix.len();
    // Checked slicing: an input such as "Sep 2020" is too short to hold "tember".
    if val.get(min_offset..new_offset)? == suffix.as_bytes() {
        Some((month, new_offset))
    } else {
        None
    }
}
137/// Tries to convert a chrono `fmt` to a `fmt` that the polars parser consumes.
138/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes
139/// year, day, month distinctively with %Y, %d, %m.
140pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {
141    // (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating
142    // if `fmt` is too long for NaiveDate. If that's implemented, then this check could
143    // be removed, and that error could be matched against in `transform_datetime_*s`
144    // See https://github.com/chronotope/chrono/issues/1075.
145    if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {
146        polars_bail!(ComputeError: "Invalid format string: \
147            Please either specify both hour and minute, or neither.");
148    }
149    if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {
150        polars_bail!(ComputeError: "Invalid format string: \
151            Found seconds directive, but no hours directive.");
152    }
153    if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {
154        polars_bail!(ComputeError: "Invalid format string: \
155            Please either specify both 12-hour directive and meridiem directive, or neither.");
156    }
157
158    Ok(fmt
159        .replace("%D", "%m/%d/%y")
160        .replace("%R", "%H:%M")
161        .replace("%T", "%H:%M:%S")
162        .replace("%X", "%H:%M:%S")
163        .replace("%F", "%Y-%m-%d"))
164}
165
166#[derive(Default, Clone)]
167pub(super) struct StrpTimeState {}
168
169impl StrpTimeState {
170    #[inline]
171    // # Safety
172    // Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.
173    pub(super) unsafe fn parse(
174        &mut self,
175        val: &[u8],
176        fmt: &[u8],
177        fmt_len_val: u16,
178    ) -> Option<NaiveDateTime> {
179        let mut offset = 0;
180        let mut negative = false;
181        if val.starts_with(b"-") && fmt.starts_with(b"%Y") {
182            offset = 1;
183            negative = true;
184        }
185        #[allow(non_snake_case)]
186        let has_B_code = fmt.windows(2).any(|w| w == b"%B");
187        // SAFETY: this still ensures get_unchecked won't be out of bounds as val will be at least as big as we expect.
188        // After consuming the full month name, we'll double check remaining len is exactly equal.
189        let is_too_short = has_B_code && val.len() - offset < (fmt_len_val as usize);
190        if (!has_B_code && val.len() - offset != (fmt_len_val as usize)) || is_too_short {
191            return None;
192        }
193
194        const ESCAPE: u8 = b'%';
195        let mut year: i32 = 1;
196        // minimal day/month is always 1
197        // otherwise chrono may panic.
198        let mut month: u32 = 1;
199        let mut day: u32 = 1;
200        let mut hour: u32 = 0;
201        let mut min: u32 = 0;
202        let mut sec: u32 = 0;
203        let mut nano: u32 = 0;
204
205        let mut fmt_iter = fmt.iter();
206
207        while let Some(fmt_b) = fmt_iter.next() {
208            debug_assert!(offset < val.len());
209            let b = *val.get_unchecked(offset);
210            if *fmt_b == ESCAPE {
211                // SAFETY: we must ensure we provide valid patterns
212                let next = fmt_iter.next();
213                debug_assert!(next.is_some());
214                match next.unwrap_unchecked() {
215                    b'Y' => {
216                        (year, offset) = update_and_parse(4, offset, val)?;
217                        if negative {
218                            year *= -1
219                        }
220                    },
221                    b'm' => {
222                        (month, offset) = update_and_parse(2, offset, val)?;
223                        if month > 12 {
224                            return None;
225                        }
226                    },
227                    b'b' => {
228                        (month, offset) = parse_month_abbrev(val, offset)?;
229                    },
230                    b'B' => {
231                        (month, offset) = parse_month_full(val, offset)?;
232                        // After variable sized month is consumed, verify remaining is exact len
233                        let new_fmt_len = fmt_len(fmt_iter.as_slice())?;
234                        let remaining_val_len = val.len() - offset;
235                        if remaining_val_len != (new_fmt_len as usize) {
236                            return None;
237                        }
238                    },
239                    b'd' => {
240                        (day, offset) = update_and_parse(2, offset, val)?;
241                    },
242                    b'H' => {
243                        (hour, offset) = update_and_parse(2, offset, val)?;
244                    },
245                    b'M' => {
246                        (min, offset) = update_and_parse(2, offset, val)?;
247                    },
248                    b'S' => {
249                        (sec, offset) = update_and_parse(2, offset, val)?;
250                    },
251                    b'y' => {
252                        let new_offset = offset + 2;
253                        let bytes = val.get_unchecked(offset..new_offset);
254
255                        let (decade, parsed) =
256                            atoi_simd::parse_prefix::<i32, true, false>(bytes).ok()?;
257                        if parsed == 0 {
258                            return None;
259                        }
260
261                        if decade < 70 {
262                            year = 2000 + decade;
263                        } else {
264                            year = 1900 + decade;
265                        }
266                        offset = new_offset;
267                    },
268                    b'9' => {
269                        (nano, offset) = update_and_parse(9, offset, val)?;
270                        break;
271                    },
272                    b'6' => {
273                        (nano, offset) = update_and_parse(6, offset, val)?;
274                        nano *= 1000;
275                        break;
276                    },
277                    b'3' => {
278                        (nano, offset) = update_and_parse(3, offset, val)?;
279                        nano *= 1_000_000;
280                        break;
281                    },
282                    _ => return None,
283                }
284            }
285            // consume
286            else if b == *fmt_b {
287                offset += 1;
288            } else {
289                return None;
290            }
291        }
292        // all values processed
293        if offset == val.len() {
294            NaiveDate::from_ymd_opt(year, month, day)
295                .and_then(|nd| nd.and_hms_nano_opt(hour, min, sec, nano))
296        }
297        // remaining values did not match pattern
298        else {
299            None
300        }
301    }
302}
303
304pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {
305    let mut iter = fmt.iter();
306    let mut cnt = 0;
307
308    while let Some(&val) = iter.next() {
309        match val {
310            b'%' => match iter.next() {
311                Some(&next_val) => match next_val {
312                    b'Y' => cnt += 4,
313                    b'y' => cnt += 2,
314                    b'd' => cnt += 2,
315                    b'm' => cnt += 2,
316                    b'b' => cnt += 3,
317                    b'B' => cnt += 3, // This is minimum size for full month
318                    b'H' => cnt += 2,
319                    b'M' => cnt += 2,
320                    b'S' => cnt += 2,
321                    b'9' => {
322                        cnt += 9;
323                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
324                            return Some(cnt);
325                        } else {
326                            return None;
327                        }
328                    },
329                    b'6' => {
330                        cnt += 6;
331                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
332                            return Some(cnt);
333                        } else {
334                            return None;
335                        }
336                    },
337                    b'3' => {
338                        cnt += 3;
339                        if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
340                            return Some(cnt);
341                        } else {
342                            return None;
343                        }
344                    },
345                    _ => return None,
346                },
347                None => return None,
348            },
349            _ => {
350                cnt += 1;
351            },
352        }
353    }
354    Some(cnt)
355}
356
#[cfg(test)]
mod test {
    use super::*;

    /// Builds the expected `NaiveDateTime`; panics if the components are invalid.
    fn datetime(ymd: (i32, u32, u32), hms: (u32, u32, u32), nano: u32) -> NaiveDateTime {
        let (year, month, day) = ymd;
        let (hour, min, sec) = hms;
        NaiveDate::from_ymd_opt(year, month, day)
            .unwrap()
            .and_hms_nano_opt(hour, min, sec, nano)
            .unwrap()
    }

    #[test]
    fn test_parsing() {
        // (value, format, expected `fmt_len`, expected parse result)
        let patterns = [
            (
                "2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((2021, 1, 1), (0, 0, 0), 0)),
            ),
            (
                "2021-01-01 07:45:12",
                "%Y-%m-%d %H:%M:%S",
                19,
                Some(datetime((2021, 1, 1), (7, 45, 12), 0)),
            ),
            (
                "2019-04-18T02:45:55.555000000",
                "%Y-%m-%dT%H:%M:%S.%9f",
                29,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            (
                "2019-04-18T02:45:55.555000",
                "%Y-%m-%dT%H:%M:%S.%6f",
                26,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            (
                "2019-04-18T02:45:55.555",
                "%Y-%m-%dT%H:%M:%S.%3f",
                23,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555000000)),
            ),
            // Variable-width month name: `fmt_len` only counts the 3-byte minimum.
            (
                "01 January 2021",
                "%d %B %Y",
                11,
                Some(datetime((2021, 1, 1), (0, 0, 0), 0)),
            ),
            // A month above 12 is rejected.
            ("2021-13-01", "%Y-%m-%d", 10, None),
        ];

        for (val, fmt, len, expected) in patterns {
            assert_eq!(fmt_len(fmt.as_bytes()).unwrap(), len);
            // SAFETY: `fmt` only uses supported directives and `len` was just checked
            // against `fmt_len`.
            unsafe {
                assert_eq!(
                    StrpTimeState::default().parse(val.as_bytes(), fmt.as_bytes(), len),
                    expected
                )
            };
        }
    }
}
442}