polars_time/chunkedarray/string/
strptime.rs

1#![allow(unsafe_op_in_unsafe_fn)]
//! A much more opinionated, but also much faster, `strptime` than the one provided by Chrono.
3
4use chrono::{NaiveDate, NaiveDateTime};
5
6use crate::chunkedarray::{PolarsResult, polars_bail};
7
// Lazily compiled regexes used by `compile_fmt` to detect which time directives a format
// string contains. Every pattern tolerates an optional `_` or `-` flag between the `%` and
// the directive letter.
8polars_utils::regex_cache::cached_regex! {
    // Any hour directive: `%H`, `%k`, `%I` or `%l`.
9    static HOUR_PATTERN = r"%[_-]?[HkIl]";
    // The minute directive `%M`.
10    static MINUTE_PATTERN = r"%[_-]?M";
    // The second directive `%S`.
11    static SECOND_PATTERN = r"%[_-]?S";
    // The 12-hour subset of the hour directives: `%I` or `%l`.
12    static TWELVE_HOUR_PATTERN = r"%[_-]?[Il]";
    // The meridiem directives `%p` / `%P`.
13    static MERIDIEM_PATTERN = r"%[_-]?[pP]";
14}
15
16#[inline]
17fn update_and_parse<T: atoi_simd::Parse>(
18    incr: usize,
19    offset: usize,
20    vals: &[u8],
21) -> Option<(T, usize)> {
22    // this maybe oob because we cannot entirely sure about fmt lengths
23    let new_offset = offset + incr;
24    let bytes = vals.get(offset..new_offset)?;
25    let (val, parsed) = atoi_simd::parse_any(bytes).ok()?;
26    if parsed == 0 {
27        None
28    } else {
29        Some((val, new_offset))
30    }
31}
32
/// Parses a three-letter English month abbreviation (`Jan` ..= `Dec`) at `val[offset..]`.
///
/// Returns the 1-based month number together with the offset just past the abbreviation.
/// Returns `None` when fewer than three bytes remain at `offset` or when the bytes are not
/// a recognised abbreviation. Matching is case-sensitive.
#[inline]
fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let new_offset = offset.checked_add(3)?;
    // Checked slicing: a short or truncated value must yield `None`, not a panic.
    let month = match val.get(offset..new_offset)? {
        b"Jan" => 1,
        b"Feb" => 2,
        b"Mar" => 3,
        b"Apr" => 4,
        b"May" => 5,
        b"Jun" => 6,
        b"Jul" => 7,
        b"Aug" => 8,
        b"Sep" => 9,
        b"Oct" => 10,
        b"Nov" => 11,
        b"Dec" => 12,
        _ => return None,
    };
    Some((month, new_offset))
}
52
53/// Tries to convert a chrono `fmt` to a `fmt` that the polars parser consumes.
54/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes
55/// year, day, month distinctively with %Y, %d, %m.
56pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {
57    // (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating
58    // if `fmt` is too long for NaiveDate. If that's implemented, then this check could
59    // be removed, and that error could be matched against in `transform_datetime_*s`
60    // See https://github.com/chronotope/chrono/issues/1075.
61    if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {
62        polars_bail!(ComputeError: "Invalid format string: \
63            Please either specify both hour and minute, or neither.");
64    }
65    if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {
66        polars_bail!(ComputeError: "Invalid format string: \
67            Found seconds directive, but no hours directive.");
68    }
69    if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {
70        polars_bail!(ComputeError: "Invalid format string: \
71            Please either specify both 12-hour directive and meridiem directive, or neither.");
72    }
73
74    Ok(fmt
75        .replace("%D", "%m/%d/%y")
76        .replace("%R", "%H:%M")
77        .replace("%T", "%H:%M:%S")
78        .replace("%X", "%H:%M:%S")
79        .replace("%F", "%Y-%m-%d"))
80}
81
82#[derive(Default, Clone)]
83pub(super) struct StrpTimeState {}
84
85impl StrpTimeState {
86    #[inline]
87    // # Safety
88    // Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.
89    pub(super) unsafe fn parse(
90        &mut self,
91        val: &[u8],
92        fmt: &[u8],
93        fmt_len: u16,
94    ) -> Option<NaiveDateTime> {
95        let mut offset = 0;
96        let mut negative = false;
97        if val.starts_with(b"-") && fmt.starts_with(b"%Y") {
98            offset = 1;
99            negative = true;
100        }
101        if val.len() - offset != (fmt_len as usize) {
102            return None;
103        }
104
105        const ESCAPE: u8 = b'%';
106        let mut year: i32 = 1;
107        // minimal day/month is always 1
108        // otherwise chrono may panic.
109        let mut month: u32 = 1;
110        let mut day: u32 = 1;
111        let mut hour: u32 = 0;
112        let mut min: u32 = 0;
113        let mut sec: u32 = 0;
114        let mut nano: u32 = 0;
115
116        let mut fmt_iter = fmt.iter();
117
118        while let Some(fmt_b) = fmt_iter.next() {
119            debug_assert!(offset < val.len());
120            let b = *val.get_unchecked(offset);
121            if *fmt_b == ESCAPE {
122                // SAFETY: we must ensure we provide valid patterns
123                let next = fmt_iter.next();
124                debug_assert!(next.is_some());
125                match next.unwrap_unchecked() {
126                    b'Y' => {
127                        (year, offset) = update_and_parse(4, offset, val)?;
128                        if negative {
129                            year *= -1
130                        }
131                    },
132                    b'm' => {
133                        (month, offset) = update_and_parse(2, offset, val)?;
134                        if month > 12 {
135                            return None;
136                        }
137                    },
138                    b'b' => {
139                        (month, offset) = parse_month_abbrev(val, offset)?;
140                    },
141                    b'd' => {
142                        (day, offset) = update_and_parse(2, offset, val)?;
143                    },
144                    b'H' => {
145                        (hour, offset) = update_and_parse(2, offset, val)?;
146                    },
147                    b'M' => {
148                        (min, offset) = update_and_parse(2, offset, val)?;
149                    },
150                    b'S' => {
151                        (sec, offset) = update_and_parse(2, offset, val)?;
152                    },
153                    b'y' => {
154                        let new_offset = offset + 2;
155                        let bytes = val.get_unchecked(offset..new_offset);
156
157                        let (decade, parsed) = atoi_simd::parse_any::<i32>(bytes).ok()?;
158                        if parsed == 0 {
159                            return None;
160                        }
161
162                        if decade < 70 {
163                            year = 2000 + decade;
164                        } else {
165                            year = 1900 + decade;
166                        }
167                        offset = new_offset;
168                    },
169                    b'9' => {
170                        (nano, offset) = update_and_parse(9, offset, val)?;
171                        break;
172                    },
173                    b'6' => {
174                        (nano, offset) = update_and_parse(6, offset, val)?;
175                        nano *= 1000;
176                        break;
177                    },
178                    b'3' => {
179                        (nano, offset) = update_and_parse(3, offset, val)?;
180                        nano *= 1_000_000;
181                        break;
182                    },
183                    _ => return None,
184                }
185            }
186            // consume
187            else if b == *fmt_b {
188                offset += 1;
189            } else {
190                return None;
191            }
192        }
193        // all values processed
194        if offset == val.len() {
195            NaiveDate::from_ymd_opt(year, month, day)
196                .and_then(|nd| nd.and_hms_nano_opt(hour, min, sec, nano))
197        }
198        // remaining values did not match pattern
199        else {
200            None
201        }
202    }
203}
204
205pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {
206    let mut iter = fmt.iter();
207    let mut cnt = 0;
208
209    while let Some(&val) = iter.next() {
210        match val {
211            b'%' => match iter.next() {
212                Some(&next_val) => match next_val {
213                    b'Y' => cnt += 4,
214                    b'y' => cnt += 2,
215                    b'd' => cnt += 2,
216                    b'm' => cnt += 2,
217                    b'b' => cnt += 3,
218                    b'H' => cnt += 2,
219                    b'M' => cnt += 2,
220                    b'S' => cnt += 2,
221                    b'9' => {
222                        cnt += 9;
223                        debug_assert_eq!(iter.next(), Some(&b'f'));
224                        return Some(cnt);
225                    },
226                    b'6' => {
227                        cnt += 6;
228                        debug_assert_eq!(iter.next(), Some(&b'f'));
229                        return Some(cnt);
230                    },
231                    b'3' => {
232                        cnt += 3;
233                        debug_assert_eq!(iter.next(), Some(&b'f'));
234                        return Some(cnt);
235                    },
236                    _ => return None,
237                },
238                None => return None,
239            },
240            _ => {
241                cnt += 1;
242            },
243        }
244    }
245    Some(cnt)
246}
247
#[cfg(test)]
mod test {
    use super::*;

    /// Builds the `NaiveDateTime` expected from a successful parse.
    fn datetime(ymd: (i32, u32, u32), hms: (u32, u32, u32), nano: u32) -> NaiveDateTime {
        NaiveDate::from_ymd_opt(ymd.0, ymd.1, ymd.2)
            .unwrap()
            .and_hms_nano_opt(hms.0, hms.1, hms.2, nano)
            .unwrap()
    }

    /// Checks `fmt_len` and `StrpTimeState::parse` together on accepted and rejected inputs.
    #[test]
    fn test_parsing() {
        // (value, format, expected format length, expected parse result)
        let patterns: &[(&str, &str, u16, Option<NaiveDateTime>)] = &[
            (
                "2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((2021, 1, 1), (0, 0, 0), 0)),
            ),
            (
                "2021-01-01 07:45:12",
                "%Y-%m-%d %H:%M:%S",
                19,
                Some(datetime((2021, 1, 1), (7, 45, 12), 0)),
            ),
            (
                "2019-04-18T02:45:55.555000000",
                "%Y-%m-%dT%H:%M:%S.%9f",
                29,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555_000_000)),
            ),
            (
                "2019-04-18T02:45:55.555000",
                "%Y-%m-%dT%H:%M:%S.%6f",
                26,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555_000_000)),
            ),
            (
                "2019-04-18T02:45:55.555",
                "%Y-%m-%dT%H:%M:%S.%3f",
                23,
                Some(datetime((2019, 4, 18), (2, 45, 55), 555_000_000)),
            ),
            // A leading `-` is a negative year when the format starts with `%Y`.
            (
                "-2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((-2021, 1, 1), (0, 0, 0), 0)),
            ),
            // Month abbreviation.
            (
                "18-Apr-2019",
                "%d-%b-%Y",
                11,
                Some(datetime((2019, 4, 18), (0, 0, 0), 0)),
            ),
            // Two-digit years pivot at 70.
            (
                "04/18/69",
                "%m/%d/%y",
                8,
                Some(datetime((2069, 4, 18), (0, 0, 0), 0)),
            ),
            (
                "04/18/70",
                "%m/%d/%y",
                8,
                Some(datetime((1970, 4, 18), (0, 0, 0), 0)),
            ),
            // Rejections: wrong separator, wrong length, invalid month/day, unknown month.
            ("2021/01/01", "%Y-%m-%d", 10, None),
            ("2021-01-1", "%Y-%m-%d", 10, None),
            ("2021-13-01", "%Y-%m-%d", 10, None),
            ("2021-02-30", "%Y-%m-%d", 10, None),
            ("18-Foo-2019", "%d-%b-%Y", 11, None),
        ];

        for &(val, fmt, len, expected) in patterns {
            assert_eq!(
                fmt_len(fmt.as_bytes()),
                Some(len),
                "fmt_len mismatch for format {fmt:?}"
            );
            // SAFETY: `len` was just verified to be the length `fmt_len` computes for `fmt`.
            let parsed =
                unsafe { StrpTimeState::default().parse(val.as_bytes(), fmt.as_bytes(), len) };
            assert_eq!(
                parsed, expected,
                "parse mismatch for value {val:?} with format {fmt:?}"
            );
        }
    }
}
333}