1use arrow::array::PrimitiveArray;
2use chrono::format::ParseErrorKind;
3use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};
4use polars_core::prelude::*;
5
6use super::patterns::{self, Pattern};
7#[cfg(feature = "dtype-date")]
8use crate::chunkedarray::date::naive_date_to_date;
9use crate::chunkedarray::string::strptime;
10use crate::prelude::string::strptime::StrpTimeState;
11
12polars_utils::regex_cache::cached_regex! {
13 static DATETIME_DMY_RE = r#"(?x)
14 ^
15 ['"]? # optional quotes
16 (?:\d{1,2}) # day
17 [-/\.] # separator
18 (?P<month>[01]?\d{1}) # month
19 [-/\.] # separator
20 (?:\d{4,}) # year
21 (?:
22 [T\ ] # separator
23 (?:\d{1,2}) # hour
24 :? # separator
25 (?:\d{1,2}) # minute
26 (?:
27 :? # separator
28 (?:\d{1,2}) # second
29 (?:
30 \.(?:\d{1,9}) # subsecond
31 )?
32 )?
33 )?
34 ['"]? # optional quotes
35 $
36 "#;
37
38 static DATETIME_YMD_RE = r#"(?x)
39 ^
40 ['"]? # optional quotes
41 (?:\d{4,}) # year
42 [-/\.] # separator
43 (?P<month>[01]?\d{1}) # month
44 [-/\.] # separator
45 (?:\d{1,2}) # day
46 (?:
47 [T\ ] # separator
48 (?:\d{1,2}) # hour
49 :? # separator
50 (?:\d{1,2}) # minute
51 (?:
52 :? # separator
53 (?:\d{1,2}) # seconds
54 (?:
55 \.(?:\d{1,9}) # subsecond
56 )?
57 )?
58 )?
59 ['"]? # optional quotes
60 $
61 "#;
62
63 static DATETIME_YMDZ_RE = r#"(?x)
64 ^
65 ['"]? # optional quotes
66 (?:\d{4,}) # year
67 [-/\.] # separator
68 (?P<month>[01]?\d{1}) # month
69 [-/\.] # separator
70 (?:\d{1,2}) # year
71 [T\ ] # separator
72 (?:\d{2}) # hour
73 :? # separator
74 (?:\d{2}) # minute
75 (?:
76 :? # separator
77 (?:\d{2}) # second
78 (?:
79 \.(?:\d{1,9}) # subsecond
80 )?
81 )?
82 (?:
83 # offset (e.g. +01:00)
84 [+-](?:\d{2})
85 :?
86 (?:\d{2})
87 # or Zulu suffix
88 |Z
89 )
90 ['"]? # optional quotes
91 $
92 "#;
93}
94
95impl Pattern {
96 pub fn is_inferable(&self, val: &str) -> bool {
97 match self {
98 Pattern::DateDMY => true, Pattern::DateYMD => true, Pattern::Time => true,
101 Pattern::DatetimeDMY => match DATETIME_DMY_RE.captures(val) {
102 Some(search) => (1..=12).contains(
103 &search
104 .name("month")
105 .unwrap()
106 .as_str()
107 .parse::<u8>()
108 .unwrap(),
109 ),
110 None => false,
111 },
112 Pattern::DatetimeYMD => match DATETIME_YMD_RE.captures(val) {
113 Some(search) => (1..=12).contains(
114 &search
115 .name("month")
116 .unwrap()
117 .as_str()
118 .parse::<u8>()
119 .unwrap(),
120 ),
121 None => false,
122 },
123 Pattern::DatetimeYMDZ => match DATETIME_YMDZ_RE.captures(val) {
124 Some(search) => (1..=12).contains(
125 &search
126 .name("month")
127 .unwrap()
128 .as_str()
129 .parse::<u8>()
130 .unwrap(),
131 ),
132 None => false,
133 },
134 }
135 }
136}
137
/// Parsing of a single raw byte value into a physical temporal value of type `T`.
pub trait StrpTimeParser<T> {
    /// Parses `val` and returns the physical value, or `None` when it cannot be parsed.
    ///
    /// `time_unit` selects the resolution of the result for implementations that need
    /// one. In this file the `i64` implementation requires it, while the `i32`
    /// implementation ignores it.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;
}
141
#[cfg(feature = "dtype-datetime")]
impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {
    /// Parses `val` into a timestamp in the resolution given by `time_unit`.
    ///
    /// The most recently successful format (`latest_fmt`) is tried first. If it
    /// does not match, every candidate in `patterns` is tried in order; the
    /// first one that parses becomes the new `latest_fmt`.
    ///
    /// Returns `None` when the length of `latest_fmt` cannot be determined by
    /// `strptime::fmt_len`, or when no format parses `val`.
    ///
    /// # Panics
    ///
    /// Panics if `time_unit` is `None`.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {
        // `fmt_len == 0` marks the cached length as "not computed yet".
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        let transform = match time_unit {
            Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,
            Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,
            Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,
            // A datetime cannot be produced without a time unit.
            _ => unreachable!(),
        };
        // NOTE(review): `StrpTimeState::parse` is unsafe; presumably it requires
        // the length argument to be the one computed for the *same* format that
        // is passed alongside it — confirm against its safety contract.
        unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
                .map(transform)
                .or_else(|| {
                    for fmt in self.patterns {
                        // Every candidate has its own length: never reuse the
                        // cached length of `latest_fmt` for a different format.
                        // Candidates whose length cannot be computed are skipped.
                        let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                            continue;
                        };
                        // Convert with the requested time unit, exactly like
                        // the fast path above.
                        if let Some(parsed) = self
                            .transform_bytes
                            .parse(val, fmt.as_bytes(), fmt_len)
                            .map(transform)
                        {
                            // Keep format and cached length in sync for the
                            // next call.
                            self.latest_fmt = fmt;
                            self.fmt_len = fmt_len;
                            return Some(parsed);
                        }
                    }
                    None
                })
        }
    }
}
179
#[cfg(feature = "dtype-date")]
impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {
    /// Parses `val` into a physical date value; the time unit is ignored.
    ///
    /// The most recently successful format (`latest_fmt`) is tried first. If it
    /// does not match, every candidate in `patterns` is tried in order; the
    /// first one that parses becomes the new `latest_fmt`.
    ///
    /// Returns `None` when the length of `latest_fmt` cannot be determined by
    /// `strptime::fmt_len`, or when no format parses `val`.
    fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {
        // `fmt_len == 0` marks the cached length as "not computed yet".
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        // NOTE(review): `StrpTimeState::parse` is unsafe; presumably it requires
        // the length argument to be the one computed for the *same* format that
        // is passed alongside it — confirm against its safety contract.
        unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
                .map(|ndt| naive_date_to_date(ndt.date()))
                .or_else(|| {
                    for fmt in self.patterns {
                        // Every candidate has its own length: never reuse the
                        // cached length of `latest_fmt` for a different format.
                        // Candidates whose length cannot be computed are skipped.
                        let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                            continue;
                        };
                        if let Some(parsed) = self
                            .transform_bytes
                            .parse(val, fmt.as_bytes(), fmt_len)
                            .map(|ndt| naive_date_to_date(ndt.date()))
                        {
                            // Keep format and cached length in sync for the
                            // next call.
                            self.latest_fmt = fmt;
                            self.fmt_len = fmt_len;
                            return Some(parsed);
                        }
                    }
                    None
                })
        }
    }
}
211
/// Stateful parser that converts strings (or raw bytes) of one [`Pattern`] family into
/// physical temporal values, remembering the format that matched most recently.
#[derive(Clone)]
pub struct DatetimeInfer<T: PolarsNumericType> {
    /// Pattern family this parser was built for.
    pub pattern: Pattern,
    /// Candidate format strings, tried in order when `latest_fmt` does not match.
    patterns: &'static [&'static str],
    /// Format tried first; replaced by whichever candidate parsed a value last.
    latest_fmt: &'static str,
    /// String parser mapping `(value, format)` to the physical value, `None` on failure.
    transform: fn(&str, &str) -> Option<T::Native>,
    /// State used by the byte-level parser in `parse_bytes`.
    transform_bytes: StrpTimeState,
    /// Cached result of `strptime::fmt_len` for the byte-level parser; `0` means
    /// "not computed yet".
    fmt_len: u16,
    /// Logical dtype the parsed physical values are cast to in `coerce_string`.
    pub logical_type: DataType,
}
222
/// Fallible conversion from `T` that additionally receives an optional [`TimeUnit`].
pub trait TryFromWithUnit<T>: Sized {
    /// Error type associated with the conversion.
    ///
    /// NOTE(review): `try_from_with_unit` returns `PolarsResult<Self>`, so this
    /// associated type does not appear in the method signature.
    type Error;
    /// Builds `Self` from `pattern`, using `unit` where the implementation needs a
    /// time unit.
    fn try_from_with_unit(pattern: T, unit: Option<TimeUnit>) -> PolarsResult<Self>;
}
227
#[cfg(feature = "dtype-datetime")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {
    type Error = PolarsError;

    /// Builds a datetime parser for `value` producing timestamps in `time_unit`.
    ///
    /// Date patterns are promoted to the datetime pattern with the same field
    /// order. `Pattern::DatetimeYMDZ` uses the offset-aware string parsers;
    /// every other pattern uses the naive ones.
    ///
    /// # Panics
    ///
    /// Panics if `time_unit` is `None`.
    fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        let time_unit = time_unit.expect("time_unit must be provided for datetime");

        // Pattern family and its candidate formats.
        let (pattern, formats) = match value {
            Pattern::DateDMY | Pattern::DatetimeDMY => {
                (Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)
            },
            Pattern::DateYMD | Pattern::DatetimeYMD => {
                (Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)
            },
            Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),
            Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),
        };

        // String parser matching the resolution and offset-awareness.
        let tz_aware = matches!(value, Pattern::DatetimeYMDZ);
        let transform: fn(&str, &str) -> Option<i64> = match (tz_aware, time_unit) {
            (true, TimeUnit::Milliseconds) => transform_tzaware_datetime_ms,
            (false, TimeUnit::Milliseconds) => transform_datetime_ms,
            (true, TimeUnit::Microseconds) => transform_tzaware_datetime_us,
            (false, TimeUnit::Microseconds) => transform_datetime_us,
            (true, TimeUnit::Nanoseconds) => transform_tzaware_datetime_ns,
            (false, TimeUnit::Nanoseconds) => transform_datetime_ns,
        };

        Ok(DatetimeInfer {
            pattern,
            patterns: formats,
            latest_fmt: formats[0],
            transform,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Datetime(time_unit, None),
        })
    }
}
265
#[cfg(feature = "dtype-date")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {
    type Error = PolarsError;

    /// Builds a date parser for `value`; the time unit is ignored.
    ///
    /// # Errors
    ///
    /// Returns a `ComputeError` for any pattern other than `Pattern::DateDMY`
    /// and `Pattern::DateYMD`.
    fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Only the two date families map onto candidate formats.
        let formats = match value {
            Pattern::DateDMY => patterns::DATE_D_M_Y,
            Pattern::DateYMD => patterns::DATE_Y_M_D,
            _ => polars_bail!(ComputeError: "could not convert pattern"),
        };

        Ok(DatetimeInfer {
            pattern: value,
            patterns: formats,
            latest_fmt: formats[0],
            transform: transform_date,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Date,
        })
    }
}
294
295impl<T: PolarsNumericType> DatetimeInfer<T> {
296 pub fn parse(&mut self, val: &str) -> Option<T::Native> {
297 match (self.transform)(val, self.latest_fmt) {
298 Some(parsed) => Some(parsed),
299 None => {
301 if !self.pattern.is_inferable(val) {
302 return None;
303 }
304 for fmt in self.patterns {
305 self.fmt_len = 0;
306 if let Some(parsed) = (self.transform)(val, fmt) {
307 self.latest_fmt = fmt;
308 return Some(parsed);
309 }
310 }
311 None
312 },
313 }
314 }
315}
316
317impl<T: PolarsNumericType> DatetimeInfer<T>
318where
319 ChunkedArray<T>: IntoSeries,
320{
321 fn coerce_string(&mut self, ca: &StringChunked) -> Series {
322 let chunks = ca.downcast_iter().map(|array| {
323 let iter = array
324 .into_iter()
325 .map(|opt_val| opt_val.and_then(|val| self.parse(val)));
326 PrimitiveArray::from_trusted_len_iter(iter)
327 });
328 ChunkedArray::from_chunk_iter(ca.name().clone(), chunks)
329 .into_series()
330 .cast(&self.logical_type)
331 .unwrap()
332 .with_name(ca.name().clone())
333 }
334}
335
/// Parses `val` as a date using `fmt` and converts it with `naive_date_to_date`.
/// Returns `None` when `val` does not match `fmt`.
#[cfg(feature = "dtype-date")]
fn transform_date(val: &str, fmt: &str) -> Option<i32> {
    let date = NaiveDate::parse_from_str(val, fmt).ok()?;
    Some(naive_date_to_date(date))
}
342
/// Parses `val` with `fmt` as a naive datetime and converts it with
/// `datetime_to_timestamp_ns`.
///
/// When chrono reports `ParseErrorKind::NotEnough`, `val` is parsed as a date
/// instead and midnight is used as the time. Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ns(ndt))
}
355
356fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
357 let dt = DateTime::parse_from_str(val, fmt);
358 dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc()))
359}
360
/// Parses `val` with `fmt` as a naive datetime and converts it with
/// `datetime_to_timestamp_us`.
///
/// When chrono reports `ParseErrorKind::NotEnough`, `val` is parsed as a date
/// instead and midnight is used as the time. Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_us(ndt))
}
373
374fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {
375 let dt = DateTime::parse_from_str(val, fmt);
376 dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc()))
377}
378
/// Parses `val` with `fmt` as a naive datetime and converts it with
/// `datetime_to_timestamp_ms`.
///
/// When chrono reports `ParseErrorKind::NotEnough`, `val` is parsed as a date
/// instead and midnight is used as the time. Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ms(ndt))
}
391
392fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
393 let dt = DateTime::parse_from_str(val, fmt);
394 dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc()))
395}
396
397pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
398 infer_pattern_date_single(val)
400 .or_else(|| infer_pattern_time_single(val))
401 .or_else(|| infer_pattern_datetime_single(val))
402}
403
404fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
405 if patterns::DATETIME_D_M_Y.iter().any(|fmt| {
406 NaiveDateTime::parse_from_str(val, fmt).is_ok()
407 || NaiveDate::parse_from_str(val, fmt).is_ok()
408 }) {
409 Some(Pattern::DatetimeDMY)
410 } else if patterns::DATETIME_Y_M_D.iter().any(|fmt| {
411 NaiveDateTime::parse_from_str(val, fmt).is_ok()
412 || NaiveDate::parse_from_str(val, fmt).is_ok()
413 }) {
414 Some(Pattern::DatetimeYMD)
415 } else if patterns::DATETIME_Y_M_D_Z
416 .iter()
417 .any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
418 {
419 Some(Pattern::DatetimeYMDZ)
420 } else {
421 None
422 }
423}
424
425fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
426 if patterns::DATE_D_M_Y
427 .iter()
428 .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
429 {
430 Some(Pattern::DateDMY)
431 } else if patterns::DATE_Y_M_D
432 .iter()
433 .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
434 {
435 Some(Pattern::DateYMD)
436 } else {
437 None
438 }
439}
440
441fn infer_pattern_time_single(val: &str) -> Option<Pattern> {
442 patterns::TIME_H_M_S
443 .iter()
444 .any(|fmt| NaiveTime::parse_from_str(val, fmt).is_ok())
445 .then_some(Pattern::Time)
446}
447
/// Parses a string column into a datetime column, inferring the format from the data.
///
/// The pattern family is taken from the first value, starting at the first
/// non-null entry, that `infer_pattern_datetime_single` recognises. Every
/// value is then parsed with that family; values that fail to parse become null.
///
/// * An all-null input yields an all-null datetime column with `tu` and `tz`.
/// * For `Pattern::DatetimeYMDZ` (with the `timezones` feature) the result gets
///   time unit `tu` and time zone `tz`, defaulting to `"UTC"`.
/// * Otherwise the time unit is set to `tu` and, when `tz` is given (with the
///   `timezones` feature), the time zone is replaced using `_ambiguous` and
///   `NonExistent::Raise`.
///
/// # Errors
///
/// Fails when no pattern can be inferred, when the parsed series is not a
/// datetime, or when setting/replacing the time zone fails.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn to_datetime(
    ca: &StringChunked,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
    _ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    let Some(first_valid) = ca.first_non_null() else {
        let nulls = Int64Chunked::full_null(ca.name().clone(), ca.len());
        return Ok(nulls.into_datetime(tu, tz.cloned()));
    };

    // Only the part from the first non-null value onwards is inspected.
    let tail = ca.slice(first_valid as i64, ca.len());
    let pattern = tail
        .into_iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
        .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
    let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;

    let parsed = infer.coerce_string(ca);
    let mut out = parsed.datetime()?.clone();
    match pattern {
        #[cfg(feature = "timezones")]
        Pattern::DatetimeYMDZ => {
            let time_zone = tz
                .cloned()
                .unwrap_or_else(|| PlSmallStr::from_static("UTC"));
            out.set_time_unit_and_time_zone(tu, time_zone)?;
            Ok(out)
        },
        _ => {
            out.set_time_unit(tu);
            match tz {
                #[cfg(feature = "timezones")]
                Some(tz) => polars_ops::prelude::replace_time_zone(
                    &out,
                    Some(tz),
                    _ambiguous,
                    NonExistent::Raise,
                ),
                _ => Ok(out),
            }
        },
    }
}
/// Parses a string column into a date column, inferring the format from the data.
///
/// The pattern family is taken from the first value, starting at the first
/// non-null entry, that `infer_pattern_date_single` recognises. Every value is
/// then parsed with that family; values that fail to parse become null. An
/// all-null input yields an all-null date column.
///
/// # Errors
///
/// Fails when no pattern can be inferred, when the parser cannot be built for
/// the inferred pattern, or when the parsed series is not a date.
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {
    match ca.first_non_null() {
        None => Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date()),
        Some(idx) => {
            // Only the part from the first non-null value onwards is inspected.
            let subset = ca.slice(idx as i64, ca.len());
            let pattern = subset
                .into_iter()
                .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
                .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
            // Propagate a construction failure as an error instead of
            // panicking, as `to_datetime` does.
            let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None)?;
            infer.coerce_string(ca).date().cloned()
        },
    }
}