Skip to main content

polars_time/chunkedarray/string/
mod.rs

1pub mod infer;
2use chrono::DateTime;
3mod patterns;
4mod strptime;
5pub use patterns::Pattern;
6#[cfg(feature = "dtype-time")]
7use polars_core::chunked_array::temporal::time_to_time64ns;
8use polars_core::prelude::arity::unary_elementwise;
9use polars_utils::cache::LruCachedFunc;
10
11use super::*;
12#[cfg(feature = "dtype-date")]
13use crate::chunkedarray::date::naive_date_to_date;
14use crate::prelude::string::strptime::StrpTimeState;
15
#[cfg(feature = "dtype-time")]
/// Returns the first known time format that `convert` accepts for `val`.
///
/// `convert` is called as `convert(val, fmt)` (string first, format second)
/// for each entry of `patterns::TIME_H_M_S`, in order. The first format for
/// which it returns `Ok` is returned; `None` means no candidate matched.
fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    // (string, fmt) -> chrono::ParseResult
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // NOTE(review): this list was previously chained with itself, which only
    // re-tried the exact same formats a second time on failure. The duplicate
    // pass is dropped here. If a second, different pattern list was intended
    // (as in `date_pattern` / `datetime_pattern`), chain it here - TODO confirm
    // against the `patterns` module.
    patterns::TIME_H_M_S
        .iter()
        .find(|fmt| convert(val, fmt).is_ok())
        .copied()
}
28
29fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
30// (string, fmt) -> PolarsResult
31where
32    F: Fn(&str, &str) -> chrono::ParseResult<K>,
33{
34    patterns::DATETIME_Y_M_D
35        .iter()
36        .chain(patterns::DATETIME_D_M_Y)
37        .find(|fmt| convert(val, fmt).is_ok())
38        .copied()
39}
40
41fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
42// (string, fmt) -> PolarsResult
43where
44    F: Fn(&str, &str) -> chrono::ParseResult<K>,
45{
46    patterns::DATE_Y_M_D
47        .iter()
48        .chain(patterns::DATE_D_M_Y)
49        .find(|fmt| convert(val, fmt).is_ok())
50        .copied()
51}
52
53fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {
54    let idx = ca.first_non_null().ok_or_else(|| {
55        polars_err!(ComputeError:
56            "unable to determine date parsing format, all values are null",
57        )
58    })?;
59    Ok(ca.get(idx).expect("should not be null"))
60}
61
#[cfg(feature = "dtype-datetime")]
/// Sniffs a datetime format from the first non-null value of `ca_string`.
///
/// The datetime patterns are first tried with `NaiveDateTime::parse_from_str`;
/// if none matches, the same patterns are retried with
/// `NaiveDate::parse_from_str`.
///
/// # Errors
/// Fails when all values are null or when no pattern matches the sample.
fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let sample = get_first_val(ca_string)?;
    if let Some(fmt) = datetime_pattern(sample, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    match datetime_pattern(sample, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "datetime")),
    }
}
69
#[cfg(feature = "dtype-date")]
/// Sniffs a date format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when all values are null or when no date pattern parses the sample
/// with `NaiveDate::parse_from_str`.
fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let sample = get_first_val(ca_string)?;
    match date_pattern(sample, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
75
#[cfg(feature = "dtype-time")]
/// Sniffs a time format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when all values are null or when no time pattern parses the sample
/// with `NaiveTime::parse_from_str`.
fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let sample = get_first_val(ca_string)?;
    if let Some(fmt) = time_pattern(sample, NaiveTime::parse_from_str) {
        Ok(fmt)
    } else {
        Err(polars_err!(parse_fmt_idk = "time"))
    }
}
81
82pub trait StringMethods: AsString {
83    #[cfg(feature = "dtype-time")]
84    /// Parsing string values and return a [`TimeChunked`]
85    fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
86        let string_ca = self.as_string();
87        let fmt = match fmt {
88            Some(fmt) => fmt,
89            None => sniff_fmt_time(string_ca)?,
90        };
91        let use_cache = use_cache && string_ca.len() > 50;
92
93        let mut convert = LruCachedFunc::new(
94            |s| {
95                let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;
96                Some(time_to_time64ns(&naive_time))
97            },
98            (string_ca.len() as f64).sqrt() as usize,
99        );
100        let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
101        Ok(ca.with_name(string_ca.name().clone()).into_time())
102    }
103
104    #[cfg(feature = "dtype-date")]
105    /// Parsing string values and return a [`DateChunked`]
106    /// Different from `as_date` this function allows matches that not contain the whole string
107    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
108    fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
109        let string_ca = self.as_string();
110        let fmt = match fmt {
111            Some(fmt) => fmt,
112            None => sniff_fmt_date(string_ca)?,
113        };
114        let ca = unary_elementwise(string_ca, |opt_s| {
115            let mut s = opt_s?;
116            while !s.is_empty() {
117                match NaiveDate::parse_and_remainder(s, fmt) {
118                    Ok((nd, _)) => return Some(naive_date_to_date(nd)),
119                    Err(_) => {
120                        let mut it = s.chars();
121                        it.next();
122                        s = it.as_str();
123                    },
124                }
125            }
126
127            None
128        });
129        Ok(ca.with_name(string_ca.name().clone()).into_date())
130    }
131
132    #[cfg(feature = "dtype-datetime")]
133    /// Parsing string values and return a [`DatetimeChunked`]
134    /// Different from `as_datetime` this function allows matches that not contain the whole string
135    /// e.g. "foo-2021-01-01-bar" could match "2021-01-01"
136    fn as_datetime_not_exact(
137        &self,
138        fmt: Option<&str>,
139        tu: TimeUnit,
140        tz_aware: bool,
141        tz: Option<&TimeZone>,
142        _ambiguous: &StringChunked,
143        // Ensure that the inferred time_zone matches the given time_zone.
144        ensure_matching_tz: bool,
145    ) -> PolarsResult<DatetimeChunked> {
146        let string_ca = self.as_string();
147        let had_format = fmt.is_some();
148        let fmt = match fmt {
149            Some(fmt) => fmt,
150            None => sniff_fmt_datetime(string_ca)?,
151        };
152
153        let func = match tu {
154            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
155            TimeUnit::Microseconds => datetime_to_timestamp_us,
156            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
157        };
158
159        let ca = unary_elementwise(string_ca, |opt_s| {
160            let mut s = opt_s?;
161            while !s.is_empty() {
162                let timestamp = if tz_aware {
163                    DateTime::parse_and_remainder(s, fmt)
164                        .ok()
165                        .map(|(dt, _r)| func(dt.naive_utc()))
166                } else {
167                    infer::parse_datetime_and_remainder(s, fmt).map(|(nd, _r)| func(nd))
168                };
169                match timestamp {
170                    Some(ts) => return Some(ts),
171                    None => {
172                        let mut it = s.chars();
173                        it.next();
174                        s = it.as_str();
175                    },
176                }
177            }
178            None
179        })
180        .with_name(string_ca.name().clone());
181
182        polars_ensure!(
183            !ensure_matching_tz || had_format || !(tz_aware && tz.is_none()),
184            to_datetime_tz_mismatch
185        );
186
187        match (tz_aware, tz) {
188            #[cfg(feature = "timezones")]
189            (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
190                &ca.into_datetime(tu, None),
191                Some(tz),
192                _ambiguous,
193                NonExistent::Raise,
194            ),
195            #[cfg(feature = "timezones")]
196            (true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),
197            _ => Ok(ca.into_datetime(tu, None)),
198        }
199    }
200
201    #[cfg(feature = "dtype-date")]
202    /// Parsing string values and return a [`DateChunked`]
203    fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
204        let string_ca = self.as_string();
205        let fmt = match fmt {
206            Some(fmt) => fmt,
207            None => return infer::to_date(string_ca),
208        };
209        let use_cache = use_cache && string_ca.len() > 50;
210        let fmt = strptime::compile_fmt(fmt)?;
211
212        // We can use the fast parser.
213        let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {
214            let mut strptime_cache = StrpTimeState::default();
215            let mut convert = LruCachedFunc::new(
216                |s: &str| {
217                    // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
218                    match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {
219                        // Fallback to chrono.
220                        None => NaiveDate::parse_from_str(s, &fmt).ok(),
221                        Some(ndt) => Some(ndt.date()),
222                    }
223                    .map(naive_date_to_date)
224                },
225                (string_ca.len() as f64).sqrt() as usize,
226            );
227            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
228        } else {
229            let mut convert = LruCachedFunc::new(
230                |s| {
231                    let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;
232                    Some(naive_date_to_date(naive_date))
233                },
234                (string_ca.len() as f64).sqrt() as usize,
235            );
236            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
237        };
238
239        Ok(ca.with_name(string_ca.name().clone()).into_date())
240    }
241
242    #[cfg(feature = "dtype-datetime")]
243    /// Parsing string values and return a [`DatetimeChunked`].
244    fn as_datetime(
245        &self,
246        fmt: Option<&str>,
247        tu: TimeUnit,
248        use_cache: bool,
249        tz_aware: bool,
250        tz: Option<&TimeZone>,
251        ambiguous: &StringChunked,
252    ) -> PolarsResult<DatetimeChunked> {
253        let string_ca = self.as_string();
254        let fmt = match fmt {
255            Some(fmt) => fmt,
256            None => return infer::to_datetime(string_ca, tu, tz, ambiguous, true),
257        };
258        let fmt = strptime::compile_fmt(fmt)?;
259        let use_cache = use_cache && string_ca.len() > 50;
260
261        let func = match tu {
262            TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
263            TimeUnit::Microseconds => datetime_to_timestamp_us,
264            TimeUnit::Milliseconds => datetime_to_timestamp_ms,
265        };
266
267        if tz_aware {
268            #[cfg(feature = "timezones")]
269            {
270                let mut convert = LruCachedFunc::new(
271                    |s: &str| {
272                        let dt = DateTime::parse_from_str(s, &fmt).ok()?;
273                        Some(func(dt.naive_utc()))
274                    },
275                    (string_ca.len() as f64).sqrt() as usize,
276                );
277                Ok(
278                    unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
279                        .with_name(string_ca.name().clone())
280                        .into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC))),
281                )
282            }
283            #[cfg(not(feature = "timezones"))]
284            {
285                panic!("activate 'timezones' feature")
286            }
287        } else {
288            let transform = match tu {
289                TimeUnit::Nanoseconds => infer::transform_datetime_ns,
290                TimeUnit::Microseconds => infer::transform_datetime_us,
291                TimeUnit::Milliseconds => infer::transform_datetime_ms,
292            };
293            // We can use the fast parser.
294            let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {
295                let mut strptime_cache = StrpTimeState::default();
296                let mut convert = LruCachedFunc::new(
297                    |s: &str| {
298                        // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
299                        match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }
300                        {
301                            None => transform(s, &fmt),
302                            Some(ndt) => Some(func(ndt)),
303                        }
304                    },
305                    (string_ca.len() as f64).sqrt() as usize,
306                );
307                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
308            } else {
309                let mut convert = LruCachedFunc::new(
310                    |s| transform(s, &fmt),
311                    (string_ca.len() as f64).sqrt() as usize,
312                );
313                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
314            };
315            let dt = ca
316                .with_name(string_ca.name().clone())
317                .into_datetime(tu, None);
318            match tz {
319                #[cfg(feature = "timezones")]
320                Some(tz) => polars_ops::prelude::replace_time_zone(
321                    &dt,
322                    Some(tz),
323                    ambiguous,
324                    NonExistent::Raise,
325                ),
326                _ => Ok(dt),
327            }
328        }
329    }
330}
331
332pub trait AsString {
333    fn as_string(&self) -> &StringChunked;
334}
335
336impl AsString for StringChunked {
337    fn as_string(&self) -> &StringChunked {
338        self
339    }
340}
341
342impl StringMethods for StringChunked {}