polars_utils/
pl_path.rs

1use std::borrow::{Borrow, Cow};
2use std::ffi::OsStr;
3use std::fmt::Display;
4use std::ops::{Deref, Range};
5use std::path::{Path, PathBuf};
6
7use polars_error::{PolarsResult, polars_err};
8
9use crate::format_pl_refstr;
10use crate::pl_str::PlRefStr;
11
/// Prefix (`\\?\`) that may appear at the start of a Windows path.
///
/// See <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>.
pub const WINDOWS_EXTPATH_PREFIX: &str = r"\\?\";
15
/// A path stored as UTF-8 text.
///
/// Comparison, ordering and hashing all operate on the raw string value, so two spellings of the
/// same location (for example with duplicated separators) are treated as different paths. Use
/// `as_std_path()` to obtain a `&std::path::Path` when path-aware comparison or the std path API
/// is needed.
#[repr(transparent)]
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct PlPath {
    inner: str,
}
26
27#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
28// TODO: Derive for next release.
29// #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
30// #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
31/// Reference-counted [`PlPath`].
32///
33/// # Windows paths invariant
34/// Windows paths will have leading `\\?\` prefix stripped, and all backslashes normalized to
35/// forward slashes.
36pub struct PlRefPath {
37    inner: PlRefStr,
38}
39
40impl PlPath {
41    // Note: Do not expose the following constructors, they do not normalize paths.
42    fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
43        let s: &str = s.as_ref();
44        // Safety: `PlPath` is `repr(transparent)` on `str`.
45        unsafe { &*(s as *const str as *const PlPath) }
46    }
47
48    fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
49        path.to_str()
50            .ok_or_else(|| polars_err!(non_utf8_path))
51            .map(Self::_new)
52    }
53
54    pub fn as_str(&self) -> &str {
55        unsafe { &*(self as *const PlPath as *const str) }
56    }
57
58    pub fn as_bytes(&self) -> &[u8] {
59        self.as_str().as_bytes()
60    }
61
62    pub fn as_os_str(&self) -> &OsStr {
63        OsStr::new(self)
64    }
65
66    pub fn as_std_path(&self) -> &Path {
67        Path::new(self)
68    }
69
70    pub fn to_ref_path(&self) -> PlRefPath {
71        PlRefPath::_new_no_normalize(self.as_str().into())
72    }
73
74    pub fn scheme(&self) -> Option<CloudScheme> {
75        CloudScheme::from_path(self.as_str())
76    }
77
78    /// Shorthand for `self.scheme().is_some()`.
79    pub fn has_scheme(&self) -> bool {
80        self.scheme().is_some()
81    }
82
83    /// Return a string with the scheme prefix removed (if any).
84    pub fn strip_scheme(&self) -> &str {
85        &self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
86    }
87
88    pub fn file_name(&self) -> Option<&OsStr> {
89        Path::new(self.strip_scheme()).file_name()
90    }
91
92    pub fn extension(&self) -> Option<&str> {
93        Path::new(self.strip_scheme())
94            .extension()
95            .map(|x| x.to_str().unwrap())
96    }
97
98    pub fn parent(&self) -> Option<&str> {
99        Path::new(self.strip_scheme())
100            .parent()
101            .map(|x| x.to_str().unwrap())
102    }
103
104    /// Slices the path.
105    pub fn sliced(&self, range: Range<usize>) -> &PlPath {
106        Self::_new(&self.as_str()[range])
107    }
108
109    /// Strips the scheme, then returns the authority component, and the remaining
110    /// string after the authority component. This can be understood as extracting
111    /// the bucket/prefix for cloud URIs.
112    ///
113    ///  E.g. `https://user@host:port/dir/file?param=value`
114    /// * Authority: `user@host:port`
115    /// * Remaining: `/dir/file?param=value`
116    ///
117    /// Note, for local / `file:` URIs, the returned authority will be empty, and
118    /// the remainder will be the full URI.
119    ///
120    /// # Returns
121    /// (authority, remaining).
122    pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
123        match self.scheme() {
124            None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
125            Some(scheme) => {
126                let path_str = self.as_str();
127                let position = self.authority_end_position();
128
129                if position < path_str.len() {
130                    assert!(path_str[position..].starts_with('/'));
131                }
132
133                (position < path_str.len()).then_some((
134                    &path_str[scheme.strip_scheme_index()..position],
135                    &path_str[position..],
136                ))
137            },
138        }
139    }
140
141    /// Returns 0 if `self.scheme()` is `None`. Otherwise, returns `i` such that
142    /// `&self.to_str()[..i]` trims to the authority.
143    /// * If there is no '/', separator found, `i` will simply be the length of the string.
144    ///   * This is except if the scheme is `FileNoHostname`, where instead `i` will be "file:".len()
145    /// * If `self` has no `CloudScheme`, returns 0
146    pub fn authority_end_position(&self) -> usize {
147        match self.scheme() {
148            None => 0,
149            Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
150            Some(_) => {
151                let after_scheme = self.strip_scheme();
152                let offset = self.as_str().len() - after_scheme.len();
153
154                offset + after_scheme.find('/').unwrap_or(after_scheme.len())
155            },
156        }
157    }
158
159    pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
160        PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
161    }
162
163    pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
164        let other = other.as_ref();
165
166        if CloudScheme::from_path(other).is_some() {
167            PlRefPath::new(other)
168        } else {
169            PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
170        }
171    }
172
173    /// Converts backslashes to forward-slashes, and removes `\\?\` prefix.
174    pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
175        let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
176
177        if has_extpath_prefix || cfg!(target_family = "windows") {
178            let path_str = path_str
179                .strip_prefix(WINDOWS_EXTPATH_PREFIX)
180                .unwrap_or(path_str);
181
182            if matches!(
183                CloudScheme::from_path(path_str),
184                None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
185            ) && path_str.contains('\\')
186            {
187                let new_path = path_str.replace('\\', "/");
188                let inner = PlRefStr::from_string(new_path);
189                return Some(PlRefPath { inner });
190            }
191        }
192
193        None
194    }
195}
196
197impl AsRef<str> for PlPath {
198    fn as_ref(&self) -> &str {
199        self.as_str()
200    }
201}
202
203impl AsRef<OsStr> for PlPath {
204    fn as_ref(&self) -> &OsStr {
205        OsStr::new(self.as_str())
206    }
207}
208
209impl AsRef<Path> for PlPath {
210    fn as_ref(&self) -> &Path {
211        self.as_std_path()
212    }
213}
214
215impl From<&PlPath> for Box<PlPath> {
216    fn from(value: &PlPath) -> Self {
217        let s: &str = value.as_str();
218        let s: Box<str> = s.into();
219        // Safety: `PlPath` is `repr(transparent)` on `str`.
220        let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
221        out
222    }
223}
224
225impl Display for PlPath {
226    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
227        Display::fmt(self.as_str(), f)
228    }
229}
230
231impl PlRefPath {
232    pub fn empty() -> Self {
233        Self::default()
234    }
235
236    /// Normalizes Windows paths.
237    pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
238        if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
239            return path;
240        }
241
242        Self::_new_no_normalize(path.into())
243    }
244
245    const fn _new_no_normalize(path: PlRefStr) -> Self {
246        Self { inner: path }
247    }
248
249    pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
250        Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
251    }
252
253    pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
254        Self::try_from_path(&path)
255    }
256
257    pub fn as_str(&self) -> &str {
258        &self.inner
259    }
260
261    pub fn as_ref_str(&self) -> &PlRefStr {
262        &self.inner
263    }
264
265    pub fn into_ref_str(self) -> PlRefStr {
266        self.inner
267    }
268
269    /// Slices the path.
270    pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
271        if range == (0..self.as_str().len()) {
272            self.clone()
273        } else {
274            Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
275        }
276    }
277
278    /// # Returns
279    /// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
280    pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
281        Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
282            Cow::Borrowed(self)
283        } else {
284            Cow::Owned(PlPath::to_absolute_path(self)?)
285        })
286    }
287
288    /// Checks if references point to the same allocation.
289    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
290        PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
291    }
292}
293
294impl AsRef<str> for PlRefPath {
295    fn as_ref(&self) -> &str {
296        self.as_str()
297    }
298}
299
300impl AsRef<OsStr> for PlRefPath {
301    fn as_ref(&self) -> &OsStr {
302        self.as_os_str()
303    }
304}
305
306impl AsRef<Path> for PlRefPath {
307    fn as_ref(&self) -> &Path {
308        self.as_std_path()
309    }
310}
311
312impl Deref for PlRefPath {
313    type Target = PlPath;
314
315    fn deref(&self) -> &Self::Target {
316        PlPath::_new(self)
317    }
318}
319
320impl Display for PlRefPath {
321    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
322        Display::fmt(self.as_str(), f)
323    }
324}
325
326impl ToOwned for PlPath {
327    type Owned = PlRefPath;
328
329    fn to_owned(&self) -> Self::Owned {
330        self.to_ref_path()
331    }
332}
333
334impl Borrow<PlPath> for PlRefPath {
335    fn borrow(&self) -> &PlPath {
336        self
337    }
338}
339
340impl From<&str> for PlRefPath {
341    fn from(value: &str) -> Self {
342        Self::new(value)
343    }
344}
345
/// Generates the `CloudScheme` enum from a list of `Variant = "scheme"` pairs, together with the
/// string <-> variant conversions.
macro_rules! impl_cloud_scheme {
    ($($t:ident = $n:literal,)+) => {
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($t,)+
        }

        impl CloudScheme {
            /// Note, private function. Users should use [`CloudScheme::from_path`], that will handle e.g.
            /// `file:/` without hostname properly.
            // Two variants may share one scheme string (`File` / `FileNoHostname`), which makes
            // the later match arm unreachable; the first listed variant wins.
            #[expect(unreachable_patterns)]
            fn from_scheme_str(s: &str) -> Option<Self> {
                Some(match s {
                    $($n => Self::$t,)+
                    _ => return None,
                })
            }

            /// Returns the scheme name without the trailing `:` / `://`.
            pub const fn as_str(&self) -> &'static str {
                match self {
                    $(Self::$t => $n,)+
                }
            }
        }
    };
}

/// Number of leading bytes of a path that are searched for a `scheme://` prefix.
///
/// This must be at least the length of the longest scheme listed below plus the length of the
/// `://` separator (e.g. `https://` is 8 bytes).
const MAX_SCHEME_LEN: usize = 8;
impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}

impl CloudScheme {
    /// Detects the scheme that `path` starts with.
    ///
    /// * A `file:` prefix followed by `//` yields `File`; any other `file:` prefix yields
    ///   `FileNoHostname`.
    /// * Otherwise the text before the first `://` found within the first `MAX_SCHEME_LEN`
    ///   bytes is matched against the known scheme names.
    ///
    /// Returns `None` when no known scheme is found. Never panics, including for paths that
    /// contain multi-byte UTF-8 characters.
    pub fn from_path(path: &str) -> Option<Self> {
        if let Some(stripped) = path.strip_prefix("file:") {
            return Some(if stripped.starts_with("//") {
                Self::File
            } else {
                Self::FileNoHostname
            });
        }

        // Search the byte prefix instead of slicing the `str`: byte offset `MAX_SCHEME_LEN` can
        // fall inside a multi-byte character, where `&path[..MAX_SCHEME_LEN]` would panic.
        let head = &path.as_bytes()[..path.len().min(MAX_SCHEME_LEN)];
        let separator = head.windows(3).position(|window| window == b"://")?;

        // `separator` is the offset of an ASCII ':', which is always a character boundary.
        Self::from_scheme_str(&path[..separator])
    }

    /// Returns `i` such that `&self.as_str()[i..]` strips the scheme, as well as the `://` if it
    /// exists.
    pub fn strip_scheme_index(&self) -> usize {
        match self {
            // Only the `file:` prefix is present, there is no `//` to skip.
            Self::FileNoHostname => "file:".len(),
            _ => self.as_str().len() + "://".len(),
        }
    }
}
421
422impl Display for CloudScheme {
423    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
424        Display::fmt(self.as_str(), f)
425    }
426}
427
428/// Formats a local path to begin with `file:///`.
429///
430/// # Panics
431/// May panic if `absolute_local_path` is not an absolute local path.
432pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
433    // Windows needs an extra slash, i.e.:
434    //
435    // # Windows
436    // Absolute path: "C:/Windows/system32"
437    // Formatted: "file:///C:/Windows/system32"
438    //
439    // # Unix
440    // Absolute path: "/root/.vimrc"
441    // Formatted: "file:///root/.vimrc"
442    if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
443        if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
444            PlRefPath::new(format_pl_refstr!("file:///{path}"))
445        } else {
446            PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
447        }
448    } else {
449        PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
450    }
451}
452
#[cfg(feature = "serde")]
mod _serde_impl {
    use serde::{Deserialize, Serialize};

    use super::super::plpath::PlPath as LegacyPlPath;
    use crate::pl_path::PlRefPath;

    /// Serializes by converting into the legacy `PlPath` type and serializing that.
    impl Serialize for PlRefPath {
        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
        where
            S: serde::Serializer,
        {
            let legacy: LegacyPlPath = self.clone().into();
            LegacyPlPath::serialize(&legacy, serializer)
        }
    }

    /// Deserializes the legacy `PlPath` type and converts the result into a `PlRefPath`.
    impl<'de> Deserialize<'de> for PlRefPath {
        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
        where
            D: serde::Deserializer<'de>,
        {
            let legacy = LegacyPlPath::deserialize(deserializer)?;
            Ok(legacy.into())
        }
    }
}
478
479#[cfg(feature = "dsl-schema")]
480use super::plpath::PlPath as LegacyPlPath;
481
/// Reuses the JSON schema of the legacy `PlPath` type for `PlRefPath`.
#[cfg(feature = "dsl-schema")]
impl schemars::JsonSchema for PlRefPath {
    fn schema_name() -> Cow<'static, str> {
        <LegacyPlPath as schemars::JsonSchema>::schema_name()
    }

    fn schema_id() -> Cow<'static, str> {
        <LegacyPlPath as schemars::JsonSchema>::schema_id()
    }

    fn json_schema(generator: &mut schemars::SchemaGenerator) -> schemars::Schema {
        <LegacyPlPath as schemars::JsonSchema>::json_schema(generator)
    }
}
496
#[cfg(test)]
mod tests {
    use super::*;

    /// Scheme detection, scheme stripping and authority splitting for `file:` URIs, plus
    /// normalization of a `\\?\`-prefixed Windows path.
    #[test]
    fn test_plpath_file() {
        let with_authority = PlRefPath::new("file:///home/user");
        assert_eq!(with_authority.scheme(), Some(CloudScheme::File));
        assert_eq!(with_authority.scheme().map(|s| s.as_str()), Some("file"));
        assert_eq!(with_authority.as_str(), "file:///home/user");
        assert_eq!(with_authority.strip_scheme(), "/home/user");

        let no_hostname = PlRefPath::new("file:/home/user");
        assert_eq!(no_hostname.scheme(), Some(CloudScheme::FileNoHostname));
        assert_eq!(no_hostname.scheme().map(|s| s.as_str()), Some("file"));
        assert_eq!(no_hostname.as_str(), "file:/home/user");
        assert_eq!(no_hostname.strip_scheme(), "/home/user");

        let scheme_cases = [
            ("file://", CloudScheme::File),
            ("file:", CloudScheme::FileNoHostname),
            ("file:/", CloudScheme::FileNoHostname),
        ];
        for &(uri, expected) in &scheme_cases {
            assert_eq!(PlRefPath::new(uri).scheme(), Some(expected), "uri: {uri:?}");
        }

        let authority_cases: [(&str, Option<(&str, &str)>); 6] = [
            ("file://", None),
            ("file:///", Some(("", "/"))),
            ("file:///path", Some(("", "/path"))),
            ("file://hostname:80/path", Some(("hostname:80", "/path"))),
            ("file:", Some(("", ""))),
            ("file:/Local/path", Some(("", "/Local/path"))),
        ];
        for &(uri, expected) in &authority_cases {
            assert_eq!(
                PlRefPath::new(uri).strip_scheme_split_authority(),
                expected,
                "uri: {uri:?}"
            );
        }

        let windows = PlRefPath::new(r#"\\?\C:\Windows\system32"#);
        assert_eq!(windows.as_str(), "C:/Windows/system32");
    }

    /// Joining behaviour for cloud URIs, plain local paths and `file:` URIs.
    #[test]
    fn test_plpath_join() {
        // A joined path that carries its own scheme replaces the base.
        assert_eq!(
            PlRefPath::new("s3://.../...").join("az://.../...").as_str(),
            "az://.../..."
        );

        /// Checks `base.join(added) == expect`, once for the plain path and once with the base
        /// expressed as a `file:` URI. Inputs use `/` and are converted to the platform
        /// separator first.
        fn check_join(base: &str, added: &str, expect: &str) {
            let separator = std::path::MAIN_SEPARATOR_STR;

            let expect = PlRefPath::new(expect);
            let base = base.replace('/', separator);
            let added = added.replace('/', separator);

            // Normal path test
            let joined = PlRefPath::new(base.as_str()).join(added.as_str());
            assert_eq!(joined, expect);

            // URI path test
            let uri_base = format_file_uri(&base);
            let expect_uri = if added.starts_with(separator) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            let joined_uri = PlRefPath::new(uri_base.as_str()).join(added.as_str());
            assert_eq!(joined_uri, expect_uri);
        }

        // (base, added, expected)
        const CASES: &[(&str, &str, &str)] = &[
            ("a/b/c/", "d/e", "a/b/c/d/e"),
            ("a/b/c", "d/e", "a/b/c/d/e"),
            ("a/b/c", "d/e/", "a/b/c/d/e/"),
            ("a/b/c", "/d", "/d"),
            ("a/b/c", "/d/", "/d/"),
            ("", "/d/", "/d/"),
            ("/", "/d/", "/d/"),
            ("/x/y", "/d/", "/d/"),
            ("/x/y", "/d", "/d"),
            ("/x/y", "d", "/x/y/d"),
            ("/a/longer", "path", "/a/longer/path"),
            ("/a/longer", "/path", "/path"),
            ("/a/longer", "path/test", "/a/longer/path/test"),
            ("/a/longer", "/path/test", "/path/test"),
        ];

        for &(base, added, expect) in CASES {
            check_join(base, added, expect);
        }
    }

    /// `file_name` for cloud URIs, relative paths and paths without a final component.
    #[test]
    fn test_plpath_name() {
        let cases: [(&str, Option<&str>); 5] = [
            ("s3://...", Some("...")),
            ("a/b/file.parquet", Some("file.parquet")),
            ("file.parquet", Some("file.parquet")),
            ("s3://", None),
            ("", None),
        ];

        for &(path, expected) in &cases {
            assert_eq!(
                PlRefPath::new(path).file_name(),
                expected.map(std::ffi::OsStr::new),
                "path: {path:?}"
            );
        }
    }
}