Skip to main content

polars_utils/
pl_path.rs

1use std::borrow::{Borrow, Cow};
2use std::ffi::OsStr;
3use std::fmt::Display;
4use std::ops::{Deref, Range};
5use std::path::{Path, PathBuf};
6
7use polars_error::{PolarsResult, polars_err};
8
9use crate::format_pl_refstr;
10use crate::pl_str::PlRefStr;
11
/// Marker that Windows paths may start with; see the linked Microsoft documentation on the
/// maximum file path limitation.
/// <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>
pub const WINDOWS_EXTPATH_PREFIX: &str = r"\\?\";
15
/// Path represented as a UTF-8 string.
///
/// Equality and ordering are based on the string value, which can be sensitive to duplicate
/// separators. `as_std_path()` can be used to return a `&std::path::Path` for comparisons / API
/// access.
///
/// This is an unsized wrapper around `str`, so it is only ever handled behind a pointer
/// (e.g. `&PlPath` or `Box<PlPath>`). [`PlRefPath`] is its owned, reference-counted
/// counterpart.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
// `repr(transparent)` gives this type the same layout as `str`; the pointer casts between
// `str` and `PlPath` in this file rely on that guarantee.
#[repr(transparent)]
pub struct PlPath {
    inner: str,
}
26
// NOTE(review): the `///` text below is intentionally left untouched. With the `dsl-schema`
// feature the `JsonSchema` derive presumably picks up doc comments as schema descriptions, so
// extra explanation is kept in plain `//` comments — confirm before editing the doc text.
//
// The type dereferences to `PlPath`, so the read-only accessors defined on `PlPath` are
// available on it as well.
#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
/// Reference-counted [`PlPath`].
///
/// # Windows paths invariant
/// Windows paths will have leading `\\?\` prefix stripped, and all backslashes normalized to
/// forward slashes.
pub struct PlRefPath {
    // Shared string storage holding the path text.
    inner: PlRefStr,
}
38
39impl PlPath {
40    // Note: Do not expose the following constructors, they do not normalize paths.
41    fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
42        let s: &str = s.as_ref();
43        // Safety: `PlPath` is `repr(transparent)` on `str`.
44        unsafe { &*(s as *const str as *const PlPath) }
45    }
46
47    fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
48        path.to_str()
49            .ok_or_else(|| polars_err!(non_utf8_path))
50            .map(Self::_new)
51    }
52
53    pub fn as_str(&self) -> &str {
54        unsafe { &*(self as *const PlPath as *const str) }
55    }
56
57    pub fn as_bytes(&self) -> &[u8] {
58        self.as_str().as_bytes()
59    }
60
61    pub fn as_os_str(&self) -> &OsStr {
62        OsStr::new(self)
63    }
64
65    pub fn as_std_path(&self) -> &Path {
66        Path::new(self)
67    }
68
69    pub fn to_ref_path(&self) -> PlRefPath {
70        PlRefPath::_new_no_normalize(self.as_str().into())
71    }
72
73    pub fn scheme(&self) -> Option<CloudScheme> {
74        CloudScheme::from_path(self.as_str())
75    }
76
77    /// Shorthand for `self.scheme().is_some()`.
78    pub fn has_scheme(&self) -> bool {
79        self.scheme().is_some()
80    }
81
82    /// Return a string with the scheme prefix removed (if any).
83    pub fn strip_scheme(&self) -> &str {
84        &self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
85    }
86
87    pub fn file_name(&self) -> Option<&OsStr> {
88        Path::new(self.strip_scheme()).file_name()
89    }
90
91    pub fn extension(&self) -> Option<&str> {
92        Path::new(self.strip_scheme())
93            .extension()
94            .map(|x| x.to_str().unwrap())
95    }
96
97    pub fn parent(&self) -> Option<&str> {
98        Path::new(self.strip_scheme())
99            .parent()
100            .map(|x| x.to_str().unwrap())
101    }
102
103    /// Slices the path.
104    pub fn sliced(&self, range: Range<usize>) -> &PlPath {
105        Self::_new(&self.as_str()[range])
106    }
107
108    /// Strips the scheme, then returns the authority component, and the remaining
109    /// string after the authority component. This can be understood as extracting
110    /// the bucket/prefix for cloud URIs.
111    ///
112    ///  E.g. `https://user@host:port/dir/file?param=value`
113    /// * Authority: `user@host:port`
114    /// * Remaining: `/dir/file?param=value`
115    ///
116    /// Note, for local / `file:` URIs, the returned authority will be empty, and
117    /// the remainder will be the full URI.
118    ///
119    /// # Returns
120    /// (authority, remaining).
121    pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
122        match self.scheme() {
123            None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
124            Some(scheme) => {
125                let path_str = self.as_str();
126                let position = self.authority_end_position();
127
128                if position < path_str.len() {
129                    assert!(path_str[position..].starts_with('/'));
130                }
131
132                (position < path_str.len()).then_some((
133                    &path_str[scheme.strip_scheme_index()..position],
134                    &path_str[position..],
135                ))
136            },
137        }
138    }
139
140    /// Returns 0 if `self.scheme()` is `None`. Otherwise, returns `i` such that
141    /// `&self.to_str()[..i]` trims to the authority.
142    /// * If there is no '/', separator found, `i` will simply be the length of the string.
143    ///   * This is except if the scheme is `FileNoHostname`, where instead `i` will be "file:".len()
144    /// * If `self` has no `CloudScheme`, returns 0
145    pub fn authority_end_position(&self) -> usize {
146        match self.scheme() {
147            None => 0,
148            Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
149            Some(_) => {
150                let after_scheme = self.strip_scheme();
151                let offset = self.as_str().len() - after_scheme.len();
152
153                offset + after_scheme.find('/').unwrap_or(after_scheme.len())
154            },
155        }
156    }
157
158    pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
159        PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
160    }
161
162    pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
163        let other = other.as_ref();
164
165        if CloudScheme::from_path(other).is_some()
166            || other.starts_with('/')
167            || other.starts_with('\\')
168        {
169            PlRefPath::new(other)
170        } else if CloudScheme::from_path(self.as_str()).is_some() {
171            let lhs = self.as_str().trim_end_matches('/');
172            PlRefPath::new(format!("{lhs}/{other}"))
173        } else {
174            PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
175        }
176    }
177
178    /// Converts backslashes to forward-slashes, and removes `\\?\` prefix.
179    pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
180        let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
181
182        if has_extpath_prefix || cfg!(target_family = "windows") {
183            let path_str = path_str
184                .strip_prefix(WINDOWS_EXTPATH_PREFIX)
185                .unwrap_or(path_str);
186
187            if matches!(
188                CloudScheme::from_path(path_str),
189                None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
190            ) && path_str.contains('\\')
191            {
192                let new_path = path_str.replace('\\', "/");
193                let inner = PlRefStr::from_string(new_path);
194                return Some(PlRefPath { inner });
195            }
196        }
197
198        None
199    }
200}
201
202impl AsRef<str> for PlPath {
203    fn as_ref(&self) -> &str {
204        self.as_str()
205    }
206}
207
208impl AsRef<OsStr> for PlPath {
209    fn as_ref(&self) -> &OsStr {
210        OsStr::new(self.as_str())
211    }
212}
213
214impl AsRef<Path> for PlPath {
215    fn as_ref(&self) -> &Path {
216        self.as_std_path()
217    }
218}
219
220impl From<&PlPath> for Box<PlPath> {
221    fn from(value: &PlPath) -> Self {
222        let s: &str = value.as_str();
223        let s: Box<str> = s.into();
224        // Safety: `PlPath` is `repr(transparent)` on `str`.
225        let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
226        out
227    }
228}
229
230impl Display for PlPath {
231    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
232        Display::fmt(self.as_str(), f)
233    }
234}
235
236impl PlRefPath {
237    pub fn empty() -> Self {
238        Self::default()
239    }
240
241    /// Normalizes Windows paths.
242    pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
243        if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
244            return path;
245        }
246
247        Self::_new_no_normalize(path.into())
248    }
249
250    const fn _new_no_normalize(path: PlRefStr) -> Self {
251        Self { inner: path }
252    }
253
254    pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
255        Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
256    }
257
258    pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
259        Self::try_from_path(&path)
260    }
261
262    pub fn as_str(&self) -> &str {
263        &self.inner
264    }
265
266    pub fn as_ref_str(&self) -> &PlRefStr {
267        &self.inner
268    }
269
270    pub fn into_ref_str(self) -> PlRefStr {
271        self.inner
272    }
273
274    /// Slices the path.
275    pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
276        if range == (0..self.as_str().len()) {
277            self.clone()
278        } else {
279            Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
280        }
281    }
282
283    /// # Returns
284    /// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
285    pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
286        Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
287            Cow::Borrowed(self)
288        } else {
289            Cow::Owned(PlPath::to_absolute_path(self)?)
290        })
291    }
292
293    /// Checks if references point to the same allocation.
294    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
295        PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
296    }
297}
298
299impl AsRef<str> for PlRefPath {
300    fn as_ref(&self) -> &str {
301        self.as_str()
302    }
303}
304
305impl AsRef<OsStr> for PlRefPath {
306    fn as_ref(&self) -> &OsStr {
307        self.as_os_str()
308    }
309}
310
311impl AsRef<Path> for PlRefPath {
312    fn as_ref(&self) -> &Path {
313        self.as_std_path()
314    }
315}
316
317impl Deref for PlRefPath {
318    type Target = PlPath;
319
320    fn deref(&self) -> &Self::Target {
321        PlPath::_new(self)
322    }
323}
324
325impl Display for PlRefPath {
326    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
327        Display::fmt(self.as_str(), f)
328    }
329}
330
331impl ToOwned for PlPath {
332    type Owned = PlRefPath;
333
334    fn to_owned(&self) -> Self::Owned {
335        self.to_ref_path()
336    }
337}
338
339impl Borrow<PlPath> for PlRefPath {
340    fn borrow(&self) -> &PlPath {
341        self
342    }
343}
344
345impl From<&str> for PlRefPath {
346    fn from(value: &str) -> Self {
347        Self::new(value)
348    }
349}
350
// Generates the `CloudScheme` enum from a list of `Variant = "scheme"` pairs, together with
// the conversions between a variant and its scheme string.
macro_rules! impl_cloud_scheme {
    ($($variant:ident = $name:literal,)+) => {
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl CloudScheme {
            /// Looks up the variant for a bare scheme string (without `://`).
            ///
            /// Note, private function. Users should use [`CloudScheme::from_path`], that will
            /// handle e.g. `file:/` without hostname properly.
            // Several variants may share one scheme string; the first listed one wins and
            // the later arms are unreachable on purpose.
            #[expect(unreachable_patterns)]
            fn from_scheme_str(s: &str) -> Option<Self> {
                match s {
                    $($name => Some(Self::$variant),)+
                    _ => None,
                }
            }

            /// Returns the scheme string of this variant (without `://`).
            pub const fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }
    };
}
379
// Declares the supported schemes as `Variant = "scheme string"` pairs.
//
// The order of the entries is significant: it is the declaration order of the generated
// enum (and therefore its derived ordering), and for a scheme string that is listed twice
// `from_scheme_str` returns the first variant.
impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    // Shares the "file" string with `File`. `CloudScheme::from_path` selects this variant
    // for paths starting with `file:` that are not followed by `//`.
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}
396
397impl CloudScheme {
398    pub fn from_path(path: &str) -> Option<Self> {
399        if let Some(stripped) = path.strip_prefix("file:") {
400            return Some(if stripped.starts_with("//") {
401                Self::File
402            } else {
403                Self::FileNoHostname
404            });
405        }
406
407        Self::from_scheme_str(&path[..path.find("://")?])
408    }
409
410    /// Returns `i` such that `&self.as_str()[i..]` strips the scheme, as well as the `://` if it
411    /// exists.
412    pub fn strip_scheme_index(&self) -> usize {
413        if let Self::FileNoHostname = self {
414            5
415        } else {
416            self.as_str().len() + 3
417        }
418    }
419}
420
421impl Display for CloudScheme {
422    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
423        Display::fmt(self.as_str(), f)
424    }
425}
426
427/// Formats a local path to begin with `file:///`.
428///
429/// # Panics
430/// May panic if `absolute_local_path` is not an absolute local path.
431pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
432    // Windows needs an extra slash, i.e.:
433    //
434    // # Windows
435    // Absolute path: "C:/Windows/system32"
436    // Formatted: "file:///C:/Windows/system32"
437    //
438    // # Unix
439    // Absolute path: "/root/.vimrc"
440    // Formatted: "file:///root/.vimrc"
441    if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
442        if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
443            PlRefPath::new(format_pl_refstr!("file:///{path}"))
444        } else {
445            PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
446        }
447    } else {
448        PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
449    }
450}
451
// Unit tests for scheme detection, authority splitting, joining and file name extraction.
#[cfg(test)]
mod tests {
    use super::*;

    /// Scheme detection and authority splitting for the `file:` scheme variants, plus
    /// normalization of a `\\?\`-prefixed Windows path.
    #[test]
    fn test_plpath_file() {
        // `file://` form: the scheme is `File` and `file://` is stripped.
        let p = PlRefPath::new("file:///home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::File),
                Some("file"),
                "file:///home/user",
                "/home/user"
            )
        );

        // `file:` form without `//`: the scheme is `FileNoHostname` and only `file:` is
        // stripped.
        let p = PlRefPath::new("file:/home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::FileNoHostname),
                Some("file"),
                "file:/home/user",
                "/home/user"
            )
        );

        assert_eq!(PlRefPath::new("file://").scheme(), Some(CloudScheme::File));

        // No `/` after the (empty) authority: nothing to split.
        assert_eq!(
            PlRefPath::new("file://").strip_scheme_split_authority(),
            None
        );

        assert_eq!(
            PlRefPath::new("file:///").strip_scheme_split_authority(),
            Some(("", "/"))
        );

        assert_eq!(
            PlRefPath::new("file:///path").strip_scheme_split_authority(),
            Some(("", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file://hostname:80/path").strip_scheme_split_authority(),
            Some(("hostname:80", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file:").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:/").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:").strip_scheme_split_authority(),
            Some(("", ""))
        );
        assert_eq!(
            PlRefPath::new("file:/Local/path").strip_scheme_split_authority(),
            Some(("", "/Local/path"))
        );

        // The `\\?\` prefix is removed and backslashes become forward slashes.
        assert_eq!(
            PlRefPath::new(r#"\\?\C:\Windows\system32"#).as_str(),
            "C:/Windows/system32"
        );
    }

    /// Joining onto cloud URIs, local paths and `file://` URIs.
    #[test]
    fn test_plpath_join() {
        // A right-hand side with its own scheme replaces the base.
        assert_eq!(
            PlRefPath::new("s3://.../...").join("az://.../...").as_str(),
            "az://.../..."
        );

        assert_eq!(
            PlRefPath::new("s3://.../...")
                .join("a=1/b=1/00000000.parquet")
                .as_str(),
            "s3://.../.../a=1/b=1/00000000.parquet"
        );

        // Trailing slashes of the base are collapsed into a single separator.
        assert_eq!(
            PlRefPath::new("s3://.../...//")
                .join("a=1/b=1/00000000.parquet")
                .as_str(),
            "s3://.../.../a=1/b=1/00000000.parquet"
        );

        // Checks one join twice: once on plain paths written with the platform's main
        // separator, and once with the base formatted as a `file://` URI.
        fn _assert_plpath_join(base: &str, added: &str, expect: &str) {
            // Normal path test
            let expect = PlRefPath::new(expect);
            let base = base.replace('/', std::path::MAIN_SEPARATOR_STR);
            let added = added.replace('/', std::path::MAIN_SEPARATOR_STR);

            assert_eq!(PlRefPath::new(&base).join(&added), expect);

            // URI path test
            let uri_base = format_file_uri(&base);
            // A right-hand side starting with a separator replaces the base, so the result
            // is not a URI in that case.
            let expect_uri = if added.starts_with(std::path::MAIN_SEPARATOR_STR) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            assert_eq!(PlRefPath::new(uri_base.as_str()).join(added), expect_uri);
        }

        // Sugar so that the cases below read as `base + added => expected`.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $expect:literal) => {
                _assert_plpath_join($base, $added, $expect)
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "/d" => "/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path");
        assert_plpath_join!("/a/longer" + "path/test" => "/a/longer/path/test");
        assert_plpath_join!("/a/longer" + "/path/test" => "/path/test");
    }

    /// `file_name()` is computed on the scheme-stripped path.
    #[test]
    fn test_plpath_name() {
        assert_eq!(PlRefPath::new("s3://...").file_name(), Some("...".as_ref()));
        assert_eq!(
            PlRefPath::new("a/b/file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );
        assert_eq!(
            PlRefPath::new("file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );

        // Nothing is left after the scheme, or the path is empty: no file name.
        assert_eq!(PlRefPath::new("s3://").file_name(), None);
        assert_eq!(PlRefPath::new("").file_name(), None);
    }
}