polars_utils/
pl_path.rs

1use std::borrow::{Borrow, Cow};
2use std::ffi::OsStr;
3use std::fmt::Display;
4use std::ops::{Deref, Range};
5use std::path::{Path, PathBuf};
6
7use polars_error::{PolarsResult, polars_err};
8
9use crate::format_pl_refstr;
10use crate::pl_str::PlRefStr;
11
/// Prefix that Windows paths can start with (the `\\?\` extended-length path marker).
///
/// `PlPath::normalize_windows_path` looks for this prefix when normalizing paths.
/// <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>
pub const WINDOWS_EXTPATH_PREFIX: &str = r#"\\?\"#;
15
/// Path represented as a UTF-8 string.
///
/// This is an unsized type that is `repr(transparent)` over `str`; [`PlRefPath`] is the owned
/// counterpart and dereferences to this type.
///
/// Equality and ordering are based on the string value, which can be sensitive to duplicate
/// separators. `as_std_path()` can be used to return a `&std::path::Path` for comparisons / API
/// access.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct PlPath {
    // The path text. Being a bare `str`, this makes `PlPath` unsized.
    inner: str,
}
26
#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
/// Reference-counted [`PlPath`].
///
/// Dereferences to [`PlPath`], so the borrowed-path accessors are available on this type.
///
/// # Windows paths invariant
/// Windows paths will have leading `\\?\` prefix stripped, and all backslashes normalized to
/// forward slashes.
///
/// The normalization is performed by [`PlRefPath::new`]; the private `_new_no_normalize`
/// constructor stores its input unchanged.
pub struct PlRefPath {
    // String storage holding the path text.
    inner: PlRefStr,
}
38
39impl PlPath {
40    // Note: Do not expose the following constructors, they do not normalize paths.
41    fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
42        let s: &str = s.as_ref();
43        // Safety: `PlPath` is `repr(transparent)` on `str`.
44        unsafe { &*(s as *const str as *const PlPath) }
45    }
46
47    fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
48        path.to_str()
49            .ok_or_else(|| polars_err!(non_utf8_path))
50            .map(Self::_new)
51    }
52
53    pub fn as_str(&self) -> &str {
54        unsafe { &*(self as *const PlPath as *const str) }
55    }
56
57    pub fn as_bytes(&self) -> &[u8] {
58        self.as_str().as_bytes()
59    }
60
61    pub fn as_os_str(&self) -> &OsStr {
62        OsStr::new(self)
63    }
64
65    pub fn as_std_path(&self) -> &Path {
66        Path::new(self)
67    }
68
69    pub fn to_ref_path(&self) -> PlRefPath {
70        PlRefPath::_new_no_normalize(self.as_str().into())
71    }
72
73    pub fn scheme(&self) -> Option<CloudScheme> {
74        CloudScheme::from_path(self.as_str())
75    }
76
77    /// Shorthand for `self.scheme().is_some()`.
78    pub fn has_scheme(&self) -> bool {
79        self.scheme().is_some()
80    }
81
82    /// Return a string with the scheme prefix removed (if any).
83    pub fn strip_scheme(&self) -> &str {
84        &self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
85    }
86
87    pub fn file_name(&self) -> Option<&OsStr> {
88        Path::new(self.strip_scheme()).file_name()
89    }
90
91    pub fn extension(&self) -> Option<&str> {
92        Path::new(self.strip_scheme())
93            .extension()
94            .map(|x| x.to_str().unwrap())
95    }
96
97    pub fn parent(&self) -> Option<&str> {
98        Path::new(self.strip_scheme())
99            .parent()
100            .map(|x| x.to_str().unwrap())
101    }
102
103    /// Slices the path.
104    pub fn sliced(&self, range: Range<usize>) -> &PlPath {
105        Self::_new(&self.as_str()[range])
106    }
107
108    /// Strips the scheme, then returns the authority component, and the remaining
109    /// string after the authority component. This can be understood as extracting
110    /// the bucket/prefix for cloud URIs.
111    ///
112    ///  E.g. `https://user@host:port/dir/file?param=value`
113    /// * Authority: `user@host:port`
114    /// * Remaining: `/dir/file?param=value`
115    ///
116    /// Note, for local / `file:` URIs, the returned authority will be empty, and
117    /// the remainder will be the full URI.
118    ///
119    /// # Returns
120    /// (authority, remaining).
121    pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
122        match self.scheme() {
123            None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
124            Some(scheme) => {
125                let path_str = self.as_str();
126                let position = self.authority_end_position();
127
128                if position < path_str.len() {
129                    assert!(path_str[position..].starts_with('/'));
130                }
131
132                (position < path_str.len()).then_some((
133                    &path_str[scheme.strip_scheme_index()..position],
134                    &path_str[position..],
135                ))
136            },
137        }
138    }
139
140    /// Returns 0 if `self.scheme()` is `None`. Otherwise, returns `i` such that
141    /// `&self.to_str()[..i]` trims to the authority.
142    /// * If there is no '/', separator found, `i` will simply be the length of the string.
143    ///   * This is except if the scheme is `FileNoHostname`, where instead `i` will be "file:".len()
144    /// * If `self` has no `CloudScheme`, returns 0
145    pub fn authority_end_position(&self) -> usize {
146        match self.scheme() {
147            None => 0,
148            Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
149            Some(_) => {
150                let after_scheme = self.strip_scheme();
151                let offset = self.as_str().len() - after_scheme.len();
152
153                offset + after_scheme.find('/').unwrap_or(after_scheme.len())
154            },
155        }
156    }
157
158    pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
159        PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
160    }
161
162    pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
163        let other = other.as_ref();
164
165        if CloudScheme::from_path(other).is_some() {
166            PlRefPath::new(other)
167        } else {
168            PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
169        }
170    }
171
172    /// Converts backslashes to forward-slashes, and removes `\\?\` prefix.
173    pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
174        let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
175
176        if has_extpath_prefix || cfg!(target_family = "windows") {
177            let path_str = path_str
178                .strip_prefix(WINDOWS_EXTPATH_PREFIX)
179                .unwrap_or(path_str);
180
181            if matches!(
182                CloudScheme::from_path(path_str),
183                None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
184            ) && path_str.contains('\\')
185            {
186                let new_path = path_str.replace('\\', "/");
187                let inner = PlRefStr::from_string(new_path);
188                return Some(PlRefPath { inner });
189            }
190        }
191
192        None
193    }
194}
195
196impl AsRef<str> for PlPath {
197    fn as_ref(&self) -> &str {
198        self.as_str()
199    }
200}
201
202impl AsRef<OsStr> for PlPath {
203    fn as_ref(&self) -> &OsStr {
204        OsStr::new(self.as_str())
205    }
206}
207
208impl AsRef<Path> for PlPath {
209    fn as_ref(&self) -> &Path {
210        self.as_std_path()
211    }
212}
213
214impl From<&PlPath> for Box<PlPath> {
215    fn from(value: &PlPath) -> Self {
216        let s: &str = value.as_str();
217        let s: Box<str> = s.into();
218        // Safety: `PlPath` is `repr(transparent)` on `str`.
219        let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
220        out
221    }
222}
223
224impl Display for PlPath {
225    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
226        Display::fmt(self.as_str(), f)
227    }
228}
229
230impl PlRefPath {
231    pub fn empty() -> Self {
232        Self::default()
233    }
234
235    /// Normalizes Windows paths.
236    pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
237        if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
238            return path;
239        }
240
241        Self::_new_no_normalize(path.into())
242    }
243
244    const fn _new_no_normalize(path: PlRefStr) -> Self {
245        Self { inner: path }
246    }
247
248    pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
249        Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
250    }
251
252    pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
253        Self::try_from_path(&path)
254    }
255
256    pub fn as_str(&self) -> &str {
257        &self.inner
258    }
259
260    pub fn as_ref_str(&self) -> &PlRefStr {
261        &self.inner
262    }
263
264    pub fn into_ref_str(self) -> PlRefStr {
265        self.inner
266    }
267
268    /// Slices the path.
269    pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
270        if range == (0..self.as_str().len()) {
271            self.clone()
272        } else {
273            Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
274        }
275    }
276
277    /// # Returns
278    /// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
279    pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
280        Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
281            Cow::Borrowed(self)
282        } else {
283            Cow::Owned(PlPath::to_absolute_path(self)?)
284        })
285    }
286
287    /// Checks if references point to the same allocation.
288    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
289        PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
290    }
291}
292
293impl AsRef<str> for PlRefPath {
294    fn as_ref(&self) -> &str {
295        self.as_str()
296    }
297}
298
299impl AsRef<OsStr> for PlRefPath {
300    fn as_ref(&self) -> &OsStr {
301        self.as_os_str()
302    }
303}
304
305impl AsRef<Path> for PlRefPath {
306    fn as_ref(&self) -> &Path {
307        self.as_std_path()
308    }
309}
310
311impl Deref for PlRefPath {
312    type Target = PlPath;
313
314    fn deref(&self) -> &Self::Target {
315        PlPath::_new(self)
316    }
317}
318
319impl Display for PlRefPath {
320    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321        Display::fmt(self.as_str(), f)
322    }
323}
324
325impl ToOwned for PlPath {
326    type Owned = PlRefPath;
327
328    fn to_owned(&self) -> Self::Owned {
329        self.to_ref_path()
330    }
331}
332
333impl Borrow<PlPath> for PlRefPath {
334    fn borrow(&self) -> &PlPath {
335        self
336    }
337}
338
339impl From<&str> for PlRefPath {
340    fn from(value: &str) -> Self {
341        Self::new(value)
342    }
343}
344
// Generates the `CloudScheme` enum together with its conversions to and from the scheme
// name, from a list of `Variant = "name"` pairs.
macro_rules! impl_cloud_scheme {
    ($($variant:ident = $name:literal,)+) => {
        /// URI scheme recognized at the start of a path.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl CloudScheme {
            /// Note, private function. Users should use [`CloudScheme::from_path`], that will handle e.g.
            /// `file:/` without hostname properly.
            // Two variants share the name "file", which makes the second arm unreachable.
            #[expect(unreachable_patterns)]
            fn from_scheme_str(s: &str) -> Option<Self> {
                match s {
                    $($name => Some(Self::$variant),)+
                    _ => None,
                }
            }

            /// Returns the scheme name, without any `:` or `://` suffix.
            pub const fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }
    };
}

impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}

impl CloudScheme {
    /// Detects the scheme at the start of `path`.
    ///
    /// * `file:` followed by `//` yields `File`; any other `file:` prefix yields
    ///   `FileNoHostname`.
    /// * Otherwise the text before the first `://` must be one of the known scheme names.
    pub fn from_path(path: &str) -> Option<Self> {
        match path.strip_prefix("file:") {
            Some(rest) if rest.starts_with("//") => Some(Self::File),
            Some(_) => Some(Self::FileNoHostname),
            None => {
                let (scheme, _) = path.split_once("://")?;
                Self::from_scheme_str(scheme)
            },
        }
    }

    /// Returns `i` such that `&path[i..]` strips the scheme, as well as the `://` if it
    /// exists. For `FileNoHostname` only `file:` is stripped.
    pub fn strip_scheme_index(&self) -> usize {
        match self {
            Self::FileNoHostname => "file:".len(),
            _ => self.as_str().len() + "://".len(),
        }
    }
}

/// Formats the scheme as its name (see `CloudScheme::as_str`).
impl Display for CloudScheme {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <str as Display>::fmt(self.as_str(), f)
    }
}
420
421/// Formats a local path to begin with `file:///`.
422///
423/// # Panics
424/// May panic if `absolute_local_path` is not an absolute local path.
425pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
426    // Windows needs an extra slash, i.e.:
427    //
428    // # Windows
429    // Absolute path: "C:/Windows/system32"
430    // Formatted: "file:///C:/Windows/system32"
431    //
432    // # Unix
433    // Absolute path: "/root/.vimrc"
434    // Formatted: "file:///root/.vimrc"
435    if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
436        if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
437            PlRefPath::new(format_pl_refstr!("file:///{path}"))
438        } else {
439            PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
440        }
441    } else {
442        PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
443    }
444}
445
446#[cfg(test)]
447mod tests {
448    use super::*;
449
    /// Checks scheme detection, scheme stripping and authority splitting for `file:` URIs,
    /// plus normalization of a `\\?\`-prefixed Windows path.
    #[test]
    fn test_plpath_file() {
        // `file://` form: the scheme is `File` and the `file://` prefix is stripped.
        let p = PlRefPath::new("file:///home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::File),
                Some("file"),
                "file:///home/user",
                "/home/user"
            )
        );

        // `file:` form without `//`: the scheme is `FileNoHostname` and only `file:` is stripped.
        let p = PlRefPath::new("file:/home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::FileNoHostname),
                Some("file"),
                "file:/home/user",
                "/home/user"
            )
        );

        assert_eq!(PlRefPath::new("file://").scheme(), Some(CloudScheme::File));

        // No `/` after the (empty) authority: there is nothing to split.
        assert_eq!(
            PlRefPath::new("file://").strip_scheme_split_authority(),
            None
        );

        assert_eq!(
            PlRefPath::new("file:///").strip_scheme_split_authority(),
            Some(("", "/"))
        );

        assert_eq!(
            PlRefPath::new("file:///path").strip_scheme_split_authority(),
            Some(("", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file://hostname:80/path").strip_scheme_split_authority(),
            Some(("hostname:80", "/path"))
        );

        // `FileNoHostname` always yields an empty authority.
        assert_eq!(
            PlRefPath::new("file:").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:/").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:").strip_scheme_split_authority(),
            Some(("", ""))
        );
        assert_eq!(
            PlRefPath::new("file:/Local/path").strip_scheme_split_authority(),
            Some(("", "/Local/path"))
        );

        // The `\\?\` prefix is removed and backslashes become forward slashes.
        assert_eq!(
            PlRefPath::new(r#"\\?\C:\Windows\system32"#).as_str(),
            "C:/Windows/system32"
        );
    }
528
    /// Checks `join` for scheme-carrying, relative and absolute right-hand sides, using both
    /// plain local paths and `file:` URIs as the base.
    #[test]
    fn test_plpath_join() {
        // A right-hand side with its own scheme replaces the base entirely.
        assert_eq!(
            PlRefPath::new("s3://.../...").join("az://.../...").as_str(),
            "az://.../..."
        );

        // Joins `added` onto `base` twice: once as a plain path and once with `base` turned
        // into a URI by `format_file_uri`. Inputs are written with `/` and converted to the
        // platform's main separator before joining.
        fn _assert_plpath_join(base: &str, added: &str, expect: &str) {
            // Normal path test
            let expect = PlRefPath::new(expect);
            let base = base.replace('/', std::path::MAIN_SEPARATOR_STR);
            let added = added.replace('/', std::path::MAIN_SEPARATOR_STR);

            assert_eq!(PlRefPath::new(&base).join(&added), expect);

            // URI path test
            let uri_base = format_file_uri(&base);
            // When `added` starts with the separator the plain expectation is reused;
            // otherwise the expected value is the URI form of `expect`.
            let expect_uri = if added.starts_with(std::path::MAIN_SEPARATOR_STR) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            assert_eq!(PlRefPath::new(uri_base.as_str()).join(added), expect_uri);
        }

        // Readable `base + added => expected` syntax for the helper above.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $expect:literal) => {
                _assert_plpath_join($base, $added, $expect)
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "/d" => "/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path");
        assert_plpath_join!("/a/longer" + "path/test" => "/a/longer/path/test");
        assert_plpath_join!("/a/longer" + "/path/test" => "/path/test");
    }
577
    /// Checks `file_name` on cloud URIs, local paths and inputs that have no final component.
    #[test]
    fn test_plpath_name() {
        // The scheme is stripped before the final component is taken.
        assert_eq!(PlRefPath::new("s3://...").file_name(), Some("...".as_ref()));
        assert_eq!(
            PlRefPath::new("a/b/file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );
        assert_eq!(
            PlRefPath::new("file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );

        // Nothing is left after the scheme, or the path is empty: there is no file name.
        assert_eq!(PlRefPath::new("s3://").file_name(), None);
        assert_eq!(PlRefPath::new("").file_name(), None);
    }
593}