Skip to main content

polars_utils/
pl_path.rs

1use std::borrow::{Borrow, Cow};
2use std::ffi::OsStr;
3use std::fmt::Display;
4use std::ops::{Deref, Range};
5use std::path::{Path, PathBuf};
6use std::sync::{LazyLock, RwLock};
7
8use polars_error::{PolarsResult, polars_bail, polars_err};
9
10use crate::aliases::PlHashSet;
11use crate::format_pl_refstr;
12use crate::pl_str::PlRefStr;
13
/// Windows paths can be prefixed with this.
/// <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>
pub const WINDOWS_EXTPATH_PREFIX: &str = r#"\\?\"#;

/// External schemes that are always part of the allow-list; they seed
/// `ALLOWED_EXT_SCHEMES` and are never removed by `_disallow_ext_scheme`.
const BUILTIN_EXT_SCHEMES: &[&str] = &["hdfs"];

/// Allow-list of external cloud schemes that use an external object_store builder.
/// Superset of builtin ext_schemes to support internal testing.
///
/// Lazily initialized from `BUILTIN_EXT_SCHEMES` on first access.
pub static ALLOWED_EXT_SCHEMES: LazyLock<RwLock<PlHashSet<&'static str>>> =
    LazyLock::new(|| RwLock::new(PlHashSet::from_iter(BUILTIN_EXT_SCHEMES.iter().copied())));
24
25/// Look up scheme the external-scheme allow-list.
26fn get_ext_scheme(s: &str) -> Option<&'static str> {
27    ALLOWED_EXT_SCHEMES.read().unwrap().get(s).copied()
28}
29
30/// Whether the scheme is allowed (e.g. "hdfs").
31pub fn ext_scheme_allowed(s: &str) -> bool {
32    get_ext_scheme(s).is_some()
33}
34
35/// Extend allowed ext_schemes. Check for RFC 3986 compliance.
36/// Helper method for internal/test use only. Not public API (at this point).
37#[doc(hidden)]
38pub fn _allow_ext_scheme(scheme: &'static str) -> PolarsResult<()> {
39    let valid = scheme
40        .chars()
41        .next()
42        .is_some_and(|c| c.is_ascii_alphabetic())
43        && scheme
44            .chars()
45            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
46
47    if !valid {
48        polars_bail!(
49            InvalidOperation:
50            "invalid scheme '{}': must start with a letter and contain only \
51             letters, digits, '+', '-', '.'",
52            scheme
53        );
54    }
55
56    ALLOWED_EXT_SCHEMES.write().unwrap().insert(scheme);
57    Ok(())
58}
59
60/// Helper method for internal/test use only. Not public API.
61#[doc(hidden)]
62pub fn _disallow_ext_scheme(scheme: &str) {
63    if BUILTIN_EXT_SCHEMES.contains(&scheme) {
64        return; // built-ins are permanent
65    }
66    ALLOWED_EXT_SCHEMES.write().unwrap().remove(scheme);
67}
68
/// Path represented as a UTF-8 string.
///
/// This is an unsized `#[repr(transparent)]` wrapper around `str`, so values are only
/// handled behind a pointer (e.g. `&PlPath`, `Box<PlPath>`).
///
/// Equality and ordering are based on the string value, which can be sensitive to duplicate
/// separators. `as_std_path()` can be used to return a `&std::path::Path` for comparisons / API
/// access.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct PlPath {
    // The path text itself; the only field, as required by `repr(transparent)`.
    inner: str,
}
79
#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
/// Reference-counted [`PlPath`].
///
/// Dereferences to [`PlPath`], so all of its read-only accessors are available here.
///
/// # Windows paths invariant
/// Windows paths will have leading `\\?\` prefix stripped, and all backslashes normalized to
/// forward slashes.
pub struct PlRefPath {
    // Shared string storage holding the path text.
    inner: PlRefStr,
}
91
92impl PlPath {
93    // Note: Do not expose the following constructors, they do not normalize paths.
94    fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
95        let s: &str = s.as_ref();
96        // Safety: `PlPath` is `repr(transparent)` on `str`.
97        unsafe { &*(s as *const str as *const PlPath) }
98    }
99
100    fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
101        path.to_str()
102            .ok_or_else(|| polars_err!(non_utf8_path))
103            .map(Self::_new)
104    }
105
106    pub fn as_str(&self) -> &str {
107        unsafe { &*(self as *const PlPath as *const str) }
108    }
109
110    pub fn as_bytes(&self) -> &[u8] {
111        self.as_str().as_bytes()
112    }
113
114    pub fn as_os_str(&self) -> &OsStr {
115        OsStr::new(self)
116    }
117
118    pub fn as_std_path(&self) -> &Path {
119        Path::new(self)
120    }
121
122    pub fn to_ref_path(&self) -> PlRefPath {
123        PlRefPath::_new_no_normalize(self.as_str().into())
124    }
125
126    pub fn scheme(&self) -> Option<CloudScheme> {
127        CloudScheme::from_path(self.as_str())
128    }
129
130    /// Shorthand for `self.scheme().is_some()`.
131    pub fn has_scheme(&self) -> bool {
132        self.scheme().is_some()
133    }
134
135    /// Return a string with the scheme prefix removed (if any).
136    pub fn strip_scheme(&self) -> &str {
137        &self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
138    }
139
140    pub fn file_name(&self) -> Option<&OsStr> {
141        Path::new(self.strip_scheme()).file_name()
142    }
143
144    pub fn extension(&self) -> Option<&str> {
145        Path::new(self.strip_scheme())
146            .extension()
147            .map(|x| x.to_str().unwrap())
148    }
149
150    pub fn parent(&self) -> Option<&str> {
151        Path::new(self.strip_scheme())
152            .parent()
153            .map(|x| x.to_str().unwrap())
154    }
155
156    /// Slices the path.
157    pub fn sliced(&self, range: Range<usize>) -> &PlPath {
158        Self::_new(&self.as_str()[range])
159    }
160
161    /// Strips the scheme, then returns the authority component, and the remaining
162    /// string after the authority component. This can be understood as extracting
163    /// the bucket/prefix for cloud URIs.
164    ///
165    ///  E.g. `https://user@host:port/dir/file?param=value`
166    /// * Authority: `user@host:port`
167    /// * Remaining: `/dir/file?param=value`
168    ///
169    /// Note, for local / `file:` URIs, the returned authority will be empty, and
170    /// the remainder will be the full URI.
171    ///
172    /// # Returns
173    /// (authority, remaining).
174    pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
175        match self.scheme() {
176            None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
177            Some(scheme) => {
178                let path_str = self.as_str();
179                let position = self.authority_end_position();
180
181                if position < path_str.len() {
182                    assert!(path_str[position..].starts_with('/'));
183                }
184
185                (position < path_str.len()).then_some((
186                    &path_str[scheme.strip_scheme_index()..position],
187                    &path_str[position..],
188                ))
189            },
190        }
191    }
192
193    /// Returns 0 if `self.scheme()` is `None`. Otherwise, returns `i` such that
194    /// `&self.to_str()[..i]` trims to the authority.
195    /// * If there is no '/', separator found, `i` will simply be the length of the string.
196    ///   * This is except if the scheme is `FileNoHostname`, where instead `i` will be "file:".len()
197    /// * If `self` has no `CloudScheme`, returns 0
198    pub fn authority_end_position(&self) -> usize {
199        match self.scheme() {
200            None => 0,
201            Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
202            Some(_) => {
203                let after_scheme = self.strip_scheme();
204                let offset = self.as_str().len() - after_scheme.len();
205
206                offset + after_scheme.find('/').unwrap_or(after_scheme.len())
207            },
208        }
209    }
210
211    pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
212        PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
213    }
214
215    pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
216        let other = other.as_ref();
217
218        if CloudScheme::from_path(other).is_some()
219            || other.starts_with('/')
220            || other.starts_with('\\')
221        {
222            PlRefPath::new(other)
223        } else if CloudScheme::from_path(self.as_str()).is_some() {
224            let lhs = self.as_str().trim_end_matches('/');
225            PlRefPath::new(format!("{lhs}/{other}"))
226        } else {
227            PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
228        }
229    }
230
231    /// Converts backslashes to forward-slashes, and removes `\\?\` prefix.
232    pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
233        let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
234
235        if has_extpath_prefix || cfg!(target_family = "windows") {
236            let path_str = path_str
237                .strip_prefix(WINDOWS_EXTPATH_PREFIX)
238                .unwrap_or(path_str);
239
240            if matches!(
241                CloudScheme::from_path(path_str),
242                None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
243            ) && path_str.contains('\\')
244            {
245                let new_path = path_str.replace('\\', "/");
246                let inner = PlRefStr::from_string(new_path);
247                return Some(PlRefPath { inner });
248            }
249        }
250
251        None
252    }
253}
254
255impl AsRef<str> for PlPath {
256    fn as_ref(&self) -> &str {
257        self.as_str()
258    }
259}
260
261impl AsRef<OsStr> for PlPath {
262    fn as_ref(&self) -> &OsStr {
263        OsStr::new(self.as_str())
264    }
265}
266
267impl AsRef<Path> for PlPath {
268    fn as_ref(&self) -> &Path {
269        self.as_std_path()
270    }
271}
272
273impl From<&PlPath> for Box<PlPath> {
274    fn from(value: &PlPath) -> Self {
275        let s: &str = value.as_str();
276        let s: Box<str> = s.into();
277        // Safety: `PlPath` is `repr(transparent)` on `str`.
278        let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
279        out
280    }
281}
282
283impl Display for PlPath {
284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285        Display::fmt(self.as_str(), f)
286    }
287}
288
289impl PlRefPath {
290    pub fn empty() -> Self {
291        Self::default()
292    }
293
294    /// Normalizes Windows paths.
295    pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
296        if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
297            return path;
298        }
299
300        Self::_new_no_normalize(path.into())
301    }
302
303    const fn _new_no_normalize(path: PlRefStr) -> Self {
304        Self { inner: path }
305    }
306
307    pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
308        Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
309    }
310
311    pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
312        Self::try_from_path(&path)
313    }
314
315    pub fn as_str(&self) -> &str {
316        &self.inner
317    }
318
319    pub fn as_ref_str(&self) -> &PlRefStr {
320        &self.inner
321    }
322
323    pub fn into_ref_str(self) -> PlRefStr {
324        self.inner
325    }
326
327    /// Slices the path.
328    pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
329        if range == (0..self.as_str().len()) {
330            self.clone()
331        } else {
332            Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
333        }
334    }
335
336    /// # Returns
337    /// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
338    pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
339        Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
340            Cow::Borrowed(self)
341        } else {
342            Cow::Owned(PlPath::to_absolute_path(self)?)
343        })
344    }
345
346    /// Checks if references point to the same allocation.
347    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
348        PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
349    }
350}
351
352impl AsRef<str> for PlRefPath {
353    fn as_ref(&self) -> &str {
354        self.as_str()
355    }
356}
357
358impl AsRef<OsStr> for PlRefPath {
359    fn as_ref(&self) -> &OsStr {
360        self.as_os_str()
361    }
362}
363
364impl AsRef<Path> for PlRefPath {
365    fn as_ref(&self) -> &Path {
366        self.as_std_path()
367    }
368}
369
370impl Deref for PlRefPath {
371    type Target = PlPath;
372
373    fn deref(&self) -> &Self::Target {
374        PlPath::_new(self)
375    }
376}
377
378impl Display for PlRefPath {
379    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
380        Display::fmt(self.as_str(), f)
381    }
382}
383
384impl ToOwned for PlPath {
385    type Owned = PlRefPath;
386
387    fn to_owned(&self) -> Self::Owned {
388        self.to_ref_path()
389    }
390}
391
392impl Borrow<PlPath> for PlRefPath {
393    fn borrow(&self) -> &PlPath {
394        self
395    }
396}
397
398impl From<&str> for PlRefPath {
399    fn from(value: &str) -> Self {
400        Self::new(value)
401    }
402}
403
404macro_rules! impl_cloud_scheme {
405    ($($t:ident = $n:literal,)+) => {
406        #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
407        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
408        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
409        pub enum CloudScheme {
410            $($t,)+
411            Ext(&'static str)
412        }
413
414        impl CloudScheme {
415            /// Note, private function. Users should use [`CloudScheme::from_path`], that will handle e.g.
416            /// `file:/` without hostname properly.
417            #[expect(unreachable_patterns)]
418            fn from_scheme_str(s: &str) -> Option<Self> {
419                // Allow-list of schemes with an external object_store_builder registered at runtime.
420                match s {
421                    $($n => Some(Self::$t),)+
422                    _ => get_ext_scheme(s).map(Self::Ext),
423                }
424            }
425
426            pub fn is_native_str(s: &str) -> bool {
427                match Self::from_scheme_str(s) {
428                    None | Some(Self::Ext(_)) => false,
429                    Some(_) => true,
430                }
431            }
432
433            pub const fn as_str(&self) -> &'static str {
434                match self {
435                    $(Self::$t => $n,)+
436                    Self::Ext(s) => s,
437                }
438            }
439        }
440    };
441}
442
443impl_cloud_scheme! {
444    Abfs = "abfs",
445    Abfss = "abfss",
446    Adl = "adl",
447    Az = "az",
448    Azure = "azure",
449    File = "file",
450    FileNoHostname = "file",
451    Gcs = "gcs",
452    Gs = "gs",
453    Hf = "hf",
454    Http = "http",
455    Https = "https",
456    S3 = "s3",
457    S3a = "s3a",
458}
459
460impl CloudScheme {
461    pub fn from_path(path: &str) -> Option<Self> {
462        if let Some(stripped) = path.strip_prefix("file:") {
463            return Some(if stripped.starts_with("//") {
464                Self::File
465            } else {
466                Self::FileNoHostname
467            });
468        }
469
470        Self::from_scheme_str(&path[..path.find("://")?])
471    }
472
473    /// Returns `i` such that `&self.as_str()[i..]` strips the scheme, as well as the `://` if it
474    /// exists.
475    pub fn strip_scheme_index(&self) -> usize {
476        if let Self::FileNoHostname = self {
477            5
478        } else {
479            self.as_str().len() + 3
480        }
481    }
482}
483
484impl Display for CloudScheme {
485    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
486        Display::fmt(self.as_str(), f)
487    }
488}
489
490/// Formats a local path to begin with `file:///`.
491///
492/// # Panics
493/// May panic if `absolute_local_path` is not an absolute local path.
494pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
495    // Windows needs an extra slash, i.e.:
496    //
497    // # Windows
498    // Absolute path: "C:/Windows/system32"
499    // Formatted: "file:///C:/Windows/system32"
500    //
501    // # Unix
502    // Absolute path: "/root/.vimrc"
503    // Formatted: "file:///root/.vimrc"
504    if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
505        if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
506            PlRefPath::new(format_pl_refstr!("file:///{path}"))
507        } else {
508            PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
509        }
510    } else {
511        PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
512    }
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers scheme detection, scheme stripping and authority splitting for `file:` URIs,
    /// plus stripping of the Windows `\\?\` prefix.
    #[test]
    fn test_plpath_file() {
        // `file://` followed by a path: regular `File` scheme.
        let p = PlRefPath::new("file:///home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::File),
                Some("file"),
                "file:///home/user",
                "/home/user"
            )
        );

        // `file:` without `//`: `FileNoHostname`, only the `file:` prefix is stripped.
        let p = PlRefPath::new("file:/home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::FileNoHostname),
                Some("file"),
                "file:/home/user",
                "/home/user"
            )
        );

        assert_eq!(PlRefPath::new("file://").scheme(), Some(CloudScheme::File));

        // No `/` after the (empty) authority: nothing to split.
        assert_eq!(
            PlRefPath::new("file://").strip_scheme_split_authority(),
            None
        );

        assert_eq!(
            PlRefPath::new("file:///").strip_scheme_split_authority(),
            Some(("", "/"))
        );

        assert_eq!(
            PlRefPath::new("file:///path").strip_scheme_split_authority(),
            Some(("", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file://hostname:80/path").strip_scheme_split_authority(),
            Some(("hostname:80", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file:").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:/").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:").strip_scheme_split_authority(),
            Some(("", ""))
        );
        assert_eq!(
            PlRefPath::new("file:/Local/path").strip_scheme_split_authority(),
            Some(("", "/Local/path"))
        );

        // The extended-length prefix is stripped and backslashes become forward slashes.
        assert_eq!(
            PlRefPath::new(r#"\\?\C:\Windows\system32"#).as_str(),
            "C:/Windows/system32"
        );
    }

    /// Covers `join` for cloud URIs, plain local paths and `file://` URIs.
    #[test]
    fn test_plpath_join() {
        // A right-hand side with its own scheme replaces the base.
        assert_eq!(
            PlRefPath::new("s3://.../...").join("az://.../...").as_str(),
            "az://.../..."
        );

        assert_eq!(
            PlRefPath::new("s3://.../...")
                .join("a=1/b=1/00000000.parquet")
                .as_str(),
            "s3://.../.../a=1/b=1/00000000.parquet"
        );

        // Trailing slashes on the base are trimmed before joining.
        assert_eq!(
            PlRefPath::new("s3://.../...//")
                .join("a=1/b=1/00000000.parquet")
                .as_str(),
            "s3://.../.../a=1/b=1/00000000.parquet"
        );

        // Checks one join case twice: as a plain local path (using the platform separator)
        // and with the base converted to a `file://` URI.
        fn _assert_plpath_join(base: &str, added: &str, expect: &str) {
            // Normal path test
            let expect = PlRefPath::new(expect);
            let base = base.replace('/', std::path::MAIN_SEPARATOR_STR);
            let added = added.replace('/', std::path::MAIN_SEPARATOR_STR);

            assert_eq!(PlRefPath::new(&base).join(&added), expect);

            // URI path test
            let uri_base = format_file_uri(&base);
            // A rooted right-hand side replaces the base, so no `file://` prefix is expected.
            let expect_uri = if added.starts_with(std::path::MAIN_SEPARATOR_STR) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            assert_eq!(PlRefPath::new(uri_base.as_str()).join(added), expect_uri);
        }

        // `base + added => expected` shorthand for `_assert_plpath_join`.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $expect:literal) => {
                _assert_plpath_join($base, $added, $expect)
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "/d" => "/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path");
        assert_plpath_join!("/a/longer" + "path/test" => "/a/longer/path/test");
        assert_plpath_join!("/a/longer" + "/path/test" => "/path/test");
    }

    /// Covers `file_name` for cloud URIs, local paths and empty inputs.
    #[test]
    fn test_plpath_name() {
        assert_eq!(PlRefPath::new("s3://...").file_name(), Some("...".as_ref()));
        assert_eq!(
            PlRefPath::new("a/b/file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );
        assert_eq!(
            PlRefPath::new("file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );

        assert_eq!(PlRefPath::new("s3://").file_name(), None);
        assert_eq!(PlRefPath::new("").file_name(), None);
    }
}