polars_utils/
plpath.rs

1use core::fmt;
2use std::ffi::OsStr;
3use std::path::{Path, PathBuf};
4use std::sync::Arc;
5
/// A Path or URI
// Owned counterpart of `PlPathRef`: the local variant keeps its path behind an
// `Arc<Path>` and the cloud variant's `PlCloudPath` keeps its URI behind an `Arc<str>`.
// NOTE(review): these notes are plain `//` comments on purpose. `///` docs on a type
// deriving `schemars::JsonSchema` presumably end up in the generated schema, so
// confirm before promoting them to doc comments.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum PlPath {
    // A filesystem path. `PlPathRef::new` falls back to this variant when
    // `CloudScheme::from_uri` does not recognise a scheme in the input.
    Local(Arc<Path>),
    // A URI whose scheme was recognised as a `CloudScheme`.
    Cloud(PlCloudPath),
}
14
/// A reference to a Path or URI
///
/// Borrowed, `Copy` counterpart of [`PlPath`]. Obtain one with [`PlPath::as_ref`],
/// [`PlPathRef::new`] or [`PlPathRef::from_local_path`], and convert back with
/// [`PlPathRef::into_owned`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum PlPathRef<'a> {
    /// A borrowed filesystem path.
    Local(&'a Path),
    /// A borrowed URI whose scheme was recognised as a [`CloudScheme`].
    Cloud(PlCloudPathRef<'a>),
}
21
// Owned URI together with the scheme that was parsed from it.
// The constructors visible in this file (`PlCloudPathRef::into_owned` and
// `PlPathRef::join`) keep `scheme` consistent with the prefix of `uri`;
// `strip_scheme` relies on that when it slices the scheme off.
// NOTE(review): kept as `//` comments because `///` docs on a `JsonSchema` type
// presumably feed the generated schema; confirm before converting.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct PlCloudPath {
    /// The scheme used in cloud e.g. `s3://` or `file://`.
    scheme: CloudScheme,
    /// The full URI e.g. `s3://path/to/bucket`.
    uri: Arc<str>,
}
31
/// Borrowed, `Copy` view of a cloud URI and the scheme parsed from it.
///
/// Built by [`PlCloudPathRef::new`] (which returns `None` when
/// [`CloudScheme::from_uri`] does not recognise the scheme) or by
/// [`PlCloudPath::as_ref`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PlCloudPathRef<'a> {
    /// The scheme used in cloud e.g. `s3://` or `file://`.
    scheme: CloudScheme,
    /// The full URI, scheme included, e.g. `s3://path/to/bucket`.
    uri: &'a str,
}
39
40impl<'a> fmt::Display for PlCloudPathRef<'a> {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.write_str(self.uri())
43    }
44}
45
46impl fmt::Display for PlCloudPath {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        self.as_ref().fmt(f)
49    }
50}
51
52impl PlCloudPath {
53    pub fn as_ref(&self) -> PlCloudPathRef<'_> {
54        PlCloudPathRef {
55            scheme: self.scheme,
56            uri: self.uri.as_ref(),
57        }
58    }
59
60    pub fn strip_scheme(&self) -> &str {
61        self.scheme.strip_scheme_from_uri(&self.uri)
62    }
63}
64
65impl PlCloudPathRef<'_> {
66    pub fn new<'a>(uri: &'a str) -> Option<PlCloudPathRef<'a>> {
67        CloudScheme::from_uri(uri).map(|scheme| PlCloudPathRef { scheme, uri })
68    }
69
70    pub fn into_owned(self) -> PlCloudPath {
71        PlCloudPath {
72            scheme: self.scheme,
73            uri: self.uri.into(),
74        }
75    }
76
77    pub fn scheme(&self) -> CloudScheme {
78        self.scheme
79    }
80
81    pub fn uri(&self) -> &str {
82        self.uri
83    }
84
85    pub fn strip_scheme(&self) -> &str {
86        self.scheme.strip_scheme_from_uri(self.uri)
87    }
88}
89
/// Helper returned by [`PlPath::display`] and [`PlPathRef::display`] whose
/// [`fmt::Display`] impl prints the wrapped path: local paths through
/// `Path::display`, cloud paths as their URI.
pub struct PlPathDisplay<'a> {
    /// The path to print.
    path: PlPathRef<'a>,
}
93
94impl<'a> fmt::Display for PlPathDisplay<'a> {
95    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96        match self.path {
97            PlPathRef::Local(p) => p.display().fmt(f),
98            PlPathRef::Cloud(p) => p.fmt(f),
99        }
100    }
101}
102
// Generates the `CloudScheme` enum from a `Variant = "name",` list, together with
// the name -> variant lookup (`from_scheme`) and the variant -> name lookup (`as_str`).
macro_rules! impl_cloud_scheme {
    ($($t:ident = $n:literal,)+) => {
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($t,)+
        }

        impl CloudScheme {
            /// Looks up a scheme by its bare name (no `://`).
            ///
            /// Private on purpose: callers should go through [`CloudScheme::from_uri`],
            /// which also deals with `file:/` URIs that have no hostname.
            // Variants may share a name. Arms are tried in declaration order, so the
            // first variant wins and the later duplicate arms are unreachable.
            #[expect(unreachable_patterns)]
            fn from_scheme(s: &str) -> Option<Self> {
                match s {
                    $($n => Some(Self::$t),)+
                    _ => None,
                }
            }

            /// The scheme name this variant was declared with.
            pub const fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$t => $n,)+
                }
            }
        }
    };
}
131
// Instantiates `CloudScheme` with one variant per recognised URI scheme name.
// `File` and `FileNoHostname` share the name "file": the generated `from_scheme`
// tries its arms in this order, so "file" resolves to `File`, and `FileNoHostname`
// is only produced by `CloudScheme::from_uri` for `file:/…` URIs.
// The order of the entries is also the variant order used by the derived `Ord`.
impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}
148
149impl CloudScheme {
150    pub fn from_uri(path: &str) -> Option<Self> {
151        if path.starts_with("file:/") {
152            return Some(if path.as_bytes().get(6) != Some(&b'/') {
153                Self::FileNoHostname
154            } else {
155                Self::File
156            });
157        }
158
159        Self::from_scheme(&path[..path.find("://")?])
160    }
161
162    pub fn strip_scheme_from_uri<'a>(&self, uri: &'a str) -> &'a str {
163        &uri[self.strip_scheme_index()..]
164    }
165
166    /// Returns `i` such that `&self.as_str()[i..]` strips the scheme, as well as the `://` if it
167    /// exists.
168    pub fn strip_scheme_index(&self) -> usize {
169        if let Self::FileNoHostname = self {
170            5
171        } else {
172            self.as_str().len() + 3
173        }
174    }
175}
176
177impl fmt::Display for CloudScheme {
178    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179        f.write_str(self.as_str())
180    }
181}
182
183impl<'a> PlPathRef<'a> {
184    pub fn scheme(&self) -> Option<CloudScheme> {
185        match self {
186            Self::Local(_) => None,
187            Self::Cloud(p) => Some(p.scheme),
188        }
189    }
190
191    pub fn is_local(&self) -> bool {
192        matches!(self, Self::Local(_))
193    }
194
195    pub fn is_cloud_url(&self) -> bool {
196        matches!(self, Self::Cloud(_))
197    }
198
199    pub fn as_local_path(&self) -> Option<&Path> {
200        match self {
201            Self::Local(p) => Some(p),
202            Self::Cloud(_) => None,
203        }
204    }
205
206    pub fn as_cloud_path(&'a self) -> Option<PlCloudPathRef<'a>> {
207        match self {
208            Self::Local(_) => None,
209            Self::Cloud(p) => Some(*p),
210        }
211    }
212
213    pub fn join(&self, other: impl AsRef<str>) -> PlPath {
214        let other = other.as_ref();
215        if other.is_empty() {
216            return self.into_owned();
217        }
218
219        match self {
220            Self::Local(p) => PlPath::Local(p.join(other).into()),
221            Self::Cloud(p) => {
222                if let Some(cloud_path) = PlCloudPathRef::new(other) {
223                    return PlPath::Cloud(cloud_path.into_owned());
224                }
225
226                let needs_slash = !p.uri.ends_with('/') && !other.starts_with('/');
227
228                let mut out =
229                    String::with_capacity(p.uri.len() + usize::from(needs_slash) + other.len());
230
231                out.push_str(p.uri);
232                if needs_slash {
233                    out.push('/');
234                }
235                // NOTE: This has as a consequence that pushing an absolute path into a URI
236                // just pushes the slashes while for a path it will make that absolute path the new
237                // path. I think this is acceptable as I don't really know what the alternative
238                // would be.
239                out.push_str(other);
240
241                let uri = out.into();
242                PlPath::Cloud(PlCloudPath {
243                    scheme: p.scheme,
244                    uri,
245                })
246            },
247        }
248    }
249
250    pub fn display(&self) -> PlPathDisplay<'_> {
251        PlPathDisplay { path: *self }
252    }
253
254    pub fn from_local_path(path: &'a Path) -> Self {
255        Self::Local(path)
256    }
257
258    pub fn new(uri: &'a str) -> Self {
259        if let Some(scheme) = CloudScheme::from_uri(uri) {
260            Self::Cloud(PlCloudPathRef { scheme, uri })
261        } else {
262            Self::from_local_path(Path::new(uri))
263        }
264    }
265
266    pub fn into_owned(self) -> PlPath {
267        match self {
268            Self::Local(p) => PlPath::Local(p.into()),
269            Self::Cloud(p) => PlPath::Cloud(p.into_owned()),
270        }
271    }
272
273    pub fn strip_scheme(&self) -> &str {
274        match self {
275            Self::Local(p) => p.to_str().unwrap(),
276            Self::Cloud(p) => p.strip_scheme(),
277        }
278    }
279
280    pub fn parent(&self) -> Option<Self> {
281        Some(match self {
282            Self::Local(p) => Self::Local(p.parent()?),
283            Self::Cloud(p) => {
284                let uri = p.uri;
285                let offset_start = p.scheme.strip_scheme_index();
286                let last_slash = uri[offset_start..]
287                    .char_indices()
288                    .rev()
289                    .find(|(_, c)| *c == '/')?
290                    .0;
291                let uri = &uri[..offset_start + last_slash];
292
293                Self::Cloud(PlCloudPathRef {
294                    scheme: p.scheme,
295                    uri,
296                })
297            },
298        })
299    }
300
301    pub fn file_name(&self) -> Option<&OsStr> {
302        match self {
303            Self::Local(p) => {
304                if p.is_dir() {
305                    None
306                } else {
307                    p.file_name()
308                }
309            },
310            Self::Cloud(p) => {
311                if p.scheme() == CloudScheme::File
312                    && std::fs::metadata(p.strip_scheme()).is_ok_and(|x| x.is_dir())
313                {
314                    return None;
315                }
316
317                let p = p.strip_scheme();
318                let out = p.rfind('/').map_or(p, |i| &p[i + 1..]);
319                (!out.is_empty()).then_some(out.as_ref())
320            },
321        }
322    }
323
324    pub fn extension(&self) -> Option<&str> {
325        match self {
326            Self::Local(path) => path.extension().and_then(|e| e.to_str()),
327            Self::Cloud(_) => {
328                let after_scheme = self.strip_scheme();
329
330                after_scheme.rfind(['.', '/']).and_then(|i| {
331                    after_scheme[i..]
332                        .starts_with('.')
333                        .then_some(&after_scheme[i + 1..])
334                })
335            },
336        }
337    }
338
339    pub fn to_str(&self) -> &'a str {
340        match self {
341            Self::Local(p) => p.to_str().unwrap(),
342            Self::Cloud(p) => p.uri,
343        }
344    }
345
346    // It is up to the caller to ensure that the offset parameter 'n' matches
347    // a valid path segment starting index
348    pub fn offset_bytes(&'a self, n: usize) -> PathBuf {
349        let s = self.to_str();
350        if let Some(scheme) = self.scheme()
351            && n > 0
352        {
353            debug_assert!(n >= scheme.as_str().len())
354        }
355        PathBuf::from(&s[n..])
356    }
357
358    /// Strips the scheme, then returns the authority component, and the remaining
359    /// string after the authority component. This can be understood as extracting
360    /// the bucket/prefix for cloud URIs.
361    ///
362    ///  E.g. `https://user@host:port/dir/file?param=value`
363    /// * Authority: `user@host:port`
364    /// * Remaining: `/dir/file?param=value`
365    ///
366    /// Note, for local / `file:` URIs, the returned authority will be empty, and
367    /// the remainder will be the full URI.
368    ///
369    /// # Returns
370    /// (authority, remaining).
371    pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
372        match self.scheme() {
373            None | Some(CloudScheme::File | CloudScheme::FileNoHostname) => {
374                Some(("", self.strip_scheme()))
375            },
376            Some(scheme) => {
377                let path_str = self.to_str();
378                let position = self.authority_end_position();
379
380                if position < path_str.len() {
381                    assert!(path_str[position..].starts_with('/'));
382                }
383
384                (position < path_str.len()).then_some((
385                    &path_str[scheme.strip_scheme_index()..position],
386                    &path_str[position..],
387                ))
388            },
389        }
390    }
391
392    /// Returns `i` such that `&self.to_str()[..i]` trims to the authority. If there is no '/'
393    /// separator found, `i` will simply be the length of the string.
394    pub fn authority_end_position(&self) -> usize {
395        match self.scheme() {
396            None | Some(CloudScheme::File | CloudScheme::FileNoHostname) => 0,
397            Some(_) => {
398                let after_scheme = self.strip_scheme();
399                let offset = self.to_str().len() - after_scheme.len();
400
401                offset + after_scheme.find('/').unwrap_or(after_scheme.len())
402            },
403        }
404    }
405
406    /// # Returns
407    /// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
408    pub fn to_absolute_path(&self) -> Option<PlPath> {
409        if let Self::Local(p) = self
410            && !p.is_absolute()
411            && !p.to_str().unwrap().is_empty()
412        {
413            Some(PlPath::new(
414                std::path::absolute(p).unwrap().to_str().unwrap(),
415            ))
416        } else {
417            None
418        }
419    }
420}
421
422impl PlPath {
423    pub fn new(uri: &str) -> Self {
424        PlPathRef::new(uri).into_owned()
425    }
426
427    pub fn display(&self) -> PlPathDisplay<'_> {
428        PlPathDisplay {
429            path: match self {
430                Self::Local(p) => PlPathRef::Local(p.as_ref()),
431                Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
432            },
433        }
434    }
435
436    pub fn is_local(&self) -> bool {
437        self.as_ref().is_local()
438    }
439
440    pub fn is_cloud_url(&self) -> bool {
441        self.as_ref().is_cloud_url()
442    }
443
444    // We don't want FromStr since we are infallible.
445    #[expect(clippy::should_implement_trait)]
446    pub fn from_str(uri: &str) -> Self {
447        Self::new(uri)
448    }
449
450    pub fn from_string(uri: String) -> Self {
451        Self::new(&uri)
452    }
453
454    pub fn as_ref(&self) -> PlPathRef<'_> {
455        match self {
456            Self::Local(p) => PlPathRef::Local(p.as_ref()),
457            Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
458        }
459    }
460
461    pub fn cloud_scheme(&self) -> Option<CloudScheme> {
462        match self {
463            Self::Local(_) => None,
464            Self::Cloud(p) => Some(p.scheme),
465        }
466    }
467
468    pub fn to_str(&self) -> &str {
469        match self {
470            Self::Local(p) => p.to_str().unwrap(),
471            Self::Cloud(p) => p.uri.as_ref(),
472        }
473    }
474
475    pub fn into_local_path(self) -> Option<Arc<Path>> {
476        match self {
477            PlPath::Local(path) => Some(path),
478            PlPath::Cloud(_) => None,
479        }
480    }
481}
482
#[cfg(test)]
mod tests {
    use super::*;

    /// Both spellings of a `file:` URI keep their text, report the `file` scheme name
    /// and strip down to the same path, but map to different scheme variants.
    #[test]
    fn test_plpath_file() {
        let cases = [
            ("file:///home/user", CloudScheme::File),
            ("file:/home/user", CloudScheme::FileNoHostname),
        ];

        for (uri, scheme) in cases {
            let p = PlPath::new(uri);
            assert_eq!(
                (
                    p.cloud_scheme(),
                    p.cloud_scheme().map(|x| x.as_str()),
                    p.to_str(),
                    p.as_ref().strip_scheme(),
                ),
                (Some(scheme), Some("file"), uri, "/home/user")
            );
        }
    }

    #[test]
    fn plpath_join() {
        /// Rewrites `/` into the platform's main separator so the expectations below
        /// can be written once for every platform.
        fn native(s: &str) -> String {
            s.chars()
                .map(|c| if c == '/' { std::path::MAIN_SEPARATOR } else { c })
                .collect()
        }

        fn _assert_plpath_join(base: &str, added: &str, expect: &str, expect_uri: Option<&str>) {
            // Normal path test
            let path_base = native(base);
            let path_added = native(added);
            let path_result = native(expect);
            assert_eq!(
                PlPath::new(&path_base).as_ref().join(path_added).to_str(),
                path_result
            );

            // URI path test
            let Some(expect_uri) = expect_uri else {
                return;
            };
            let uri_base = format!("file://{base}");
            let uri_result = format!("file://{expect_uri}");
            assert_eq!(
                PlPath::new(uri_base.as_str()).as_ref().join(added).to_str(),
                uri_result.as_str()
            );
        }

        // `base + added => local_result[, uri_result]`; the URI check only runs when
        // a URI expectation is given.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $expect:literal) => {
                _assert_plpath_join($base, $added, $expect, None)
            };
            ($base:literal + $added:literal => $expect:literal, $uri_result:literal) => {
                _assert_plpath_join($base, $added, $expect, Some($uri_result))
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "" => "a/b/c");
        assert_plpath_join!("a/b/c" + "/d" => "/d", "a/b/c/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/", "a/b/c/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/", "//d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/", "/x/y/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d", "/x/y/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path", "/a/longer/path");
        assert_plpath_join!("/a/longer" + "path/wow" => "/a/longer/path/wow");
        assert_plpath_join!("/a/longer" + "/path/wow" => "/path/wow", "/a/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "path" => "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "/path" => "/path", "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "path/wow" => "/an/even/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "/path/wow" => "/path/wow", "/an/even/longer/path/wow");
    }

    #[test]
    fn test_plpath_name() {
        // Inputs that have a final component, paired with that component.
        let named = [
            ("s3://...", "..."),
            ("a/b/file.parquet", "file.parquet"),
            ("file.parquet", "file.parquet"),
        ];
        for (uri, expected) in named {
            assert_eq!(PlPathRef::new(uri).file_name(), Some(expected.as_ref()));
        }

        // Inputs without any final component.
        for uri in ["s3://", ""] {
            assert_eq!(PlPathRef::new(uri).file_name(), None);
        }
    }
}