polars_utils/
plpath.rs

1use core::fmt;
2use std::path::{Path, PathBuf};
3use std::str::FromStr;
4use std::sync::Arc;
5
/// A Path or URI
// Owned form of `PlPathRef`. `PlPath::new` produces `Cloud` only when the input starts with a
// known `CloudScheme` name followed by `://` (this includes `file://`); every other input
// becomes `Local`.
// NOTE(review): these notes are plain comments because `///` text on a `JsonSchema`-derived type
// presumably ends up in the generated schema description — confirm before converting them.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum PlPath {
    // A local filesystem path, shared behind an `Arc`.
    Local(Arc<Path>),
    // A URI whose scheme is one of the `CloudScheme` variants.
    Cloud(PlCloudPath),
}
14
/// A reference to a Path or URI
///
/// This is the borrowed, `Copy` counterpart of [`PlPath`]; convert between the two with
/// [`PlPath::as_ref`] and [`PlPathRef::into_owned`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum PlPathRef<'a> {
    /// A borrowed local filesystem path.
    Local(&'a Path),
    /// A borrowed URI whose scheme is one of the [`CloudScheme`] variants.
    Cloud(PlCloudPathRef<'a>),
}
21
// An owned URI together with its already-parsed scheme; `PlCloudPathRef` is the borrowed form.
// Both fields are private, so values are built through `PlCloudPathRef::into_owned`, the
// `PlPath`/`PlPathRef` constructors, or (with the `serde` feature) deserialization.
// NOTE(review): plain comments on purpose — `///` text on a `JsonSchema`-derived type presumably
// ends up in the generated schema description; confirm before converting them.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct PlCloudPath {
    /// The scheme used in cloud e.g. `s3://` or `file://`.
    scheme: CloudScheme,
    /// The full URI e.g. `s3://path/to/bucket`.
    // `strip_scheme` slices this at `scheme.as_str().len() + 3`, so the text is expected to
    // start with `<scheme>://`.
    uri: Arc<str>,
}
31
/// A borrowed, `Copy` view of a cloud URI together with its already-parsed scheme.
///
/// Obtain one with [`PlCloudPathRef::new`] (which validates the scheme) or by borrowing a
/// [`PlCloudPath`] via [`PlCloudPath::as_ref`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PlCloudPathRef<'a> {
    /// The scheme used in cloud e.g. `s3://` or `file://`.
    scheme: CloudScheme,
    /// The full URI e.g. `s3://path/to/bucket`.
    // `strip_scheme` slices this just past `<scheme>://`, so the text is expected to start
    // with that prefix.
    uri: &'a str,
}
39
40impl<'a> fmt::Display for PlCloudPathRef<'a> {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.write_str(self.uri())
43    }
44}
45
46impl fmt::Display for PlCloudPath {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        self.as_ref().fmt(f)
49    }
50}
51
52impl PlCloudPath {
53    pub fn as_ref(&self) -> PlCloudPathRef<'_> {
54        PlCloudPathRef {
55            scheme: self.scheme,
56            uri: self.uri.as_ref(),
57        }
58    }
59
60    pub fn strip_scheme(&self) -> &str {
61        &self.uri[self.scheme.as_str().len() + 3..]
62    }
63}
64
65impl PlCloudPathRef<'_> {
66    pub fn new<'a>(uri: &'a str) -> Option<PlCloudPathRef<'a>> {
67        if let Some(i) = uri.find([':', '/']) {
68            if uri[i..].starts_with("://") && CLOUD_SCHEME_REGEX.is_match(&uri[..i]) {
69                let scheme = CloudScheme::from_str(&uri[..i]).unwrap();
70                return Some(PlCloudPathRef { scheme, uri });
71            }
72        }
73
74        None
75    }
76
77    pub fn into_owned(self) -> PlCloudPath {
78        PlCloudPath {
79            scheme: self.scheme,
80            uri: self.uri.into(),
81        }
82    }
83
84    pub fn scheme(&self) -> CloudScheme {
85        self.scheme
86    }
87
88    pub fn uri(&self) -> &str {
89        self.uri
90    }
91
92    pub fn strip_scheme(&self) -> &str {
93        &self.uri[self.scheme.as_str().len() + "://".len()..]
94    }
95}
96
/// Helper returned by [`PlPathRef::display`] and [`PlPath::display`] whose `Display`
/// implementation prints a local path via `Path::display` and a cloud path as its URI.
pub struct AddressDisplay<'a> {
    /// The address to print.
    addr: PlPathRef<'a>,
}
100
101impl<'a> fmt::Display for AddressDisplay<'a> {
102    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
103        match self.addr {
104            PlPathRef::Local(p) => p.display().fmt(f),
105            PlPathRef::Cloud(p) => p.fmt(f),
106        }
107    }
108}
109
// Generates the `CloudScheme` enum from a list of `Variant = "name",` pairs, plus the
// conversions between each variant and its scheme name (`as_str` and `FromStr`).
macro_rules! impl_scheme {
    ($($variant:ident = $name:literal,)+) => {
        // One variant per recognized URI scheme. Variant order follows the invocation and
        // therefore decides the derived ordering.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl CloudScheme {
            /// Returns the scheme name exactly as listed in the macro invocation
            /// (no trailing `://`).
            pub fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }

        impl FromStr for CloudScheme {
            type Err = ();

            /// Exact, case-sensitive lookup of a scheme name; unknown names give `Err(())`.
            fn from_str(s: &str) -> Result<Self, Self::Err> {
                match s {
                    $($name => Ok(Self::$variant),)+
                    _ => Err(()),
                }
            }
        }
    };
}
139
// The URI schemes treated as cloud addresses, written as `Variant = "scheme name"`.
// Keep the names in sync with `CLOUD_SCHEME_REGEX`, which gates URI parsing before the name is
// converted with `CloudScheme::from_str`. The order here is the enum's declaration order and
// therefore affects the derived `PartialOrd`/`Ord`.
impl_scheme! {
    S3 = "s3",
    S3a = "s3a",
    Gs = "gs",
    Gcs = "gcs",
    File = "file",
    Abfs = "abfs",
    Abfss = "abfss",
    Azure = "azure",
    Az = "az",
    Adl = "adl",
    Http = "http",
    Https = "https",
    Hf = "hf",
}
155
156impl fmt::Display for CloudScheme {
157    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
158        f.write_str(self.as_str())
159    }
160}
161
// Matches a complete scheme name (the text before `://`), anchored at both ends.
// The alternatives mirror the names passed to `impl_scheme!`. URI parsing converts a matched
// name with `CloudScheme::from_str`, so the two lists must stay in sync.
crate::regex_cache::cached_regex! {
    static CLOUD_SCHEME_REGEX = r"^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?|hf)$";
}
165
166impl<'a> PlPathRef<'a> {
167    pub fn scheme(&self) -> Option<CloudScheme> {
168        match self {
169            Self::Local(_) => None,
170            Self::Cloud(p) => Some(p.scheme),
171        }
172    }
173
174    pub fn is_local(&self) -> bool {
175        matches!(self, Self::Local(_))
176    }
177
178    pub fn is_cloud_url(&self) -> bool {
179        matches!(self, Self::Cloud(_))
180    }
181
182    pub fn as_local_path(&self) -> Option<&Path> {
183        match self {
184            Self::Local(p) => Some(p),
185            Self::Cloud(_) => None,
186        }
187    }
188
189    pub fn as_cloud_addr(&self) -> Option<PlCloudPathRef<'_>> {
190        match self {
191            Self::Local(_) => None,
192            Self::Cloud(p) => Some(*p),
193        }
194    }
195
196    pub fn join(&self, other: impl AsRef<str>) -> PlPath {
197        let other = other.as_ref();
198        if other.is_empty() {
199            return self.into_owned();
200        }
201
202        match self {
203            Self::Local(p) => PlPath::Local(p.join(other).into()),
204            Self::Cloud(p) => {
205                if let Some(cloud_path) = PlCloudPathRef::new(other) {
206                    return PlPath::Cloud(cloud_path.into_owned());
207                }
208
209                let needs_slash = !p.uri.ends_with('/') && !other.starts_with('/');
210
211                let mut out =
212                    String::with_capacity(p.uri.len() + usize::from(needs_slash) + other.len());
213
214                out.push_str(p.uri);
215                if needs_slash {
216                    out.push('/');
217                }
218                // NOTE: This has as a consequence that pushing an absolute path into a URI
219                // just pushes the slashes while for a path it will make that absolute path the new
220                // path. I think this is acceptable as I don't really know what the alternative
221                // would be.
222                out.push_str(other);
223
224                let uri = out.into();
225                PlPath::Cloud(PlCloudPath {
226                    scheme: p.scheme,
227                    uri,
228                })
229            },
230        }
231    }
232
233    pub fn display(&self) -> AddressDisplay<'_> {
234        AddressDisplay { addr: *self }
235    }
236
237    pub fn from_local_path(path: &'a Path) -> Self {
238        Self::Local(path)
239    }
240
241    pub fn new(uri: &'a str) -> Self {
242        if let Some(i) = uri.find([':', '/']) {
243            if uri[i..].starts_with("://") && CLOUD_SCHEME_REGEX.is_match(&uri[..i]) {
244                let scheme = CloudScheme::from_str(&uri[..i]).unwrap();
245                return Self::Cloud(PlCloudPathRef { scheme, uri });
246            }
247        }
248
249        Self::from_local_path(Path::new(uri))
250    }
251
252    pub fn into_owned(self) -> PlPath {
253        match self {
254            Self::Local(p) => PlPath::Local(p.into()),
255            Self::Cloud(p) => PlPath::Cloud(p.into_owned()),
256        }
257    }
258
259    pub fn strip_scheme(&self) -> &str {
260        match self {
261            Self::Local(p) => p.to_str().unwrap(),
262            Self::Cloud(p) => p.strip_scheme(),
263        }
264    }
265
266    pub fn parent(&self) -> Option<Self> {
267        Some(match self {
268            Self::Local(p) => Self::Local(p.parent()?),
269            Self::Cloud(p) => {
270                let uri = p.uri;
271                let offset_start = p.scheme.as_str().len() + 3;
272                let last_slash = uri[offset_start..]
273                    .char_indices()
274                    .rev()
275                    .find(|(_, c)| *c == '/')?
276                    .0;
277                let uri = &uri[..offset_start + last_slash];
278
279                Self::Cloud(PlCloudPathRef {
280                    scheme: p.scheme,
281                    uri,
282                })
283            },
284        })
285    }
286
287    pub fn extension(&self) -> Option<&str> {
288        match self {
289            Self::Local(path) => path.extension().and_then(|e| e.to_str()),
290            Self::Cloud(_) => {
291                let offset_path = self.strip_scheme();
292                let separator = '/';
293
294                let mut ext_start = None;
295                for (i, c) in offset_path.char_indices() {
296                    if c == separator {
297                        ext_start = None;
298                    }
299
300                    if c == '.' && ext_start.is_none() {
301                        ext_start = Some(i);
302                    }
303                }
304
305                ext_start.map(|i| &offset_path[i + 1..])
306            },
307        }
308    }
309
310    pub fn to_str(&self) -> &'a str {
311        match self {
312            Self::Local(p) => p.to_str().unwrap(),
313            Self::Cloud(p) => p.uri,
314        }
315    }
316
317    // It is up to the caller to ensure that the offset parameter 'n' matches
318    // a valid path segment starting index
319    pub fn offset_bytes(&'a self, n: usize) -> PathBuf {
320        let s = self.to_str();
321        if let Some(scheme) = self.scheme()
322            && n > 0
323        {
324            debug_assert!(n >= scheme.as_str().len())
325        }
326        PathBuf::from(&s[n..])
327    }
328}
329
330impl PlPath {
331    pub fn new(uri: &str) -> Self {
332        PlPathRef::new(uri).into_owned()
333    }
334
335    pub fn display(&self) -> AddressDisplay<'_> {
336        AddressDisplay {
337            addr: match self {
338                Self::Local(p) => PlPathRef::Local(p.as_ref()),
339                Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
340            },
341        }
342    }
343
344    pub fn is_local(&self) -> bool {
345        self.as_ref().is_local()
346    }
347
348    pub fn is_cloud_url(&self) -> bool {
349        self.as_ref().is_cloud_url()
350    }
351
352    // We don't want FromStr since we are infallible.
353    #[expect(clippy::should_implement_trait)]
354    pub fn from_str(uri: &str) -> Self {
355        Self::new(uri)
356    }
357
358    pub fn from_string(uri: String) -> Self {
359        Self::new(&uri)
360    }
361
362    pub fn as_ref(&self) -> PlPathRef<'_> {
363        match self {
364            Self::Local(p) => PlPathRef::Local(p.as_ref()),
365            Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
366        }
367    }
368
369    pub fn cloud_scheme(&self) -> Option<CloudScheme> {
370        match self {
371            Self::Local(_) => None,
372            Self::Cloud(p) => Some(p.scheme),
373        }
374    }
375
376    pub fn to_str(&self) -> &str {
377        match self {
378            Self::Local(p) => p.to_str().unwrap(),
379            Self::Cloud(p) => p.uri.as_ref(),
380        }
381    }
382
383    pub fn into_local_path(self) -> Option<Arc<Path>> {
384        match self {
385            PlPath::Local(path) => Some(path),
386            PlPath::Cloud(_) => None,
387        }
388    }
389}
390
#[cfg(test)]
mod tests {
    use super::*;

    /// Replaces every `/` with the platform's main separator so that the local-path
    /// expectations hold on every OS.
    fn to_native(s: &str) -> String {
        s.chars()
            .map(|c| if c == '/' { std::path::MAIN_SEPARATOR } else { c })
            .collect()
    }

    #[test]
    fn plpath_join() {
        // `base + added => result[, uri_result]`: checks the join once as a local path and
        // once as a `file://` URI. The optional second literal is the expected URI result
        // when it differs from the local one.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $result:literal$(, $uri_result:literal)?) => {
                // Normal path test
                let path_base = to_native($base);
                let path_added = to_native($added);
                let path_result = to_native($result);
                assert_eq!(PlPath::new(&path_base).as_ref().join(path_added).to_str(), path_result);

                // URI path test
                let uri_base = format!("file://{}", $base);
                #[allow(unused_variables)]
                let result = {
                    let x = $result;
                    $(let x = $uri_result;)?
                    x
                };
                let uri_result = format!("file://{result}");
                let joined = PlPath::new(uri_base.as_str()).as_ref().join($added);
                assert_eq!(joined.to_str(), uri_result.as_str());
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "" => "a/b/c");
        assert_plpath_join!("a/b/c" + "/d" => "/d", "a/b/c/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/", "a/b/c/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/", "//d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/", "/x/y/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d", "/x/y/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path", "/a/longer/path");
        assert_plpath_join!("/a/longer" + "path/wow" => "/a/longer/path/wow");
        assert_plpath_join!("/a/longer" + "/path/wow" => "/path/wow", "/a/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "path" => "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "/path" => "/path", "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "path/wow" => "/an/even/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "/path/wow" => "/path/wow", "/an/even/longer/path/wow");
    }
}
454}