polars_utils/
plpath.rs

1use core::fmt;
2use std::path::{Path, PathBuf};
3use std::str::FromStr;
4use std::sync::Arc;
5
6/// A Path or URI
7#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
8#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
9#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
10pub enum PlPath {
11    Local(Arc<Path>),
12    Cloud(PlCloudPath),
13}
14
15/// A reference to a Path or URI
16#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
17pub enum PlPathRef<'a> {
18    Local(&'a Path),
19    Cloud(PlCloudPathRef<'a>),
20}
21
22#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
23#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
24#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
25pub struct PlCloudPath {
26    /// The scheme used in cloud e.g. `s3://` or `file://`.
27    scheme: CloudScheme,
28    /// The full URI e.g. `s3://path/to/bucket`.
29    uri: Arc<str>,
30}
31
32#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
33pub struct PlCloudPathRef<'a> {
34    /// The scheme used in cloud e.g. `s3://` or `file://`.
35    scheme: CloudScheme,
36    /// The full URI e.g. `s3://path/to/bucket`.
37    uri: &'a str,
38}
39
40impl<'a> fmt::Display for PlCloudPathRef<'a> {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.write_str(self.uri())
43    }
44}
45
46impl fmt::Display for PlCloudPath {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        self.as_ref().fmt(f)
49    }
50}
51
52impl PlCloudPath {
53    pub fn as_ref(&self) -> PlCloudPathRef<'_> {
54        PlCloudPathRef {
55            scheme: self.scheme,
56            uri: self.uri.as_ref(),
57        }
58    }
59
60    pub fn strip_scheme(&self) -> &str {
61        &self.uri[self.scheme.as_str().len() + 3..]
62    }
63}
64
65impl PlCloudPathRef<'_> {
66    pub fn into_owned(self) -> PlCloudPath {
67        PlCloudPath {
68            scheme: self.scheme,
69            uri: self.uri.into(),
70        }
71    }
72
73    pub fn scheme(&self) -> CloudScheme {
74        self.scheme
75    }
76
77    pub fn uri(&self) -> &str {
78        self.uri
79    }
80
81    pub fn strip_scheme(&self) -> &str {
82        &self.uri[self.scheme.as_str().len() + "://".len()..]
83    }
84}
85
86pub struct AddressDisplay<'a> {
87    addr: PlPathRef<'a>,
88}
89
90impl<'a> fmt::Display for AddressDisplay<'a> {
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        match self.addr {
93            PlPathRef::Local(p) => p.display().fmt(f),
94            PlPathRef::Cloud(p) => p.fmt(f),
95        }
96    }
97}
98
/// Generates the `CloudScheme` enum from a list of `Variant = "name",` pairs,
/// together with the two conversions between a variant and its name:
/// `FromStr` (name -> variant) and `as_str` (variant -> name).
macro_rules! impl_scheme {
    ($($variant:ident = $name:literal,)+) => {
        /// The URI schemes that are treated as cloud locations.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl FromStr for CloudScheme {
            type Err = ();

            /// Parses a bare scheme name (without `://`). The match is exact
            /// and case-sensitive; anything else yields `Err(())`.
            fn from_str(name: &str) -> Result<Self, Self::Err> {
                match name {
                    $($name => Ok(Self::$variant),)+
                    _ => Err(()),
                }
            }
        }

        impl CloudScheme {
            /// The scheme name as it appears in a URI, without the `://`.
            pub fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }
    };
}

// Declaration order determines the derived `Ord`/`PartialOrd` ordering.
impl_scheme! {
    S3 = "s3",
    S3a = "s3a",
    Gs = "gs",
    Gcs = "gcs",
    File = "file",
    Abfs = "abfs",
    Abfss = "abfss",
    Azure = "azure",
    Az = "az",
    Adl = "adl",
    Http = "http",
    Https = "https",
    Hf = "hf",
}
144
145impl fmt::Display for CloudScheme {
146    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147        f.write_str(self.as_str())
148    }
149}
150
151crate::regex_cache::cached_regex! {
152    static CLOUD_SCHEME_REGEX = r"^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?|hf)$";
153}
154
155impl<'a> PlPathRef<'a> {
156    pub fn scheme(&self) -> Option<CloudScheme> {
157        match self {
158            Self::Local(_) => None,
159            Self::Cloud(p) => Some(p.scheme),
160        }
161    }
162
163    pub fn is_local(&self) -> bool {
164        matches!(self, Self::Local(_))
165    }
166
167    pub fn is_cloud_url(&self) -> bool {
168        matches!(self, Self::Cloud(_))
169    }
170
171    pub fn as_local_path(&self) -> Option<&Path> {
172        match self {
173            Self::Local(p) => Some(p),
174            Self::Cloud(_) => None,
175        }
176    }
177
178    pub fn as_cloud_addr(&self) -> Option<PlCloudPathRef<'_>> {
179        match self {
180            Self::Local(_) => None,
181            Self::Cloud(p) => Some(*p),
182        }
183    }
184
185    pub fn join(&self, other: impl AsRef<str>) -> PlPath {
186        let other = other.as_ref();
187        if other.is_empty() {
188            return self.into_owned();
189        }
190
191        match self {
192            Self::Local(p) => PlPath::Local(p.join(other).into()),
193            Self::Cloud(p) => {
194                let needs_slash = !p.uri.ends_with('/') && !other.starts_with('/');
195
196                let mut out =
197                    String::with_capacity(p.uri.len() + usize::from(needs_slash) + other.len());
198
199                out.push_str(p.uri);
200                if needs_slash {
201                    out.push('/');
202                }
203                // NOTE: This has as a consequence that pushing an absolute path into a URI
204                // just pushes the slashes while for a path it will make that absolute path the new
205                // path. I think this is acceptable as I don't really know what the alternative
206                // would be.
207                out.push_str(other);
208
209                let uri = out.into();
210                PlPath::Cloud(PlCloudPath {
211                    scheme: p.scheme,
212                    uri,
213                })
214            },
215        }
216    }
217
218    pub fn display(&self) -> AddressDisplay<'_> {
219        AddressDisplay { addr: *self }
220    }
221
222    pub fn from_local_path(path: &'a Path) -> Self {
223        Self::Local(path)
224    }
225
226    pub fn new(uri: &'a str) -> Self {
227        if let Some(i) = uri.find([':', '/']) {
228            if uri[i..].starts_with("://") && CLOUD_SCHEME_REGEX.is_match(&uri[..i]) {
229                let scheme = CloudScheme::from_str(&uri[..i]).unwrap();
230                return Self::Cloud(PlCloudPathRef { scheme, uri });
231            }
232        }
233
234        Self::from_local_path(Path::new(uri))
235    }
236
237    pub fn into_owned(self) -> PlPath {
238        match self {
239            Self::Local(p) => PlPath::Local(p.into()),
240            Self::Cloud(p) => PlPath::Cloud(p.into_owned()),
241        }
242    }
243
244    pub fn strip_scheme(&self) -> &str {
245        match self {
246            Self::Local(p) => p.to_str().unwrap(),
247            Self::Cloud(p) => p.strip_scheme(),
248        }
249    }
250
251    pub fn parent(&self) -> Option<Self> {
252        Some(match self {
253            Self::Local(p) => Self::Local(p.parent()?),
254            Self::Cloud(p) => {
255                let uri = p.uri;
256                let offset_start = p.scheme.as_str().len() + 3;
257                let last_slash = uri[offset_start..]
258                    .char_indices()
259                    .rev()
260                    .find(|(_, c)| *c == '/')?
261                    .0;
262                let uri = &uri[..offset_start + last_slash];
263
264                Self::Cloud(PlCloudPathRef {
265                    scheme: p.scheme,
266                    uri,
267                })
268            },
269        })
270    }
271
272    pub fn extension(&self) -> Option<&str> {
273        match self {
274            Self::Local(path) => path.extension().and_then(|e| e.to_str()),
275            Self::Cloud(_) => {
276                let offset_path = self.strip_scheme();
277                let separator = '/';
278
279                let mut ext_start = None;
280                for (i, c) in offset_path.char_indices() {
281                    if c == separator {
282                        ext_start = None;
283                    }
284
285                    if c == '.' && ext_start.is_none() {
286                        ext_start = Some(i);
287                    }
288                }
289
290                ext_start.map(|i| &offset_path[i + 1..])
291            },
292        }
293    }
294
295    pub fn to_str(&self) -> &'a str {
296        match self {
297            Self::Local(p) => p.to_str().unwrap(),
298            Self::Cloud(p) => p.uri,
299        }
300    }
301
302    // It is up to the caller to ensure that the offset parameter 'n' matches
303    // a valid path segment starting index
304    pub fn offset_bytes(&'a self, n: usize) -> PathBuf {
305        let s = self.to_str();
306        if let Some(scheme) = self.scheme()
307            && n > 0
308        {
309            debug_assert!(n >= scheme.as_str().len())
310        }
311        PathBuf::from(&s[n..])
312    }
313}
314
315impl PlPath {
316    pub fn new(uri: &str) -> Self {
317        PlPathRef::new(uri).into_owned()
318    }
319
320    pub fn display(&self) -> AddressDisplay<'_> {
321        AddressDisplay {
322            addr: match self {
323                Self::Local(p) => PlPathRef::Local(p.as_ref()),
324                Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
325            },
326        }
327    }
328
329    pub fn is_local(&self) -> bool {
330        self.as_ref().is_local()
331    }
332
333    pub fn is_cloud_url(&self) -> bool {
334        self.as_ref().is_cloud_url()
335    }
336
337    // We don't want FromStr since we are infallible.
338    #[expect(clippy::should_implement_trait)]
339    pub fn from_str(uri: &str) -> Self {
340        Self::new(uri)
341    }
342
343    pub fn from_string(uri: String) -> Self {
344        Self::new(&uri)
345    }
346
347    pub fn as_ref(&self) -> PlPathRef<'_> {
348        match self {
349            Self::Local(p) => PlPathRef::Local(p.as_ref()),
350            Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
351        }
352    }
353
354    pub fn cloud_scheme(&self) -> Option<CloudScheme> {
355        match self {
356            Self::Local(_) => None,
357            Self::Cloud(p) => Some(p.scheme),
358        }
359    }
360
361    pub fn to_str(&self) -> &str {
362        match self {
363            Self::Local(p) => p.to_str().unwrap(),
364            Self::Cloud(p) => p.uri.as_ref(),
365        }
366    }
367
368    pub fn into_local_path(self) -> Option<Arc<Path>> {
369        match self {
370            PlPath::Local(path) => Some(path),
371            PlPath::Cloud(_) => None,
372        }
373    }
374}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `PlPathRef::join` against both a local path and a `file://` URI
    /// built from the same literals.
    #[test]
    fn plpath_join() {
        // Rewrites the `/` of a test literal into the platform's path separator.
        fn to_native(s: &str) -> String {
            s.chars()
                .map(|c| if c == '/' { std::path::MAIN_SEPARATOR } else { c })
                .collect()
        }

        // `base + added => expected` checks both flavours against `expected`;
        // an optional second literal overrides the expectation for the URI flavour.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $result:literal$(, $uri_result:literal)?) => {{
                // Normal path test
                let joined_path = PlPath::new(&to_native($base))
                    .as_ref()
                    .join(to_native($added));
                assert_eq!(joined_path.to_str(), to_native($result));

                // URI path test
                #[allow(unused_variables)]
                let expected_tail = {
                    let x = $result;
                    $(let x = $uri_result;)?
                    x
                };
                let joined_uri = PlPath::new(&format!("file://{}", $base))
                    .as_ref()
                    .join($added);
                assert_eq!(
                    joined_uri.to_str(),
                    format!("file://{expected_tail}").as_str()
                );
            }};
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "" => "a/b/c");
        assert_plpath_join!("a/b/c" + "/d" => "/d", "a/b/c/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/", "a/b/c/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/", "//d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/", "/x/y/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d", "/x/y/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path", "/a/longer/path");
        assert_plpath_join!("/a/longer" + "path/wow" => "/a/longer/path/wow");
        assert_plpath_join!("/a/longer" + "/path/wow" => "/path/wow", "/a/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "path" => "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "/path" => "/path", "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "path/wow" => "/an/even/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "/path/wow" => "/path/wow", "/an/even/longer/path/wow");
    }
}
439}