1use std::borrow::{Borrow, Cow};
2use std::ffi::OsStr;
3use std::fmt::Display;
4use std::ops::{Deref, Range};
5use std::path::{Path, PathBuf};
6use std::sync::{LazyLock, RwLock};
7
8use polars_error::{PolarsResult, polars_bail, polars_err};
9
10use crate::aliases::PlHashSet;
11use crate::format_pl_refstr;
12use crate::pl_str::PlRefStr;
13
/// The `\\?\` prefix. `PlPath::normalize_windows_path` strips it from the front of a path,
/// and `format_file_uri` treats any path starting with it as a Windows-style path.
pub const WINDOWS_EXTPATH_PREFIX: &str = r#"\\?\"#;
17
18const BUILTIN_EXT_SCHEMES: &[&str] = &["hdfs"];
19
20pub static ALLOWED_EXT_SCHEMES: LazyLock<RwLock<PlHashSet<&'static str>>> =
23 LazyLock::new(|| RwLock::new(PlHashSet::from_iter(BUILTIN_EXT_SCHEMES.iter().copied())));
24
25fn get_ext_scheme(s: &str) -> Option<&'static str> {
27 ALLOWED_EXT_SCHEMES.read().unwrap().get(s).copied()
28}
29
30pub fn ext_scheme_allowed(s: &str) -> bool {
32 get_ext_scheme(s).is_some()
33}
34
35#[doc(hidden)]
38pub fn _allow_ext_scheme(scheme: &'static str) -> PolarsResult<()> {
39 let valid = scheme
40 .chars()
41 .next()
42 .is_some_and(|c| c.is_ascii_alphabetic())
43 && scheme
44 .chars()
45 .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
46
47 if !valid {
48 polars_bail!(
49 InvalidOperation:
50 "invalid scheme '{}': must start with a letter and contain only \
51 letters, digits, '+', '-', '.'",
52 scheme
53 );
54 }
55
56 ALLOWED_EXT_SCHEMES.write().unwrap().insert(scheme);
57 Ok(())
58}
59
60#[doc(hidden)]
62pub fn _disallow_ext_scheme(scheme: &str) {
63 if BUILTIN_EXT_SCHEMES.contains(&scheme) {
64 return; }
66 ALLOWED_EXT_SCHEMES.write().unwrap().remove(scheme);
67}
68
/// Borrowed, unsized path string: a thin wrapper around `str`.
///
/// The type is `#[repr(transparent)]`, so it has the same layout as `str`. The pointer
/// casts between `&str` and `&PlPath` (and between `Box<str>` and `Box<PlPath>`) in this
/// file depend on that.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct PlPath {
    /// The path text, possibly including a URI scheme prefix.
    inner: str,
}
79
// Owned path string backed by a `PlRefStr`. It dereferences to `PlPath`, so the borrowed
// path API is available on it as well.
//
// NOTE(review): plain `//` comments are used here on purpose. Doc comments on this type
// would presumably be picked up by the `schemars::JsonSchema` derive and alter the
// generated schema; confirm before converting these to `///`.
#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct PlRefPath {
    // The path text. `PlRefPath::new` may rewrite Windows-style input before storing it;
    // `_new_no_normalize` stores the text exactly as given.
    inner: PlRefStr,
}
91
92impl PlPath {
93 fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
95 let s: &str = s.as_ref();
96 unsafe { &*(s as *const str as *const PlPath) }
98 }
99
100 fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
101 path.to_str()
102 .ok_or_else(|| polars_err!(non_utf8_path))
103 .map(Self::_new)
104 }
105
106 pub fn as_str(&self) -> &str {
107 unsafe { &*(self as *const PlPath as *const str) }
108 }
109
110 pub fn as_bytes(&self) -> &[u8] {
111 self.as_str().as_bytes()
112 }
113
114 pub fn as_os_str(&self) -> &OsStr {
115 OsStr::new(self)
116 }
117
118 pub fn as_std_path(&self) -> &Path {
119 Path::new(self)
120 }
121
122 pub fn to_ref_path(&self) -> PlRefPath {
123 PlRefPath::_new_no_normalize(self.as_str().into())
124 }
125
126 pub fn scheme(&self) -> Option<CloudScheme> {
127 CloudScheme::from_path(self.as_str())
128 }
129
130 pub fn has_scheme(&self) -> bool {
132 self.scheme().is_some()
133 }
134
135 pub fn strip_scheme(&self) -> &str {
137 &self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
138 }
139
140 pub fn file_name(&self) -> Option<&OsStr> {
141 Path::new(self.strip_scheme()).file_name()
142 }
143
144 pub fn extension(&self) -> Option<&str> {
145 Path::new(self.strip_scheme())
146 .extension()
147 .map(|x| x.to_str().unwrap())
148 }
149
150 pub fn parent(&self) -> Option<&str> {
151 Path::new(self.strip_scheme())
152 .parent()
153 .map(|x| x.to_str().unwrap())
154 }
155
156 pub fn sliced(&self, range: Range<usize>) -> &PlPath {
158 Self::_new(&self.as_str()[range])
159 }
160
161 pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
175 match self.scheme() {
176 None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
177 Some(scheme) => {
178 let path_str = self.as_str();
179 let position = self.authority_end_position();
180
181 if position < path_str.len() {
182 assert!(path_str[position..].starts_with('/'));
183 }
184
185 (position < path_str.len()).then_some((
186 &path_str[scheme.strip_scheme_index()..position],
187 &path_str[position..],
188 ))
189 },
190 }
191 }
192
193 pub fn authority_end_position(&self) -> usize {
199 match self.scheme() {
200 None => 0,
201 Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
202 Some(_) => {
203 let after_scheme = self.strip_scheme();
204 let offset = self.as_str().len() - after_scheme.len();
205
206 offset + after_scheme.find('/').unwrap_or(after_scheme.len())
207 },
208 }
209 }
210
211 pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
212 PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
213 }
214
215 pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
216 let other = other.as_ref();
217
218 if CloudScheme::from_path(other).is_some()
219 || other.starts_with('/')
220 || other.starts_with('\\')
221 {
222 PlRefPath::new(other)
223 } else if CloudScheme::from_path(self.as_str()).is_some() {
224 let lhs = self.as_str().trim_end_matches('/');
225 PlRefPath::new(format!("{lhs}/{other}"))
226 } else {
227 PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
228 }
229 }
230
231 pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
233 let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
234
235 if has_extpath_prefix || cfg!(target_family = "windows") {
236 let path_str = path_str
237 .strip_prefix(WINDOWS_EXTPATH_PREFIX)
238 .unwrap_or(path_str);
239
240 if matches!(
241 CloudScheme::from_path(path_str),
242 None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
243 ) && path_str.contains('\\')
244 {
245 let new_path = path_str.replace('\\', "/");
246 let inner = PlRefStr::from_string(new_path);
247 return Some(PlRefPath { inner });
248 }
249 }
250
251 None
252 }
253}
254
255impl AsRef<str> for PlPath {
256 fn as_ref(&self) -> &str {
257 self.as_str()
258 }
259}
260
261impl AsRef<OsStr> for PlPath {
262 fn as_ref(&self) -> &OsStr {
263 OsStr::new(self.as_str())
264 }
265}
266
267impl AsRef<Path> for PlPath {
268 fn as_ref(&self) -> &Path {
269 self.as_std_path()
270 }
271}
272
273impl From<&PlPath> for Box<PlPath> {
274 fn from(value: &PlPath) -> Self {
275 let s: &str = value.as_str();
276 let s: Box<str> = s.into();
277 let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
279 out
280 }
281}
282
283impl Display for PlPath {
284 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
285 Display::fmt(self.as_str(), f)
286 }
287}
288
289impl PlRefPath {
290 pub fn empty() -> Self {
291 Self::default()
292 }
293
294 pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
296 if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
297 return path;
298 }
299
300 Self::_new_no_normalize(path.into())
301 }
302
303 const fn _new_no_normalize(path: PlRefStr) -> Self {
304 Self { inner: path }
305 }
306
307 pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
308 Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
309 }
310
311 pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
312 Self::try_from_path(&path)
313 }
314
315 pub fn as_str(&self) -> &str {
316 &self.inner
317 }
318
319 pub fn as_ref_str(&self) -> &PlRefStr {
320 &self.inner
321 }
322
323 pub fn into_ref_str(self) -> PlRefStr {
324 self.inner
325 }
326
327 pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
329 if range == (0..self.as_str().len()) {
330 self.clone()
331 } else {
332 Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
333 }
334 }
335
336 pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
339 Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
340 Cow::Borrowed(self)
341 } else {
342 Cow::Owned(PlPath::to_absolute_path(self)?)
343 })
344 }
345
346 pub fn ptr_eq(this: &Self, other: &Self) -> bool {
348 PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
349 }
350}
351
352impl AsRef<str> for PlRefPath {
353 fn as_ref(&self) -> &str {
354 self.as_str()
355 }
356}
357
358impl AsRef<OsStr> for PlRefPath {
359 fn as_ref(&self) -> &OsStr {
360 self.as_os_str()
361 }
362}
363
364impl AsRef<Path> for PlRefPath {
365 fn as_ref(&self) -> &Path {
366 self.as_std_path()
367 }
368}
369
370impl Deref for PlRefPath {
371 type Target = PlPath;
372
373 fn deref(&self) -> &Self::Target {
374 PlPath::_new(self)
375 }
376}
377
378impl Display for PlRefPath {
379 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
380 Display::fmt(self.as_str(), f)
381 }
382}
383
384impl ToOwned for PlPath {
385 type Owned = PlRefPath;
386
387 fn to_owned(&self) -> Self::Owned {
388 self.to_ref_path()
389 }
390}
391
392impl Borrow<PlPath> for PlRefPath {
393 fn borrow(&self) -> &PlPath {
394 self
395 }
396}
397
398impl From<&str> for PlRefPath {
399 fn from(value: &str) -> Self {
400 Self::new(value)
401 }
402}
403
// Generates the `CloudScheme` enum and its string conversions from a list of
// `Variant = "scheme"` pairs.
//
// Plain `//` comments are used on the generated enum on purpose. NOTE(review): doc comments
// there would presumably be picked up by the `schemars::JsonSchema` derive; confirm before
// converting to `///`.
macro_rules! impl_cloud_scheme {
    ($($t:ident = $n:literal,)+) => {
        // One unit variant per listed scheme, plus `Ext` carrying a scheme string obtained
        // from `get_ext_scheme`.
        #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($t,)+
            Ext(&'static str)
        }

        impl CloudScheme {
            // Maps a bare scheme string (no `://`) to a variant. Listed schemes are tried
            // first, in declaration order, and anything else is looked up with
            // `get_ext_scheme`.
            //
            // Two listed variants may share one string, which makes the later arm
            // unreachable; the `expect` attribute acknowledges that. The first variant
            // listed for a string is the one returned.
            #[expect(unreachable_patterns)]
            fn from_scheme_str(s: &str) -> Option<Self> {
                match s {
                    $($n => Some(Self::$t),)+
                    _ => get_ext_scheme(s).map(Self::Ext),
                }
            }

            // Returns `true` only when `s` names one of the listed schemes. Extension
            // schemes and unknown strings yield `false`.
            pub fn is_native_str(s: &str) -> bool {
                match Self::from_scheme_str(s) {
                    None | Some(Self::Ext(_)) => false,
                    Some(_) => true,
                }
            }

            // Returns the scheme string of this variant, without any `:` or `://`.
            pub const fn as_str(&self) -> &'static str {
                match self {
                    $(Self::$t => $n,)+
                    Self::Ext(s) => s,
                }
            }
        }
    };
}
442
// The natively supported schemes: each entry becomes a `CloudScheme` variant mapped to its
// scheme string.
impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    // Shares the "file" string with `File`. `CloudScheme::from_path` selects this variant
    // for paths that start with `file:` but do not continue with `//`.
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}
459
460impl CloudScheme {
461 pub fn from_path(path: &str) -> Option<Self> {
462 if let Some(stripped) = path.strip_prefix("file:") {
463 return Some(if stripped.starts_with("//") {
464 Self::File
465 } else {
466 Self::FileNoHostname
467 });
468 }
469
470 Self::from_scheme_str(&path[..path.find("://")?])
471 }
472
473 pub fn strip_scheme_index(&self) -> usize {
476 if let Self::FileNoHostname = self {
477 5
478 } else {
479 self.as_str().len() + 3
480 }
481 }
482}
483
484impl Display for CloudScheme {
485 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
486 Display::fmt(self.as_str(), f)
487 }
488}
489
490pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
495 if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
505 if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
506 PlRefPath::new(format_pl_refstr!("file:///{path}"))
507 } else {
508 PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
509 }
510 } else {
511 PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
512 }
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Scheme detection, scheme stripping and authority splitting for `file:` URIs, plus
    /// normalization of a path carrying the `\\?\` prefix.
    #[test]
    fn test_plpath_file() {
        let with_slashes = PlRefPath::new("file:///home/user");
        assert_eq!(with_slashes.scheme(), Some(CloudScheme::File));
        assert_eq!(with_slashes.scheme().map(|x| x.as_str()), Some("file"));
        assert_eq!(with_slashes.as_str(), "file:///home/user");
        assert_eq!(with_slashes.strip_scheme(), "/home/user");

        let no_hostname = PlRefPath::new("file:/home/user");
        assert_eq!(no_hostname.scheme(), Some(CloudScheme::FileNoHostname));
        assert_eq!(no_hostname.scheme().map(|x| x.as_str()), Some("file"));
        assert_eq!(no_hostname.as_str(), "file:/home/user");
        assert_eq!(no_hostname.strip_scheme(), "/home/user");

        let scheme_cases = [
            ("file://", CloudScheme::File),
            ("file:", CloudScheme::FileNoHostname),
            ("file:/", CloudScheme::FileNoHostname),
        ];

        for (uri, expected) in scheme_cases {
            assert_eq!(
                PlRefPath::new(uri).scheme(),
                Some(expected),
                "scheme of {uri:?}"
            );
        }

        let authority_cases: [(&str, Option<(&str, &str)>); 6] = [
            ("file://", None),
            ("file:///", Some(("", "/"))),
            ("file:///path", Some(("", "/path"))),
            ("file://hostname:80/path", Some(("hostname:80", "/path"))),
            ("file:", Some(("", ""))),
            ("file:/Local/path", Some(("", "/Local/path"))),
        ];

        for (uri, expected) in authority_cases {
            assert_eq!(
                PlRefPath::new(uri).strip_scheme_split_authority(),
                expected,
                "authority split of {uri:?}"
            );
        }

        let extended = PlRefPath::new(r#"\\?\C:\Windows\system32"#);
        assert_eq!(extended.as_str(), "C:/Windows/system32");
    }

    /// Joining onto cloud URIs, local paths and `file://` URIs.
    #[test]
    fn test_plpath_join() {
        // A right-hand side with its own scheme replaces the base entirely.
        let replaced = PlRefPath::new("s3://.../...").join("az://.../...");
        assert_eq!(replaced.as_str(), "az://.../...");

        // Relative right-hand sides are appended; trailing slashes on the base collapse.
        for base in ["s3://.../...", "s3://.../...//"] {
            let joined = PlRefPath::new(base).join("a=1/b=1/00000000.parquet");
            assert_eq!(joined.as_str(), "s3://.../.../a=1/b=1/00000000.parquet");
        }

        /// Checks `base.join(added) == expect` for the plain local path, and again with
        /// `base` formatted as a `file://` URI. Inputs use `/` and are converted to the
        /// platform separator before joining.
        fn check_join(base: &str, added: &str, expect: &str) {
            let sep = std::path::MAIN_SEPARATOR_STR;

            let expect = PlRefPath::new(expect);
            let base = base.replace('/', sep);
            let added = added.replace('/', sep);

            assert_eq!(PlRefPath::new(&base).join(&added), expect);

            let uri_base = format_file_uri(&base);

            // A rooted `added` replaces the URI base, so the result carries no scheme.
            let expect_uri = if added.starts_with(sep) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            assert_eq!(PlRefPath::new(uri_base.as_str()).join(added), expect_uri);
        }

        let cases = [
            ("a/b/c/", "d/e", "a/b/c/d/e"),
            ("a/b/c", "d/e", "a/b/c/d/e"),
            ("a/b/c", "d/e/", "a/b/c/d/e/"),
            ("a/b/c", "/d", "/d"),
            ("a/b/c", "/d/", "/d/"),
            ("", "/d/", "/d/"),
            ("/", "/d/", "/d/"),
            ("/x/y", "/d/", "/d/"),
            ("/x/y", "/d", "/d"),
            ("/x/y", "d", "/x/y/d"),
            ("/a/longer", "path", "/a/longer/path"),
            ("/a/longer", "/path", "/path"),
            ("/a/longer", "path/test", "/a/longer/path/test"),
            ("/a/longer", "/path/test", "/path/test"),
        ];

        for (base, added, expect) in cases {
            check_join(base, added, expect);
        }
    }

    /// `file_name` on cloud URIs, relative paths and empty inputs.
    #[test]
    fn test_plpath_name() {
        let dots = PlRefPath::new("s3://...");
        assert_eq!(dots.file_name(), Some("...".as_ref()));

        for path in ["a/b/file.parquet", "file.parquet"] {
            assert_eq!(
                PlRefPath::new(path).file_name(),
                Some("file.parquet".as_ref())
            );
        }

        for path in ["s3://", ""] {
            assert_eq!(PlRefPath::new(path).file_name(), None);
        }
    }
}