// Glob expansion helpers and the `LazyFileListReader` trait for lazily scanning one or many files.
use std::path::{Path, PathBuf};
use polars_core::error::to_compute_err;
use polars_core::prelude::*;
use polars_io::cloud::CloudOptions;
use polars_io::utils::is_cloud_url;
use polars_io::RowIndex;
use polars_plan::prelude::UnionArgs;
use crate::prelude::*;
/// Boxed iterator over the paths a reader should scan.
///
/// Each item is either a resolved [`PathBuf`] or the error raised while resolving it.
pub type PathIterator = Box<dyn Iterator<Item = PolarsResult<PathBuf>>>;
/// Expands the glob `pattern` into an iterator over the matching paths.
///
/// Patterns recognised by `is_cloud_url` are listed through `polars_io::async_glob`,
/// which is only compiled in when the `async` feature is enabled. Every other pattern
/// is expanded with `glob::glob`.
///
/// # Errors
///
/// Returns a `ComputeError` when
/// * `pattern` is a cloud URL but the `async` feature is disabled, or
/// * `pattern` is not a valid glob pattern.
///
/// Items of the returned iterator may themselves be errors; failures reported by
/// `glob` while walking are converted with `to_compute_err`.
// cloud_options is used only with async feature
#[allow(unused_variables)]
fn polars_glob(pattern: &str, cloud_options: Option<&CloudOptions>) -> PolarsResult<PathIterator> {
    if is_cloud_url(pattern) {
        #[cfg(feature = "async")]
        {
            let paths = polars_io::async_glob(pattern, cloud_options)?;
            Ok(Box::new(paths.into_iter().map(|a| Ok(PathBuf::from(&a)))))
        }
        // The function already returns a `PolarsResult`, so report the missing
        // feature as an error instead of aborting the caller with a panic.
        #[cfg(not(feature = "async"))]
        {
            Err(polars_err!(
                ComputeError: "Feature `async` must be enabled to use globbing patterns with cloud urls."
            ))
        }
    } else {
        // Keep the underlying pattern error so the user can see what is wrong.
        let paths = glob::glob(pattern)
            .map_err(|e| polars_err!(ComputeError: "invalid glob pattern given: {}", e))?;
        Ok(Box::new(paths.map(|entry| entry.map_err(to_compute_err))))
    }
}
/// Reads [LazyFrame] from a filesystem or a cloud storage.
/// Supports glob patterns.
///
/// Use [LazyFileListReader::finish] to get the final [LazyFrame].
pub trait LazyFileListReader: Clone {
/// Get the final [LazyFrame].
fn finish(self) -> PolarsResult<LazyFrame> {
if !self.glob() {
return self.finish_no_glob();
}
if let Some(paths) = self.iter_paths()? {
let lfs = paths
.map(|r| {
let path = r?;
self.clone()
// Each individual reader should not apply a row limit.
.with_n_rows(None)
// Each individual reader should not apply a row index.
.with_row_index(None)
.with_path(path.clone())
.with_rechunk(false)
.finish_no_glob()
.map_err(|e| {
polars_err!(
ComputeError: "error while reading {}: {}", path.display(), e
)
})
})
.collect::<PolarsResult<Vec<_>>>()?;
polars_ensure!(
!lfs.is_empty(),
ComputeError: "no matching files found in {}", self.path().display()
);
let mut lf = self.concat_impl(lfs)?;
if let Some(n_rows) = self.n_rows() {
lf = lf.slice(0, n_rows as IdxSize)
};
if let Some(rc) = self.row_index() {
lf = lf.with_row_index(&rc.name, Some(rc.offset))
};
Ok(lf)
} else {
self.finish_no_glob()
}
}
/// Recommended concatenation of [LazyFrame]s from many input files.
///
/// This method should not take into consideration [LazyFileListReader::n_rows]
/// nor [LazyFileListReader::row_index].
fn concat_impl(&self, lfs: Vec<LazyFrame>) -> PolarsResult<LazyFrame> {
let args = UnionArgs {
rechunk: self.rechunk(),
parallel: true,
to_supertypes: false,
from_partitioned_ds: true,
..Default::default()
};
concat_impl(&lfs, args)
}
/// Get the final [LazyFrame].
/// This method assumes, that path is *not* a glob.
///
/// It is recommended to always use [LazyFileListReader::finish] method.
fn finish_no_glob(self) -> PolarsResult<LazyFrame>;
fn glob(&self) -> bool {
true
}
/// Path of the scanned file.
/// It can be potentially a glob pattern.
fn path(&self) -> &Path;
fn paths(&self) -> &[PathBuf];
/// Set path of the scanned file.
/// Support glob patterns.
#[must_use]
fn with_path(self, path: PathBuf) -> Self;
/// Set paths of the scanned files.
/// Doesn't glob patterns.
#[must_use]
fn with_paths(self, paths: Arc<[PathBuf]>) -> Self;
/// Configure the row limit.
fn with_n_rows(self, n_rows: impl Into<Option<usize>>) -> Self;
/// Configure the row index.
fn with_row_index(self, row_index: impl Into<Option<RowIndex>>) -> Self;
/// Rechunk the memory to contiguous chunks when parsing is done.
fn rechunk(&self) -> bool;
/// Rechunk the memory to contiguous chunks when parsing is done.
#[must_use]
fn with_rechunk(self, toggle: bool) -> Self;
/// Try to stop parsing when `n` rows are parsed. During multithreaded parsing the upper bound `n` cannot
/// be guaranteed.
fn n_rows(&self) -> Option<usize>;
/// Add a row index column.
fn row_index(&self) -> Option<&RowIndex>;
/// [CloudOptions] used to list files.
fn cloud_options(&self) -> Option<&CloudOptions> {
None
}
/// Get list of files referenced by this reader.
///
/// Returns [None] if path is not a glob pattern.
fn iter_paths(&self) -> PolarsResult<Option<PathIterator>> {
let paths = self.paths();
if paths.is_empty() {
let path_str = self.path().to_string_lossy();
if path_str.contains('*') || path_str.contains('?') || path_str.contains('[') {
polars_glob(&path_str, self.cloud_options()).map(Some)
} else {
Ok(None)
}
} else {
polars_ensure!(self.path().to_string_lossy() == "", InvalidOperation: "expected only a single path argument");
// Lint is incorrect as we need static lifetime.
#[allow(clippy::unnecessary_to_owned)]
Ok(Some(Box::new(paths.to_vec().into_iter().map(Ok))))
}
}
}