polars_utils/
regex_cache.rs

1use std::cell::RefCell;
2
3use regex::{Regex, RegexBuilder};
4
5use crate::cache::LruCache;
6
/// Reads the regex size limit from the `POLARS_REGEX_SIZE_LIMIT` environment
/// variable.
///
/// Returns `None` when the variable is unset, is not valid Unicode, or is the
/// empty string; otherwise returns the value parsed as a `usize`.
///
/// # Panics
///
/// Panics if the variable is set to a non-empty value that does not parse as
/// a `usize`.
fn get_size_limit() -> Option<usize> {
    let raw = match std::env::var("POLARS_REGEX_SIZE_LIMIT") {
        Ok(value) if !value.is_empty() => value,
        // Unset, unreadable, or empty: no explicit limit was requested.
        _ => return None,
    };
    let limit = raw
        .parse::<usize>()
        .expect("invalid POLARS_REGEX_SIZE_LIMIT");
    Some(limit)
}
16
17// Regex compilation is really heavy, and the resulting regexes can be large as
18// well, so we should have a good caching scheme.
19//
20// TODO: add larger global cache which has time-based flush.
21
/// A cache for compiled regular expressions.
///
/// Compiled regexes are stored in an LRU cache keyed by their pattern string,
/// together with the size limit that is applied when compiling new patterns.
pub struct RegexCache {
    /// Compiled regexes, keyed by the pattern they were built from.
    cache: LruCache<String, Regex>,
    /// Size limit handed to `RegexBuilder::size_limit` when compiling; `None`
    /// leaves the builder's default untouched. Initialised from
    /// `POLARS_REGEX_SIZE_LIMIT` and re-read from the environment when a
    /// compilation fails with `CompiledTooBig`.
    size_limit: Option<usize>,
}
27
28impl RegexCache {
29    fn new() -> Self {
30        Self {
31            cache: LruCache::with_capacity(32),
32            size_limit: get_size_limit(),
33        }
34    }
35
36    pub fn compile(&mut self, re: &str) -> Result<&Regex, regex::Error> {
37        let r = self.cache.try_get_or_insert_with(re, |re| {
38            // We do this little loop to only check POLARS_REGEX_SIZE_LIMIT when
39            // a regex fails to compile due to the size limit.
40            loop {
41                let mut builder = RegexBuilder::new(re);
42                if let Some(bytes) = self.size_limit {
43                    builder.size_limit(bytes);
44                }
45                match builder.build() {
46                    err @ Err(regex::Error::CompiledTooBig(_)) => {
47                        let new_size_limit = get_size_limit();
48                        if new_size_limit != self.size_limit {
49                            self.size_limit = new_size_limit;
50                            continue; // Try to compile again.
51                        }
52                        break err;
53                    },
54                    r => break r,
55                };
56            }
57        });
58        Ok(&*r?)
59    }
60}
61
// Per-thread regex cache. Every thread gets its own `RegexCache`, built with
// `RegexCache::new()`, and accesses it through `RefCell` borrows.
thread_local! {
    static LOCAL_REGEX_CACHE: RefCell<RegexCache> = RefCell::new(RegexCache::new());
}
65
66pub fn compile_regex(re: &str) -> Result<Regex, regex::Error> {
67    LOCAL_REGEX_CACHE.with_borrow_mut(|cache| cache.compile(re).cloned())
68}
69
70pub fn with_regex_cache<R, F: FnOnce(&mut RegexCache) -> R>(f: F) -> R {
71    LOCAL_REGEX_CACHE.with_borrow_mut(f)
72}
73
/// Declares one or more lazily-compiled `static` regexes.
///
/// Each `VIS static NAME = PATTERN;` entry expands to a
/// `static NAME: std::sync::LazyLock<regex::Regex>` whose initializer calls
/// `regex::Regex::new(PATTERN).unwrap()`. The regex is therefore built on
/// first access, and an invalid pattern panics at that point. These statics
/// do not go through the thread-local `RegexCache`.
///
/// ```ignore
/// cached_regex! {
///     pub static DIGITS = r"\d+";
///     static WORD = r"\w+";
/// }
/// ```
#[macro_export]
macro_rules! cached_regex {
    // Base case: no entries left to declare.
    () => {};

    // Declare a single static, then recurse on the remaining entries.
    ($vis:vis static $name:ident = $regex:expr; $($rest:tt)*) => {
        // NOTE(review): presumably `Regex::new` is otherwise forbidden by the
        // project's clippy configuration — confirm.
        #[allow(clippy::disallowed_methods)]
        $vis static $name: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| regex::Regex::new($regex).unwrap());
        $crate::regex_cache::cached_regex!($($rest)*);
    };
}
// Re-export so the macro can also be named by path as `regex_cache::cached_regex`,
// which is the path the recursive expansion above uses.
pub use cached_regex;