polars_core/chunked_array/logical/categorical/
string_cache.rs

1use std::hash::{BuildHasher, Hash, Hasher};
2use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
3use std::sync::{LazyLock, Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};
4
5use hashbrown::HashTable;
6use hashbrown::hash_table::Entry;
7use polars_utils::aliases::PlFixedStateQuality;
8use polars_utils::pl_str::PlSmallStr;
9
10use crate::hashing::_HASHMAP_INIT_SIZE;
11
12/// We use atomic reference counting to determine how many threads use the
13/// string cache. If the refcount is zero, we may clear the string cache.
14static STRING_CACHE_REFCOUNT: Mutex<u32> = Mutex::new(0);
15static STRING_CACHE_ENABLED_GLOBALLY: AtomicBool = AtomicBool::new(false);
16static STRING_CACHE_UUID_CTR: AtomicU32 = AtomicU32::new(0);
17
18/// Enable the global string cache as long as the object is alive ([RAII]).
19///
20/// # Examples
21///
22/// Enable the string cache by initializing the object:
23///
24/// ```
25/// use polars_core::StringCacheHolder;
26///
27/// let _sc = StringCacheHolder::hold();
28/// ```
29///
30/// The string cache is enabled until `handle` is dropped.
31///
32/// # De-allocation
33///
34/// Multiple threads can hold the string cache at the same time.
35/// The contents of the cache will only get dropped when no thread holds it.
36///
37/// [RAII]: https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization
38pub struct StringCacheHolder {
39    // only added so that it will never be constructed directly
40    #[allow(dead_code)]
41    private_zst: (),
42}
43
44impl Default for StringCacheHolder {
45    fn default() -> Self {
46        Self::hold()
47    }
48}
49
50impl StringCacheHolder {
51    /// Hold the StringCache
52    pub fn hold() -> StringCacheHolder {
53        increment_string_cache_refcount();
54        StringCacheHolder { private_zst: () }
55    }
56}
57
58impl Drop for StringCacheHolder {
59    fn drop(&mut self) {
60        decrement_string_cache_refcount();
61    }
62}
63
64fn increment_string_cache_refcount() {
65    let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
66    *refcount += 1;
67}
68fn decrement_string_cache_refcount() {
69    let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
70    *refcount -= 1;
71    if *refcount == 0 {
72        STRING_CACHE.clear()
73    }
74}
75
76/// Enable the global string cache.
77///
78/// [`Categorical`] columns created under the same global string cache have the
79/// same underlying physical value when string values are equal. This allows the
80/// columns to be concatenated or used in a join operation, for example.
81///
82/// Note that enabling the global string cache introduces some overhead.
83/// The amount of overhead depends on the number of categories in your data.
84/// It is advised to enable the global string cache only when strictly necessary.
85///
86/// [`Categorical`]: crate::datatypes::DataType::Categorical
87pub fn enable_string_cache() {
88    let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(true, Ordering::AcqRel);
89    if !was_enabled {
90        increment_string_cache_refcount();
91    }
92}
93
94/// Disable and clear the global string cache.
95///
96/// Note: Consider using [`StringCacheHolder`] for a more reliable way of
97/// enabling and disabling the string cache.
98pub fn disable_string_cache() {
99    let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(false, Ordering::AcqRel);
100    if was_enabled {
101        decrement_string_cache_refcount();
102    }
103}
104
105/// Check whether the global string cache is enabled.
106pub fn using_string_cache() -> bool {
107    let refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
108    *refcount > 0
109}
110
111// This is the hash and the Index offset in the linear buffer
112#[derive(Copy, Clone)]
113struct Key {
114    pub(super) hash: u64,
115    pub(super) idx: u32,
116}
117
118impl Key {
119    #[inline]
120    pub(super) fn new(hash: u64, idx: u32) -> Self {
121        Self { hash, idx }
122    }
123}
124
125impl Hash for Key {
126    #[inline]
127    fn hash<H: Hasher>(&self, state: &mut H) {
128        state.write_u64(self.hash)
129    }
130}
131
132pub(crate) struct SCacheInner {
133    map: HashTable<Key>,
134    pub(crate) uuid: u32,
135    payloads: Vec<PlSmallStr>,
136}
137
138impl SCacheInner {
139    #[inline]
140    pub(crate) unsafe fn get_unchecked(&self, cat: u32) -> &str {
141        self.payloads.get_unchecked(cat as usize).as_str()
142    }
143
144    pub(crate) fn len(&self) -> usize {
145        self.map.len()
146    }
147
148    #[inline]
149    pub(crate) fn insert_from_hash(&mut self, h: u64, s: &str) -> u32 {
150        let mut global_idx = self.payloads.len() as u32;
151        let entry = self.map.entry(
152            h,
153            |k| {
154                let value = unsafe { self.payloads.get_unchecked(k.idx as usize) };
155                s == value.as_str()
156            },
157            |k| k.hash,
158        );
159
160        match entry {
161            Entry::Occupied(entry) => {
162                global_idx = entry.get().idx;
163            },
164            Entry::Vacant(entry) => {
165                let idx = self.payloads.len() as u32;
166                let key = Key::new(h, idx);
167                entry.insert(key);
168                self.payloads.push(PlSmallStr::from_str(s));
169            },
170        }
171        global_idx
172    }
173
174    #[inline]
175    pub(crate) fn get_cat(&self, s: &str) -> Option<u32> {
176        let h = StringCache::get_hash_builder().hash_one(s);
177        self.map
178            .find(h, |k| {
179                let value = unsafe { self.payloads.get_unchecked(k.idx as usize) };
180                s == value.as_str()
181            })
182            .map(|k| k.idx)
183    }
184
185    #[inline]
186    pub(crate) fn insert(&mut self, s: &str) -> u32 {
187        let h = StringCache::get_hash_builder().hash_one(s);
188        self.insert_from_hash(h, s)
189    }
190
191    #[inline]
192    pub(crate) fn get_current_payloads(&self) -> &[PlSmallStr] {
193        &self.payloads
194    }
195}
196
197impl Default for SCacheInner {
198    fn default() -> Self {
199        Self {
200            map: HashTable::with_capacity(_HASHMAP_INIT_SIZE),
201            uuid: STRING_CACHE_UUID_CTR.fetch_add(1, Ordering::AcqRel),
202            payloads: Vec::with_capacity(_HASHMAP_INIT_SIZE),
203        }
204    }
205}
206
207/// Used by categorical data that need to share global categories.
208/// In *eager* you need to specifically toggle global string cache to have a global effect.
209/// In *lazy* it is toggled on at the start of a computation run and turned of (deleted) when a
210/// result is produced.
211#[derive(Default)]
212pub(crate) struct StringCache(pub(crate) RwLock<SCacheInner>);
213
214impl StringCache {
215    /// The global `StringCache` will always use a predictable seed. This allows local builders to mimic
216    /// the hashes in case of contention.
217    #[inline]
218    pub(crate) fn get_hash_builder() -> PlFixedStateQuality {
219        PlFixedStateQuality::with_seed(0)
220    }
221
222    pub(crate) fn active_cache_id() -> u32 {
223        STRING_CACHE_UUID_CTR
224            .load(Ordering::Relaxed)
225            .wrapping_sub(1)
226    }
227
228    /// Lock the string cache
229    pub(crate) fn lock_map(&self) -> RwLockWriteGuard<SCacheInner> {
230        self.0.write().unwrap()
231    }
232
233    pub(crate) fn read_map(&self) -> RwLockReadGuard<SCacheInner> {
234        self.0.read().unwrap()
235    }
236
237    pub(crate) fn clear(&self) {
238        let mut lock = self.lock_map();
239        *lock = Default::default();
240    }
241
242    pub(crate) fn apply<F, T>(&self, fun: F) -> (u32, T)
243    where
244        F: FnOnce(&mut RwLockWriteGuard<SCacheInner>) -> T,
245    {
246        let cache = &mut crate::STRING_CACHE.lock_map();
247
248        let result = fun(cache);
249
250        if cache.len() > u32::MAX as usize {
251            panic!("not more than {} categories supported", u32::MAX)
252        };
253
254        (cache.uuid, result)
255    }
256}
257
258pub(crate) static STRING_CACHE: LazyLock<StringCache> = LazyLock::new(Default::default);