polars_core/chunked_array/logical/categorical/string_cache.rs

use std::hash::{BuildHasher, Hash, Hasher};
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::{LazyLock, Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard};

use hashbrown::HashTable;
use hashbrown::hash_table::Entry;
use polars_utils::aliases::PlFixedStateQuality;
use polars_utils::pl_str::PlSmallStr;

use crate::hashing::_HASHMAP_INIT_SIZE;

static STRING_CACHE_REFCOUNT: Mutex<u32> = Mutex::new(0);
static STRING_CACHE_ENABLED_GLOBALLY: AtomicBool = AtomicBool::new(false);
static STRING_CACHE_UUID_CTR: AtomicU32 = AtomicU32::new(0);

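/// RAII guard for the global string cache: the cache stays active while at
/// least one holder is alive, and the refcount is decremented (clearing the
/// cache when it reaches zero) when the holder is dropped.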
pub struct StringCacheHolder {
    // Private ZST so the guard can only be constructed via `hold()`.
    #[allow(dead_code)]
    private_zst: (),
}

impl Default for StringCacheHolder {
    fn default() -> Self {
        Self::hold()
    }
}

impl StringCacheHolder {
    /// Hold the string cache, keeping it enabled until the guard is dropped.
    pub fn hold() -> StringCacheHolder {
        increment_string_cache_refcount();
        StringCacheHolder { private_zst: () }
    }
}

impl Drop for StringCacheHolder {
    fn drop(&mut self) {
        decrement_string_cache_refcount();
    }
}

fn increment_string_cache_refcount() {
    let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
    *refcount += 1;
}

fn decrement_string_cache_refcount() {
    let mut refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
    *refcount -= 1;
    // The last user went away: drop all interned strings.
    if *refcount == 0 {
        STRING_CACHE.clear()
    }
}

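/// Enable the global string cache for the whole process. This takes one
/// reference on the cache, so it stays enabled until `disable_string_cache`
/// is called, independently of any `StringCacheHolder` guards.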
pub fn enable_string_cache() {
    let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(true, Ordering::AcqRel);
    if !was_enabled {
        increment_string_cache_refcount();
    }
}

/// Disable the global string cache. Guards that are still alive keep the
/// cache active until they are dropped.
pub fn disable_string_cache() {
    let was_enabled = STRING_CACHE_ENABLED_GLOBALLY.swap(false, Ordering::AcqRel);
    if was_enabled {
        decrement_string_cache_refcount();
    }
}

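/// Check whether the string cache is currently active, either because it is
/// held by at least one guard or because it was enabled globally.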
pub fn using_string_cache() -> bool {
    let refcount = STRING_CACHE_REFCOUNT.lock().unwrap();
    *refcount > 0
}

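/// Entry of the intern table: the precomputed hash of a string and its index
/// into `payloads`. The table stores only these fixed-size keys; the strings
/// themselves live in the payload vector.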
#[derive(Copy, Clone)]
struct Key {
    pub(super) hash: u64,
    pub(super) idx: u32,
}

impl Key {
    #[inline]
    pub(super) fn new(hash: u64, idx: u32) -> Self {
        Self { hash, idx }
    }
}

impl Hash for Key {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        // The hash is precomputed, so just feed it through.
        state.write_u64(self.hash)
    }
}

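/// The mutable state behind the `RwLock` of `StringCache`: the intern table,
/// a uuid identifying this generation of the cache, and the interned strings,
/// indexed by their categorical code.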
pub(crate) struct SCacheInner {
    map: HashTable<Key>,
    pub(crate) uuid: u32,
    payloads: Vec<PlSmallStr>,
}

impl SCacheInner {
    /// Get the string behind a category.
    ///
    /// # Safety
    /// `cat` must be a valid index into `payloads`.
    #[inline]
    pub(crate) unsafe fn get_unchecked(&self, cat: u32) -> &str {
        self.payloads.get_unchecked(cat as usize).as_str()
    }

    pub(crate) fn len(&self) -> usize {
        self.map.len()
    }

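    /// Intern `s` under the precomputed hash `h` and return its category.
    /// An existing entry is reused; otherwise `s` is appended to `payloads`
    /// and its index becomes the new category.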
    #[inline]
    pub(crate) fn insert_from_hash(&mut self, h: u64, s: &str) -> u32 {
        let mut global_idx = self.payloads.len() as u32;
        // Look up by the precomputed hash; equality compares against the
        // payload string, as the table itself stores only `Key`s.
        let entry = self.map.entry(
            h,
            |k| {
                let value = unsafe { self.payloads.get_unchecked(k.idx as usize) };
                s == value.as_str()
            },
            |k| k.hash,
        );

        match entry {
            Entry::Occupied(entry) => {
                global_idx = entry.get().idx;
            },
            Entry::Vacant(entry) => {
                let idx = self.payloads.len() as u32;
                let key = Key::new(h, idx);
                entry.insert(key);
                self.payloads.push(PlSmallStr::from_str(s));
            },
        }
        global_idx
    }

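    /// Look up the category for `s` without inserting; returns `None` if the
    /// string has not been interned into this cache.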
    #[inline]
    pub(crate) fn get_cat(&self, s: &str) -> Option<u32> {
        let h = StringCache::get_hash_builder().hash_one(s);
        self.map
            .find(h, |k| {
                let value = unsafe { self.payloads.get_unchecked(k.idx as usize) };
                s == value.as_str()
            })
            .map(|k| k.idx)
    }

    #[inline]
    pub(crate) fn insert(&mut self, s: &str) -> u32 {
        let h = StringCache::get_hash_builder().hash_one(s);
        self.insert_from_hash(h, s)
    }

    #[inline]
    pub(crate) fn get_current_payloads(&self) -> &[PlSmallStr] {
        &self.payloads
    }
}

impl Default for SCacheInner {
    fn default() -> Self {
        Self {
            map: HashTable::with_capacity(_HASHMAP_INIT_SIZE),
            // Every new cache generation gets a fresh uuid from the global
            // counter.
            uuid: STRING_CACHE_UUID_CTR.fetch_add(1, Ordering::AcqRel),
            payloads: Vec::with_capacity(_HASHMAP_INIT_SIZE),
        }
    }
}

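/// Process-global cache that interns strings and maps them to `u32`
/// categories, so categorical data built under the same cache generation can
/// be compared by code alone.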
#[derive(Default)]
pub(crate) struct StringCache(pub(crate) RwLock<SCacheInner>);

impl StringCache {
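    /// The hash builder used for interning. It is seeded with a constant so
    /// that string hashes are reproducible across cache generations.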
    #[inline]
    pub(crate) fn get_hash_builder() -> PlFixedStateQuality {
        PlFixedStateQuality::with_seed(0)
    }

    /// The uuid of the most recently created cache generation.
    pub(crate) fn active_cache_id() -> u32 {
        STRING_CACHE_UUID_CTR
            .load(Ordering::Relaxed)
            .wrapping_sub(1)
    }

    pub(crate) fn lock_map(&self) -> RwLockWriteGuard<SCacheInner> {
        self.0.write().unwrap()
    }

    pub(crate) fn read_map(&self) -> RwLockReadGuard<SCacheInner> {
        self.0.read().unwrap()
    }

    pub(crate) fn clear(&self) {
        let mut lock = self.lock_map();
        // Replacing the inner state also bumps the uuid via `Default`.
        *lock = Default::default();
    }

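    /// Run `fun` under the write lock and return the cache uuid together with
    /// the result, so callers can tag the produced categories with the cache
    /// generation they belong to.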
    pub(crate) fn apply<F, T>(&self, fun: F) -> (u32, T)
    where
        F: FnOnce(&mut RwLockWriteGuard<SCacheInner>) -> T,
    {
        let cache = &mut self.lock_map();

        let result = fun(cache);

        if cache.len() > u32::MAX as usize {
            panic!("at most {} categories are supported", u32::MAX)
        }

        (cache.uuid, result)
    }
}

pub(crate) static STRING_CACHE: LazyLock<StringCache> = LazyLock::new(Default::default);
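
// A minimal usage sketch (the test module and its name are illustrative
// additions, not part of the original file): while a `StringCacheHolder`
// guard is alive, interning the same string twice yields the same category
// and the same cache uuid.
#[cfg(test)]
mod string_cache_sketch {
    use super::*;

    #[test]
    fn interning_is_idempotent_while_held() {
        let _guard = StringCacheHolder::hold();
        assert!(using_string_cache());

        let (uuid_a, cat_a) = STRING_CACHE.apply(|cache| cache.insert("foo"));
        let (uuid_b, cat_b) = STRING_CACHE.apply(|cache| cache.insert("foo"));

        // Same string, same cache generation -> same category.
        assert_eq!(cat_a, cat_b);
        assert_eq!(uuid_a, uuid_b);
    }
}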