polars_core/chunked_array/logical/categorical/
revmap.rs

1#![allow(unsafe_op_in_unsafe_fn)]
2use std::fmt::{Debug, Formatter};
3use std::hash::{BuildHasher, Hash, Hasher};
4
5use arrow::array::*;
6use polars_utils::aliases::PlFixedStateQuality;
7
8use crate::datatypes::PlHashMap;
9use crate::{StringCache, using_string_cache};
10
11#[derive(Clone)]
12pub enum RevMapping {
13    /// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
14    /// Utf8Array: caches the string values
15    Global(PlHashMap<u32, u32>, Utf8ViewArray, u32),
16    /// Utf8Array: caches the string values and a hash of all values for quick comparison
17    Local(Utf8ViewArray, u128),
18}
19
20impl Debug for RevMapping {
21    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
22        match self {
23            RevMapping::Global(_, _, _) => {
24                write!(f, "global")
25            },
26            RevMapping::Local(_, _) => {
27                write!(f, "local")
28            },
29        }
30    }
31}
32
33impl Default for RevMapping {
34    fn default() -> Self {
35        let slice: &[Option<&str>] = &[];
36        let cats = Utf8ViewArray::from_slice(slice);
37        if using_string_cache() {
38            let cache = &mut crate::STRING_CACHE.lock_map();
39            let id = cache.uuid;
40            RevMapping::Global(Default::default(), cats, id)
41        } else {
42            RevMapping::build_local(cats)
43        }
44    }
45}
46
47#[allow(clippy::len_without_is_empty)]
48impl RevMapping {
49    pub fn is_active_global(&self) -> bool {
50        match self {
51            Self::Global(_, _, id) => *id == StringCache::active_cache_id(),
52            _ => false,
53        }
54    }
55
56    pub fn is_global(&self) -> bool {
57        matches!(self, Self::Global(_, _, _))
58    }
59
60    pub fn is_local(&self) -> bool {
61        matches!(self, Self::Local(_, _))
62    }
63
64    /// Get the categories in this [`RevMapping`]
65    pub fn get_categories(&self) -> &Utf8ViewArray {
66        match self {
67            Self::Global(_, a, _) => a,
68            Self::Local(a, _) => a,
69        }
70    }
71
72    fn build_hash(categories: &Utf8ViewArray) -> u128 {
73        // TODO! we must also validate the cases of duplicates!
74        let mut hb = PlFixedStateQuality::with_seed(0).build_hasher();
75        categories.values_iter().for_each(|val| {
76            val.hash(&mut hb);
77        });
78        let hash = hb.finish();
79        ((hash as u128) << 64) | (categories.total_buffer_len() as u128)
80    }
81
82    pub fn build_local(categories: Utf8ViewArray) -> Self {
83        debug_assert_eq!(categories.null_count(), 0);
84        let hash = Self::build_hash(&categories);
85        Self::Local(categories, hash)
86    }
87
88    /// Get the length of the [`RevMapping`]
89    pub fn len(&self) -> usize {
90        self.get_categories().len()
91    }
92
93    /// [`Categorical`] to [`str`]
94    ///
95    /// [`Categorical`]: crate::datatypes::DataType::Categorical
96    pub fn get(&self, idx: u32) -> &str {
97        match self {
98            Self::Global(map, a, _) => {
99                let idx = *map.get(&idx).unwrap();
100                a.value(idx as usize)
101            },
102            Self::Local(a, _) => a.value(idx as usize),
103        }
104    }
105
106    pub fn get_optional(&self, idx: u32) -> Option<&str> {
107        match self {
108            Self::Global(map, a, _) => {
109                let idx = *map.get(&idx)?;
110                a.get(idx as usize)
111            },
112            Self::Local(a, _) => a.get(idx as usize),
113        }
114    }
115
116    /// [`Categorical`] to [`str`]
117    ///
118    /// [`Categorical`]: crate::datatypes::DataType::Categorical
119    ///
120    /// # Safety
121    /// This doesn't do any bound checking
122    pub(crate) unsafe fn get_unchecked(&self, idx: u32) -> &str {
123        match self {
124            Self::Global(map, a, _) => {
125                let idx = *map.get(&idx).unwrap();
126                a.value_unchecked(idx as usize)
127            },
128            Self::Local(a, _) => a.value_unchecked(idx as usize),
129        }
130    }
131    /// Check if the categoricals have a compatible mapping
132    #[inline]
133    pub fn same_src(&self, other: &Self) -> bool {
134        match (self, other) {
135            (RevMapping::Global(_, _, l), RevMapping::Global(_, _, r)) => *l == *r,
136            (RevMapping::Local(_, l_hash), RevMapping::Local(_, r_hash)) => l_hash == r_hash,
137            _ => false,
138        }
139    }
140
141    /// [`str`] to [`Categorical`]
142    ///
143    ///
144    /// [`Categorical`]: crate::datatypes::DataType::Categorical
145    pub fn find(&self, value: &str) -> Option<u32> {
146        match self {
147            Self::Global(rev_map, a, id) => {
148                // fast path is check
149                if using_string_cache() {
150                    let map = crate::STRING_CACHE.read_map();
151                    if map.uuid == *id {
152                        return map.get_cat(value);
153                    }
154                }
155                rev_map
156                    .iter()
157                    // SAFETY:
158                    // value is always within bounds
159                    .find(|&(_k, &v)| (unsafe { a.value_unchecked(v as usize) } == value))
160                    .map(|(k, _v)| *k)
161            },
162
163            Self::Local(a, _) => {
164                // SAFETY: within bounds
165                unsafe { (0..a.len()).find(|idx| a.value_unchecked(*idx) == value) }
166                    .map(|idx| idx as u32)
167            },
168        }
169    }
170}