polars_core/chunked_array/logical/enum_/
mod.rs

1use std::sync::Arc;
2
3use arrow::array::UInt32Vec;
4use arrow::bitmap::MutableBitmap;
5use polars_error::{PolarsResult, polars_bail, polars_err};
6use polars_utils::aliases::{InitHashMaps, PlHashMap};
7use polars_utils::pl_str::PlSmallStr;
8
9use super::{CategoricalChunked, CategoricalOrdering, DataType, Field, RevMapping, UInt32Chunked};
10
11pub struct EnumChunkedBuilder {
12    name: PlSmallStr,
13    enum_builder: UInt32Vec,
14
15    rev: Arc<RevMapping>,
16    ordering: CategoricalOrdering,
17    seen: MutableBitmap,
18
19    // Mapping to amortize the costs of lookups.
20    mapping: PlHashMap<PlSmallStr, u32>,
21    strict: bool,
22}
23
24impl EnumChunkedBuilder {
25    pub fn new(
26        name: PlSmallStr,
27        capacity: usize,
28        rev: Arc<RevMapping>,
29        ordering: CategoricalOrdering,
30        strict: bool,
31    ) -> Self {
32        let seen = MutableBitmap::from_len_zeroed(rev.len());
33
34        Self {
35            name,
36            enum_builder: UInt32Vec::with_capacity(capacity),
37
38            rev,
39            ordering,
40            seen,
41
42            mapping: PlHashMap::new(),
43            strict,
44        }
45    }
46
47    pub fn append_str(&mut self, v: &str) -> PolarsResult<&mut Self> {
48        match self.mapping.get(v) {
49            Some(v) => self.enum_builder.push(Some(*v)),
50            None => {
51                let Some(iv) = self.rev.find(v) else {
52                    if self.strict {
53                        polars_bail!(InvalidOperation: "cannot append '{v}' to enum without that variant");
54                    } else {
55                        self.enum_builder.push(None);
56                        return Ok(self);
57                    }
58                };
59                self.seen.set(iv as usize, true);
60                self.mapping.insert(v.into(), iv);
61                self.enum_builder.push(Some(iv));
62            },
63        }
64
65        Ok(self)
66    }
67
68    pub fn append_null(&mut self) -> &mut Self {
69        self.enum_builder.push(None);
70        self
71    }
72
73    pub fn append_enum(&mut self, v: u32, rev: &RevMapping) -> PolarsResult<&mut Self> {
74        if !self.rev.same_src(rev) {
75            if self.strict {
76                return Err(polars_err!(ComputeError: "incompatible enum types"));
77            } else {
78                self.enum_builder.push(None);
79            }
80        } else {
81            self.seen.set(v as usize, true);
82            self.enum_builder.push(Some(v));
83        }
84
85        Ok(self)
86    }
87
88    pub fn finish(self) -> CategoricalChunked {
89        let arr = self.enum_builder.freeze();
90        let null_count = arr.validity().map_or(0, |a| a.unset_bits());
91        let length = arr.len();
92        let ca = unsafe {
93            UInt32Chunked::new_with_dims(
94                Arc::new(Field::new(self.name, DataType::UInt32)),
95                vec![Box::new(arr)],
96                length,
97                null_count,
98            )
99        };
100        // Fast Unique <=> unique(rev) == unique(ca)
101        let fast_unique = !ca.has_nulls() && self.seen.unset_bits() == 0;
102
103        // SAFETY: keys and values are in bounds
104        unsafe {
105            CategoricalChunked::from_cats_and_rev_map_unchecked(ca, self.rev, true, self.ordering)
106                .with_fast_unique(fast_unique)
107        }
108    }
109}