polars_utils/
cardinality_sketch.rs1use crate::algebraic_ops::alg_add_f64;
2
3fn inv_pow2(n: u8) -> f64 {
5 let base = f64::to_bits(1.0);
6 f64::from_bits(base - ((n as u64) << 52))
7}
8
9#[derive(Clone)]
20pub struct CardinalitySketch {
21 buckets: [u8; 256],
22}
23
24impl Default for CardinalitySketch {
25 fn default() -> Self {
26 Self::new()
27 }
28}
29
30impl CardinalitySketch {
31 pub fn new() -> Self {
32 Self {
33 buckets: [0u8; 256],
34 }
35 }
36
37 pub fn insert(&mut self, mut h: u64) {
39 const ARBITRARY_ODD: u64 = 0x902813a5785dc787;
40 h = h.wrapping_mul(ARBITRARY_ODD);
43 let idx = (h >> 56) as usize;
44 let p = 1 + (h << 8).leading_zeros() as u8;
45 self.buckets[idx] = self.buckets[idx].max(p);
46 }
47
48 pub fn combine(&mut self, other: &CardinalitySketch) {
49 self.buckets = std::array::from_fn(|i| std::cmp::max(self.buckets[i], other.buckets[i]));
50 }
51
52 pub fn estimate(&self) -> usize {
53 let m = 256.0;
54 let alpha_m = 0.7123 / (1.0 + 1.079 / m);
55
56 let mut sum = 0.0;
57 let mut num_zero = 0;
58 for x in self.buckets.iter() {
59 sum = alg_add_f64(sum, inv_pow2(*x));
60 num_zero += (*x == 0) as usize;
61 }
62
63 let est = (alpha_m * m * m) / sum;
64 let corr_est = if est <= 5.0 / 2.0 * m && num_zero != 0 {
65 m * (m as f32 / num_zero as f32).ln() as f64
67 } else {
68 est
69 };
70
71 if num_zero == self.buckets.len() {
72 0
73 } else {
74 ((corr_est + 0.5) as usize).max(1)
75 }
76 }
77}