polars_utils/
cardinality_sketch.rs

1use crate::algebraic_ops::alg_add_f64;
2
3// Computes 2^-n by directly subtracting from the IEEE754 double exponent.
4fn inv_pow2(n: u8) -> f64 {
5    let base = f64::to_bits(1.0);
6    f64::from_bits(base - ((n as u64) << 52))
7}
8
9/// HyperLogLog in Practice: Algorithmic Engineering of
10/// a State of The Art Cardinality Estimation Algorithm
11/// Stefan Heule, Marc Nunkesser, Alexander Hall
12///
13/// We use m = 256 which gives a relative error of ~6.5% of the cardinality
14/// estimate. We don't bother with stuffing the counts in 6 bits, byte access is
15/// fast.
16///
17/// The bias correction described in the paper is not implemented, so this is
18/// somewhere in between HyperLogLog and HyperLogLog++.
19#[derive(Clone)]
20pub struct CardinalitySketch {
21    buckets: [u8; 256],
22}
23
24impl Default for CardinalitySketch {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl CardinalitySketch {
31    pub fn new() -> Self {
32        Self {
33            buckets: [0u8; 256],
34        }
35    }
36
37    /// Add a new hash to the sketch.
38    pub fn insert(&mut self, mut h: u64) {
39        const ARBITRARY_ODD: u64 = 0x902813a5785dc787;
40        // We multiply by this arbitrarily chosen odd number and then take the
41        // top bits to ensure the sketch is influenced by all bits of the hash.
42        h = h.wrapping_mul(ARBITRARY_ODD);
43        let idx = (h >> 56) as usize;
44        let p = 1 + (h << 8).leading_zeros() as u8;
45        self.buckets[idx] = self.buckets[idx].max(p);
46    }
47
48    pub fn combine(&mut self, other: &CardinalitySketch) {
49        self.buckets = std::array::from_fn(|i| std::cmp::max(self.buckets[i], other.buckets[i]));
50    }
51
52    pub fn estimate(&self) -> usize {
53        let m = 256.0;
54        let alpha_m = 0.7123 / (1.0 + 1.079 / m);
55
56        let mut sum = 0.0;
57        let mut num_zero = 0;
58        for x in self.buckets.iter() {
59            sum = alg_add_f64(sum, inv_pow2(*x));
60            num_zero += (*x == 0) as usize;
61        }
62
63        let est = (alpha_m * m * m) / sum;
64        let corr_est = if est <= 5.0 / 2.0 * m && num_zero != 0 {
65            // Small cardinality estimate, full 64-bit logarithm is overkill.
66            m * (m as f32 / num_zero as f32).ln() as f64
67        } else {
68            est
69        };
70
71        if num_zero == self.buckets.len() {
72            0
73        } else {
74            ((corr_est + 0.5) as usize).max(1)
75        }
76    }
77}