// polars_utils/hashing.rs

1use std::hash::{Hash, Hasher};
2
3use crate::nulls::IsNull;
4
// Hash combine from c++' boost lib.
// NOTE(review): boost's classic formula uses `l >> 2` where this uses `r >> 2`;
// presumably intentional here — do not "fix" without checking hash stability.
#[inline(always)]
pub fn _boost_hash_combine(l: u64, r: u64) -> u64 {
    let mix = 0x9e3779b9u64.wrapping_add(l << 6).wrapping_add(r >> 2);
    l ^ r.wrapping_add(mix)
}
10
/// Widening 64x64 -> 128-bit multiply, XOR-folding the two halves of the
/// product back into a single `u64`.
#[inline(always)]
pub const fn folded_multiply(a: u64, b: u64) -> u64 {
    let wide = (a as u128).wrapping_mul(b as u128);
    let lo = wide as u64;
    let hi = (wide >> 64) as u64;
    lo ^ hi
}
16
/// Contains a byte slice and a precomputed hash for that string.
/// During rehashes, we will rehash the hash instead of the string, that makes
/// rehashing cheap and allows cache coherent small hash tables.
#[derive(Eq, Copy, Clone, Debug)]
pub struct BytesHash<'a> {
    // `None` encodes a null value; `Some` holds the actual bytes.
    payload: Option<&'a [u8]>,
    // Precomputed hash for `payload`; `Hash` and `PartialEq` lean on it.
    pub(super) hash: u64,
}
25
26impl<'a> BytesHash<'a> {
27    #[inline]
28    pub fn new(s: Option<&'a [u8]>, hash: u64) -> Self {
29        Self { payload: s, hash }
30    }
31}
32
impl<'a> IsNull for BytesHash<'a> {
    // `payload` is an `Option`, so a `BytesHash` can represent null.
    const HAS_NULLS: bool = true;
    type Inner = BytesHash<'a>;

    #[inline(always)]
    fn is_null(&self) -> bool {
        // Null is encoded as a missing payload.
        self.payload.is_none()
    }

    // The "inner" value of a `BytesHash` is the `BytesHash` itself; this only
    // asserts non-null and returns `self` unchanged.
    fn unwrap_inner(self) -> Self::Inner {
        assert!(self.payload.is_some());
        self
    }
}
47
48impl Hash for BytesHash<'_> {
49    fn hash<H: Hasher>(&self, state: &mut H) {
50        state.write_u64(self.hash)
51    }
52}
53
54impl PartialEq for BytesHash<'_> {
55    #[inline]
56    fn eq(&self, other: &Self) -> bool {
57        (self.hash == other.hash) && (self.payload == other.payload)
58    }
59}
60
/// Maps a 64-bit hash to a partition index in `[0, n_partitions)`.
#[inline(always)]
pub fn hash_to_partition(h: u64, n_partitions: usize) -> usize {
    // Assuming h is a 64-bit random number, h / 2^64 is almost a uniform
    // value in [0, 1), so floor(h * n_partitions / 2^64) is an almost uniform
    // integer in [0, n_partitions). Despite the u128 arithmetic this compiles
    // to a single mul / mulhi instruction on x86-64/aarch64.
    let wide = (h as u128) * (n_partitions as u128);
    (wide >> 64) as usize
}
70
/// Maps 64-bit hashes to partition indices in `[0, num_partitions)` after
/// shuffling them with a seeded multiplier.
#[derive(Clone)]
pub struct HashPartitioner {
    num_partitions: usize,
    // Randomized odd multiplier derived from the user-provided seed.
    seed: u64,
}
76
77impl HashPartitioner {
78    /// Creates a new hash partitioner with the given number of partitions and
79    /// seed.
80    #[inline]
81    pub fn new(num_partitions: usize, mut seed: u64) -> Self {
82        assert!(num_partitions > 0);
83        // Make sure seeds bits are properly randomized, and is odd.
84        const ARBITRARY1: u64 = 0x85921e81c41226a0;
85        const ARBITRARY2: u64 = 0x3bc1d0faba166294;
86        const ARBITRARY3: u64 = 0xfbde893e21a73756;
87        seed = folded_multiply(seed ^ ARBITRARY1, ARBITRARY2);
88        seed = folded_multiply(seed, ARBITRARY3);
89        seed |= 1;
90        Self {
91            seed,
92            num_partitions,
93        }
94    }
95
96    /// Converts a hash to a partition. It is guaranteed that the output is
97    /// in the range [0, n_partitions), and that independent HashPartitioners
98    /// that we initialized with the same num_partitions and seed return the same
99    /// partition.
100    #[inline(always)]
101    pub fn hash_to_partition(&self, hash: u64) -> usize {
102        // Assuming r is a 64-bit random number, we note that
103        // r / 2^64 is almost a uniform random number in [0, 1), and thus
104        // floor(r * n_partitions / 2^64) is almost a uniform random integer in
105        // [0, n_partitions). Despite being written with u128 multiplication this
106        // compiles to a single mul / mulhi instruction on x86-x64/aarch64.
107        let shuffled = hash.wrapping_mul(self.seed);
108        ((shuffled as u128 * self.num_partitions as u128) >> 64) as usize
109    }
110
111    #[inline(always)]
112    pub fn num_partitions(&self) -> usize {
113        self.num_partitions
114    }
115}
116
// FIXME: use Hasher interface and support a random state.
pub trait DirtyHash {
    /// A quick and dirty hash. Only the top bits of the hash are decent, such as
    /// used in hash_to_partition.
    fn dirty_hash(&self) -> u64;
}
123
// Multiplication by a 'random' odd number gives a universal hash function in
// the top bits.
const RANDOM_ODD: u64 = 0x55fbfd6bfc5458e9;

// Implements `DirtyHash` for a primitive integer type by casting it to u64
// and multiplying with `RANDOM_ODD`; only the top bits end up well-mixed.
macro_rules! impl_hash_partition_as_u64 {
    ($T: ty) => {
        impl DirtyHash for $T {
            fn dirty_hash(&self) -> u64 {
                (*self as u64).wrapping_mul(RANDOM_ODD)
            }
        }
    };
}
137
// Dirty hashes for every primitive integer type up to 64 bits wide.
impl_hash_partition_as_u64!(u8);
impl_hash_partition_as_u64!(u16);
impl_hash_partition_as_u64!(u32);
impl_hash_partition_as_u64!(u64);
impl_hash_partition_as_u64!(i8);
impl_hash_partition_as_u64!(i16);
impl_hash_partition_as_u64!(i32);
impl_hash_partition_as_u64!(i64);
146
147impl DirtyHash for i128 {
148    fn dirty_hash(&self) -> u64 {
149        (*self as u64)
150            .wrapping_mul(RANDOM_ODD)
151            .wrapping_add((*self >> 64) as u64)
152    }
153}
154
impl DirtyHash for BytesHash<'_> {
    fn dirty_hash(&self) -> u64 {
        // The hash was already computed up front; reuse it as-is.
        self.hash
    }
}
160
impl<T: DirtyHash + ?Sized> DirtyHash for &T {
    fn dirty_hash(&self) -> u64 {
        // Delegate to the referenced value's implementation.
        (*self).dirty_hash()
    }
}
166
167// FIXME: we should probably encourage explicit null handling, but for now we'll
168// allow directly getting a partition from a nullable value.
169impl<T: DirtyHash> DirtyHash for Option<T> {
170    fn dirty_hash(&self) -> u64 {
171        self.as_ref().map(|s| s.dirty_hash()).unwrap_or(0)
172    }
173}