polars_utils/
hashing.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
use std::hash::{Hash, Hasher};

use crate::nulls::IsNull;

pub const fn folded_multiply(a: u64, b: u64) -> u64 {
    let full = (a as u128).wrapping_mul(b as u128);
    (full as u64) ^ ((full >> 64) as u64)
}

/// Contains a byte slice and a precomputed hash for that string.
/// During rehashes, we will rehash the hash instead of the string, that makes
/// rehashing cheap and allows cache coherent small hash tables.
#[derive(Eq, Copy, Clone, Debug)]
pub struct BytesHash<'a> {
    payload: Option<&'a [u8]>,
    pub(super) hash: u64,
}

impl<'a> BytesHash<'a> {
    #[inline]
    pub fn new(s: Option<&'a [u8]>, hash: u64) -> Self {
        Self { payload: s, hash }
    }
}

impl<'a> IsNull for BytesHash<'a> {
    const HAS_NULLS: bool = true;
    type Inner = BytesHash<'a>;

    #[inline(always)]
    fn is_null(&self) -> bool {
        self.payload.is_none()
    }

    fn unwrap_inner(self) -> Self::Inner {
        assert!(self.payload.is_some());
        self
    }
}

impl Hash for BytesHash<'_> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        state.write_u64(self.hash)
    }
}

impl PartialEq for BytesHash<'_> {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        (self.hash == other.hash) && (self.payload == other.payload)
    }
}

#[inline(always)]
pub fn hash_to_partition(h: u64, n_partitions: usize) -> usize {
    // Assuming h is a 64-bit random number, we note that
    // h / 2^64 is almost a uniform random number in [0, 1), and thus
    // floor(h * n_partitions / 2^64) is almost a uniform random integer in
    // [0, n_partitions). Despite being written with u128 multiplication this
    // compiles to a single mul / mulhi instruction on x86-x64/aarch64.
    ((h as u128 * n_partitions as u128) >> 64) as usize
}

#[derive(Clone)]
pub struct HashPartitioner {
    num_partitions: usize,
    seed: u64,
}

impl HashPartitioner {
    /// Creates a new hash partitioner with the given number of partitions and
    /// seed.
    #[inline]
    pub fn new(num_partitions: usize, mut seed: u64) -> Self {
        assert!(num_partitions > 0);
        // Make sure seeds bits are properly randomized, and is odd.
        const ARBITRARY1: u64 = 0x85921e81c41226a0;
        const ARBITRARY2: u64 = 0x3bc1d0faba166294;
        const ARBITRARY3: u64 = 0xfbde893e21a73756;
        seed = folded_multiply(seed ^ ARBITRARY1, ARBITRARY2);
        seed = folded_multiply(seed, ARBITRARY3);
        seed |= 1;
        Self {
            seed,
            num_partitions,
        }
    }

    /// Converts a hash to a partition. It is guaranteed that the output is
    /// in the range [0, n_partitions), and that independent HashPartitioners
    /// that we initialized with the same num_partitions and seed return the same
    /// partition.
    #[inline(always)]
    pub fn hash_to_partition(&self, hash: u64) -> usize {
        // Assuming r is a 64-bit random number, we note that
        // r / 2^64 is almost a uniform random number in [0, 1), and thus
        // floor(r * n_partitions / 2^64) is almost a uniform random integer in
        // [0, n_partitions). Despite being written with u128 multiplication this
        // compiles to a single mul / mulhi instruction on x86-x64/aarch64.
        let shuffled = hash.wrapping_mul(self.seed);
        ((shuffled as u128 * self.num_partitions as u128) >> 64) as usize
    }

    #[inline(always)]
    pub fn num_partitions(&self) -> usize {
        self.num_partitions
    }
}

// FIXME: use Hasher interface and support a random state.
pub trait DirtyHash {
    // A quick and dirty hash. Only the top bits of the hash are decent, such as
    // used in hash_to_partition.
    fn dirty_hash(&self) -> u64;
}

// Multiplication by a 'random' odd number gives a universal hash function in
// the top bits.
const RANDOM_ODD: u64 = 0x55fbfd6bfc5458e9;

macro_rules! impl_hash_partition_as_u64 {
    ($T: ty) => {
        impl DirtyHash for $T {
            fn dirty_hash(&self) -> u64 {
                (*self as u64).wrapping_mul(RANDOM_ODD)
            }
        }
    };
}

impl_hash_partition_as_u64!(u8);
impl_hash_partition_as_u64!(u16);
impl_hash_partition_as_u64!(u32);
impl_hash_partition_as_u64!(u64);
impl_hash_partition_as_u64!(i8);
impl_hash_partition_as_u64!(i16);
impl_hash_partition_as_u64!(i32);
impl_hash_partition_as_u64!(i64);

impl DirtyHash for i128 {
    fn dirty_hash(&self) -> u64 {
        (*self as u64)
            .wrapping_mul(RANDOM_ODD)
            .wrapping_add((*self >> 64) as u64)
    }
}

impl DirtyHash for BytesHash<'_> {
    fn dirty_hash(&self) -> u64 {
        self.hash
    }
}

impl<T: DirtyHash + ?Sized> DirtyHash for &T {
    fn dirty_hash(&self) -> u64 {
        (*self).dirty_hash()
    }
}

// FIXME: we should probably encourage explicit null handling, but for now we'll
// allow directly getting a partition from a nullable value.
impl<T: DirtyHash> DirtyHash for Option<T> {
    fn dirty_hash(&self) -> u64 {
        self.as_ref().map(|s| s.dirty_hash()).unwrap_or(0)
    }
}