1use std::hash::BuildHasher;
2
3use arrow::bitmap::utils::get_bit_unchecked;
4use polars_utils::aliases::PlSeedableRandomStateQuality;
5use polars_utils::hashing::{_boost_hash_combine, folded_multiply};
6use polars_utils::total_ord::{ToTotalOrd, TotalHash};
7use rayon::prelude::*;
8use xxhash_rust::xxh3::xxh3_64_with_seed;
9
10use super::*;
11use crate::POOL;
12use crate::prelude::*;
13use crate::series::implementations::null::NullChunked;
14
15const MULTIPLE: u64 = 6364136223846793005;
17
18pub trait VecHash {
23 fn vec_hash(
25 &self,
26 _random_state: PlSeedableRandomStateQuality,
27 _buf: &mut Vec<u64>,
28 ) -> PolarsResult<()>;
29
30 fn vec_hash_combine(
31 &self,
32 _random_state: PlSeedableRandomStateQuality,
33 _hashes: &mut [u64],
34 ) -> PolarsResult<()>;
35}
36
37pub(crate) fn get_null_hash_value(random_state: &PlSeedableRandomStateQuality) -> u64 {
38 let first = random_state.hash_one(3188347919usize);
41 random_state.hash_one(first)
42}
43
44fn insert_null_hash(
45 chunks: &[ArrayRef],
46 random_state: PlSeedableRandomStateQuality,
47 buf: &mut Vec<u64>,
48) {
49 let null_h = get_null_hash_value(&random_state);
50 let hashes = buf.as_mut_slice();
51
52 let mut offset = 0;
53 chunks.iter().for_each(|arr| {
54 if arr.null_count() > 0 {
55 let validity = arr.validity().unwrap();
56 let (slice, byte_offset, _) = validity.as_slice();
57 (0..validity.len())
58 .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) })
59 .zip(&mut hashes[offset..])
60 .for_each(|(valid, h)| {
61 *h = [null_h, *h][valid as usize];
62 })
63 }
64 offset += arr.len();
65 });
66}
67
68fn numeric_vec_hash<T>(
69 ca: &ChunkedArray<T>,
70 random_state: PlSeedableRandomStateQuality,
71 buf: &mut Vec<u64>,
72) where
73 T: PolarsNumericType,
74 T::Native: TotalHash + ToTotalOrd,
75 <T::Native as ToTotalOrd>::TotalOrdItem: Hash,
76{
77 buf.clear();
84 buf.reserve(ca.len());
85
86 #[allow(unused_unsafe)]
87 #[allow(clippy::useless_transmute)]
88 ca.downcast_iter().for_each(|arr| {
89 buf.extend(
90 arr.values()
91 .as_slice()
92 .iter()
93 .copied()
94 .map(|v| random_state.hash_one(v.to_total_ord())),
95 );
96 });
97 insert_null_hash(&ca.chunks, random_state, buf)
98}
99
100fn numeric_vec_hash_combine<T>(
101 ca: &ChunkedArray<T>,
102 random_state: PlSeedableRandomStateQuality,
103 hashes: &mut [u64],
104) where
105 T: PolarsNumericType,
106 T::Native: TotalHash + ToTotalOrd,
107 <T::Native as ToTotalOrd>::TotalOrdItem: Hash,
108{
109 let null_h = get_null_hash_value(&random_state);
110
111 let mut offset = 0;
112 ca.downcast_iter().for_each(|arr| {
113 match arr.null_count() {
114 0 => arr
115 .values()
116 .as_slice()
117 .iter()
118 .zip(&mut hashes[offset..])
119 .for_each(|(v, h)| {
120 *h = folded_multiply(
122 random_state.hash_one(v.to_total_ord()) ^ folded_multiply(*h, MULTIPLE),
125 MULTIPLE,
126 );
127 }),
128 _ => {
129 let validity = arr.validity().unwrap();
130 let (slice, byte_offset, _) = validity.as_slice();
131 (0..validity.len())
132 .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) })
133 .zip(&mut hashes[offset..])
134 .zip(arr.values().as_slice())
135 .for_each(|((valid, h), l)| {
136 let lh = random_state.hash_one(l.to_total_ord());
137 let to_hash = [null_h, lh][valid as usize];
138 *h = folded_multiply(to_hash ^ folded_multiply(*h, MULTIPLE), MULTIPLE);
139 });
140 },
141 }
142 offset += arr.len();
143 });
144}
145
146macro_rules! vec_hash_numeric {
147 ($ca:ident) => {
148 impl VecHash for $ca {
149 fn vec_hash(
150 &self,
151 random_state: PlSeedableRandomStateQuality,
152 buf: &mut Vec<u64>,
153 ) -> PolarsResult<()> {
154 numeric_vec_hash(self, random_state, buf);
155 Ok(())
156 }
157
158 fn vec_hash_combine(
159 &self,
160 random_state: PlSeedableRandomStateQuality,
161 hashes: &mut [u64],
162 ) -> PolarsResult<()> {
163 numeric_vec_hash_combine(self, random_state, hashes);
164 Ok(())
165 }
166 }
167 };
168}
169
170vec_hash_numeric!(Int64Chunked);
171vec_hash_numeric!(Int32Chunked);
172vec_hash_numeric!(Int16Chunked);
173vec_hash_numeric!(Int8Chunked);
174vec_hash_numeric!(UInt64Chunked);
175vec_hash_numeric!(UInt32Chunked);
176vec_hash_numeric!(UInt16Chunked);
177vec_hash_numeric!(UInt8Chunked);
178vec_hash_numeric!(Float64Chunked);
179vec_hash_numeric!(Float32Chunked);
180#[cfg(feature = "dtype-f16")]
181vec_hash_numeric!(Float16Chunked);
182#[cfg(feature = "dtype-u128")]
183vec_hash_numeric!(UInt128Chunked);
184#[cfg(any(feature = "dtype-decimal", feature = "dtype-i128"))]
185vec_hash_numeric!(Int128Chunked);
186
187impl VecHash for StringChunked {
188 fn vec_hash(
189 &self,
190 random_state: PlSeedableRandomStateQuality,
191 buf: &mut Vec<u64>,
192 ) -> PolarsResult<()> {
193 self.as_binary().vec_hash(random_state, buf)?;
194 Ok(())
195 }
196
197 fn vec_hash_combine(
198 &self,
199 random_state: PlSeedableRandomStateQuality,
200 hashes: &mut [u64],
201 ) -> PolarsResult<()> {
202 self.as_binary().vec_hash_combine(random_state, hashes)?;
203 Ok(())
204 }
205}
206
207pub fn _hash_binary_array(
209 arr: &BinaryArray<i64>,
210 random_state: PlSeedableRandomStateQuality,
211 buf: &mut Vec<u64>,
212) {
213 let null_h = get_null_hash_value(&random_state);
214 if arr.null_count() == 0 {
215 buf.extend(arr.values_iter().map(|v| xxh3_64_with_seed(v, null_h)))
217 } else {
218 buf.extend(arr.into_iter().map(|opt_v| match opt_v {
219 Some(v) => xxh3_64_with_seed(v, null_h),
220 None => null_h,
221 }))
222 }
223}
224
225fn hash_binview_array(
226 arr: &BinaryViewArray,
227 random_state: PlSeedableRandomStateQuality,
228 buf: &mut Vec<u64>,
229) {
230 let null_h = get_null_hash_value(&random_state);
231 if arr.null_count() == 0 {
232 buf.extend(arr.values_iter().map(|v| xxh3_64_with_seed(v, null_h)))
234 } else {
235 buf.extend(arr.into_iter().map(|opt_v| match opt_v {
236 Some(v) => xxh3_64_with_seed(v, null_h),
237 None => null_h,
238 }))
239 }
240}
241
242impl VecHash for BinaryChunked {
243 fn vec_hash(
244 &self,
245 random_state: PlSeedableRandomStateQuality,
246 buf: &mut Vec<u64>,
247 ) -> PolarsResult<()> {
248 buf.clear();
249 buf.reserve(self.len());
250 self.downcast_iter()
251 .for_each(|arr| hash_binview_array(arr, random_state.clone(), buf));
252 Ok(())
253 }
254
255 fn vec_hash_combine(
256 &self,
257 random_state: PlSeedableRandomStateQuality,
258 hashes: &mut [u64],
259 ) -> PolarsResult<()> {
260 let null_h = get_null_hash_value(&random_state);
261
262 let mut offset = 0;
263 self.downcast_iter().for_each(|arr| {
264 match arr.null_count() {
265 0 => arr
266 .values_iter()
267 .zip(&mut hashes[offset..])
268 .for_each(|(v, h)| {
269 let l = xxh3_64_with_seed(v, null_h);
270 *h = _boost_hash_combine(l, *h)
271 }),
272 _ => {
273 let validity = arr.validity().unwrap();
274 let (slice, byte_offset, _) = validity.as_slice();
275 (0..validity.len())
276 .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) })
277 .zip(&mut hashes[offset..])
278 .zip(arr.values_iter())
279 .for_each(|((valid, h), l)| {
280 let l = if valid {
281 xxh3_64_with_seed(l, null_h)
282 } else {
283 null_h
284 };
285 *h = _boost_hash_combine(l, *h)
286 });
287 },
288 }
289 offset += arr.len();
290 });
291 Ok(())
292 }
293}
294
295impl VecHash for BinaryOffsetChunked {
296 fn vec_hash(
297 &self,
298 random_state: PlSeedableRandomStateQuality,
299 buf: &mut Vec<u64>,
300 ) -> PolarsResult<()> {
301 buf.clear();
302 buf.reserve(self.len());
303 self.downcast_iter()
304 .for_each(|arr| _hash_binary_array(arr, random_state.clone(), buf));
305 Ok(())
306 }
307
308 fn vec_hash_combine(
309 &self,
310 random_state: PlSeedableRandomStateQuality,
311 hashes: &mut [u64],
312 ) -> PolarsResult<()> {
313 let null_h = get_null_hash_value(&random_state);
314
315 let mut offset = 0;
316 self.downcast_iter().for_each(|arr| {
317 match arr.null_count() {
318 0 => arr
319 .values_iter()
320 .zip(&mut hashes[offset..])
321 .for_each(|(v, h)| {
322 let l = xxh3_64_with_seed(v, null_h);
323 *h = _boost_hash_combine(l, *h)
324 }),
325 _ => {
326 let validity = arr.validity().unwrap();
327 let (slice, byte_offset, _) = validity.as_slice();
328 (0..validity.len())
329 .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) })
330 .zip(&mut hashes[offset..])
331 .zip(arr.values_iter())
332 .for_each(|((valid, h), l)| {
333 let l = if valid {
334 xxh3_64_with_seed(l, null_h)
335 } else {
336 null_h
337 };
338 *h = _boost_hash_combine(l, *h)
339 });
340 },
341 }
342 offset += arr.len();
343 });
344 Ok(())
345 }
346}
347
348impl VecHash for NullChunked {
349 fn vec_hash(
350 &self,
351 random_state: PlSeedableRandomStateQuality,
352 buf: &mut Vec<u64>,
353 ) -> PolarsResult<()> {
354 let null_h = get_null_hash_value(&random_state);
355 buf.clear();
356 buf.resize(self.len(), null_h);
357 Ok(())
358 }
359
360 fn vec_hash_combine(
361 &self,
362 random_state: PlSeedableRandomStateQuality,
363 hashes: &mut [u64],
364 ) -> PolarsResult<()> {
365 let null_h = get_null_hash_value(&random_state);
366 hashes
367 .iter_mut()
368 .for_each(|h| *h = _boost_hash_combine(null_h, *h));
369 Ok(())
370 }
371}
372impl VecHash for BooleanChunked {
373 fn vec_hash(
374 &self,
375 random_state: PlSeedableRandomStateQuality,
376 buf: &mut Vec<u64>,
377 ) -> PolarsResult<()> {
378 buf.clear();
379 buf.reserve(self.len());
380 let true_h = random_state.hash_one(true);
381 let false_h = random_state.hash_one(false);
382 let null_h = get_null_hash_value(&random_state);
383 self.downcast_iter().for_each(|arr| {
384 if arr.null_count() == 0 {
385 buf.extend(arr.values_iter().map(|v| if v { true_h } else { false_h }))
386 } else {
387 buf.extend(arr.into_iter().map(|opt_v| match opt_v {
388 Some(true) => true_h,
389 Some(false) => false_h,
390 None => null_h,
391 }))
392 }
393 });
394 Ok(())
395 }
396
397 fn vec_hash_combine(
398 &self,
399 random_state: PlSeedableRandomStateQuality,
400 hashes: &mut [u64],
401 ) -> PolarsResult<()> {
402 let true_h = random_state.hash_one(true);
403 let false_h = random_state.hash_one(false);
404 let null_h = get_null_hash_value(&random_state);
405
406 let mut offset = 0;
407 self.downcast_iter().for_each(|arr| {
408 match arr.null_count() {
409 0 => arr
410 .values_iter()
411 .zip(&mut hashes[offset..])
412 .for_each(|(v, h)| {
413 let l = if v { true_h } else { false_h };
414 *h = _boost_hash_combine(l, *h)
415 }),
416 _ => {
417 let validity = arr.validity().unwrap();
418 let (slice, byte_offset, _) = validity.as_slice();
419 (0..validity.len())
420 .map(|i| unsafe { get_bit_unchecked(slice, i + byte_offset) })
421 .zip(&mut hashes[offset..])
422 .zip(arr.values())
423 .for_each(|((valid, h), l)| {
424 let l = if valid {
425 if l { true_h } else { false_h }
426 } else {
427 null_h
428 };
429 *h = _boost_hash_combine(l, *h)
430 });
431 },
432 }
433 offset += arr.len();
434 });
435 Ok(())
436 }
437}
438
439#[cfg(feature = "object")]
440impl<T> VecHash for ObjectChunked<T>
441where
442 T: PolarsObject,
443{
444 fn vec_hash(
445 &self,
446 random_state: PlSeedableRandomStateQuality,
447 buf: &mut Vec<u64>,
448 ) -> PolarsResult<()> {
449 buf.clear();
456 buf.reserve(self.len());
457
458 self.downcast_iter()
459 .for_each(|arr| buf.extend(arr.into_iter().map(|opt_v| random_state.hash_one(opt_v))));
460
461 Ok(())
462 }
463
464 fn vec_hash_combine(
465 &self,
466 random_state: PlSeedableRandomStateQuality,
467 hashes: &mut [u64],
468 ) -> PolarsResult<()> {
469 self.apply_to_slice(
470 |opt_v, h| {
471 let hashed = random_state.hash_one(opt_v);
472 _boost_hash_combine(hashed, *h)
473 },
474 hashes,
475 );
476 Ok(())
477 }
478}
479
480pub fn _df_rows_to_hashes_threaded_vertical(
481 keys: &[DataFrame],
482 build_hasher: Option<PlSeedableRandomStateQuality>,
483) -> PolarsResult<(Vec<UInt64Chunked>, PlSeedableRandomStateQuality)> {
484 let build_hasher = build_hasher.unwrap_or_default();
485
486 let hashes = POOL.install(|| {
487 keys.into_par_iter()
488 .map(|df| {
489 let hb = build_hasher.clone();
490 let mut hashes = vec![];
491 columns_to_hashes(df.get_columns(), Some(hb), &mut hashes)?;
492 Ok(UInt64Chunked::from_vec(PlSmallStr::EMPTY, hashes))
493 })
494 .collect::<PolarsResult<Vec<_>>>()
495 })?;
496 Ok((hashes, build_hasher))
497}
498
499pub fn columns_to_hashes(
500 keys: &[Column],
501 build_hasher: Option<PlSeedableRandomStateQuality>,
502 hashes: &mut Vec<u64>,
503) -> PolarsResult<PlSeedableRandomStateQuality> {
504 let build_hasher = build_hasher.unwrap_or_default();
505
506 let mut iter = keys.iter();
507 let first = iter.next().expect("at least one key");
508 first.vec_hash(build_hasher.clone(), hashes)?;
509
510 for keys in iter {
511 keys.vec_hash_combine(build_hasher.clone(), hashes)?;
512 }
513
514 Ok(build_hasher)
515}