twox_hash/
xxhash64.rs

1//! The implementation of XXH64.
2
3use core::{
4    fmt,
5    hash::{self, BuildHasher},
6    mem,
7};
8
9use crate::IntoU64;
10
11// Keeping these constants in this form to match the C code.
12const PRIME64_1: u64 = 0x9E3779B185EBCA87;
13const PRIME64_2: u64 = 0xC2B2AE3D27D4EB4F;
14const PRIME64_3: u64 = 0x165667B19E3779F9;
15const PRIME64_4: u64 = 0x85EBCA77C2B2AE63;
16const PRIME64_5: u64 = 0x27D4EB2F165667C5;
17
18type Lane = u64;
19type Lanes = [Lane; 4];
20type Bytes = [u8; 32];
21
22const BYTES_IN_LANE: usize = mem::size_of::<Bytes>();
23
24#[derive(Clone, PartialEq)]
25struct BufferData(Lanes);
26
27impl BufferData {
28    const fn new() -> Self {
29        Self([0; 4])
30    }
31
32    const fn bytes(&self) -> &Bytes {
33        const _: () = assert!(mem::align_of::<u8>() <= mem::align_of::<Lane>());
34        // SAFETY[bytes]: The alignment of `u64` is at least that of
35        // `u8` and all the values are initialized.
36        unsafe { &*self.0.as_ptr().cast() }
37    }
38
39    fn bytes_mut(&mut self) -> &mut Bytes {
40        // SAFETY: See SAFETY[bytes]
41        unsafe { &mut *self.0.as_mut_ptr().cast() }
42    }
43}
44
45impl fmt::Debug for BufferData {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        f.debug_list().entries(self.0.iter()).finish()
48    }
49}
50
51#[derive(Debug, Clone, PartialEq)]
52struct Buffer {
53    offset: usize,
54    data: BufferData,
55}
56
57impl Buffer {
58    const fn new() -> Self {
59        Self {
60            offset: 0,
61            data: BufferData::new(),
62        }
63    }
64
65    // RATIONALE: See RATIONALE[inline]
66    #[inline]
67    fn extend<'d>(&mut self, data: &'d [u8]) -> (Option<&Lanes>, &'d [u8]) {
68        // Most of the slice methods we use here have `_unchecked` variants, but
69        //
70        // 1. this method is called one time per `Hasher::write` call
71        // 2. this method early exits if we don't have anything in the buffer
72        //
73        // Because of this, removing the panics via `unsafe` doesn't
74        // have much benefit other than reducing code size by a tiny
75        // bit.
76
77        if self.offset == 0 {
78            return (None, data);
79        };
80
81        let bytes = self.data.bytes_mut();
82        debug_assert!(self.offset <= bytes.len());
83
84        let empty = &mut bytes[self.offset..];
85        let n_to_copy = usize::min(empty.len(), data.len());
86
87        let dst = &mut empty[..n_to_copy];
88
89        let (src, rest) = data.split_at(n_to_copy);
90
91        dst.copy_from_slice(src);
92        self.offset += n_to_copy;
93
94        debug_assert!(self.offset <= bytes.len());
95
96        if self.offset == bytes.len() {
97            self.offset = 0;
98            (Some(&self.data.0), rest)
99        } else {
100            (None, rest)
101        }
102    }
103
104    // RATIONALE: See RATIONALE[inline]
105    #[inline]
106    fn set(&mut self, data: &[u8]) {
107        if data.is_empty() {
108            return;
109        }
110
111        debug_assert_eq!(self.offset, 0);
112
113        let n_to_copy = data.len();
114
115        let bytes = self.data.bytes_mut();
116        debug_assert!(n_to_copy < bytes.len());
117
118        bytes[..n_to_copy].copy_from_slice(data);
119        self.offset = data.len();
120    }
121
122    // RATIONALE: See RATIONALE[inline]
123    #[inline]
124    fn remaining(&self) -> &[u8] {
125        &self.data.bytes()[..self.offset]
126    }
127}
128
129#[derive(Clone, PartialEq)]
130struct Accumulators(Lanes);
131
132impl Accumulators {
133    const fn new(seed: u64) -> Self {
134        Self([
135            seed.wrapping_add(PRIME64_1).wrapping_add(PRIME64_2),
136            seed.wrapping_add(PRIME64_2),
137            seed,
138            seed.wrapping_sub(PRIME64_1),
139        ])
140    }
141
142    // RATIONALE: See RATIONALE[inline]
143    #[inline]
144    fn write(&mut self, lanes: Lanes) {
145        let [acc1, acc2, acc3, acc4] = &mut self.0;
146        let [lane1, lane2, lane3, lane4] = lanes;
147
148        *acc1 = round(*acc1, lane1.to_le());
149        *acc2 = round(*acc2, lane2.to_le());
150        *acc3 = round(*acc3, lane3.to_le());
151        *acc4 = round(*acc4, lane4.to_le());
152    }
153
154    // RATIONALE: See RATIONALE[inline]
155    #[inline]
156    fn write_many<'d>(&mut self, mut data: &'d [u8]) -> &'d [u8] {
157        while let Some((chunk, rest)) = data.split_first_chunk::<BYTES_IN_LANE>() {
158            // SAFETY: We have the right number of bytes and are
159            // handling the unaligned case.
160            let lanes = unsafe { chunk.as_ptr().cast::<Lanes>().read_unaligned() };
161            self.write(lanes);
162            data = rest;
163        }
164        data
165    }
166
167    // RATIONALE: See RATIONALE[inline]
168    #[inline]
169    const fn finish(&self) -> u64 {
170        let [acc1, acc2, acc3, acc4] = self.0;
171
172        let mut acc = {
173            let acc1 = acc1.rotate_left(1);
174            let acc2 = acc2.rotate_left(7);
175            let acc3 = acc3.rotate_left(12);
176            let acc4 = acc4.rotate_left(18);
177
178            acc1.wrapping_add(acc2)
179                .wrapping_add(acc3)
180                .wrapping_add(acc4)
181        };
182
183        acc = Self::merge_accumulator(acc, acc1);
184        acc = Self::merge_accumulator(acc, acc2);
185        acc = Self::merge_accumulator(acc, acc3);
186        acc = Self::merge_accumulator(acc, acc4);
187
188        acc
189    }
190
191    // RATIONALE: See RATIONALE[inline]
192    #[inline]
193    const fn merge_accumulator(mut acc: u64, acc_n: u64) -> u64 {
194        acc ^= round(0, acc_n);
195        acc = acc.wrapping_mul(PRIME64_1);
196        acc.wrapping_add(PRIME64_4)
197    }
198}
199
200impl fmt::Debug for Accumulators {
201    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
202        let [acc1, acc2, acc3, acc4] = self.0;
203        f.debug_struct("Accumulators")
204            .field("acc1", &acc1)
205            .field("acc2", &acc2)
206            .field("acc3", &acc3)
207            .field("acc4", &acc4)
208            .finish()
209    }
210}
211
212/// Calculates the 64-bit hash.
213#[derive(Debug, Clone, PartialEq)]
214pub struct Hasher {
215    seed: u64,
216    accumulators: Accumulators,
217    buffer: Buffer,
218    length: u64,
219}
220
221impl Default for Hasher {
222    fn default() -> Self {
223        Self::with_seed(0)
224    }
225}
226
227impl Hasher {
228    /// Hash all data at once. If you can use this function, you may
229    /// see noticable speed gains for certain types of input.
230    #[must_use]
231    // RATIONALE[inline]:
232    //
233    // These `inline`s help unlock a speedup in one benchmark [1] from
234    // ~900µs to ~200µs.
235    //
236    // Further inspection of the disassembly showed that various
237    // helper functions were not being inlined. Avoiding these few
238    // function calls wins us the tiniest performance increase, just
239    // enough so that we are neck-and-neck with (or slightly faster
240    // than!) the C code.
241    //
242    // This results in the entire hash computation being inlined at
243    // the call site.
244    //
245    // [1]: https://github.com/apache/datafusion-comet/pull/575
246    #[inline]
247    pub fn oneshot(seed: u64, data: &[u8]) -> u64 {
248        let len = data.len();
249
250        // Since we know that there's no more data coming, we don't
251        // need to construct the intermediate buffers or copy data to
252        // or from the buffers.
253
254        let mut accumulators = Accumulators::new(seed);
255
256        let data = accumulators.write_many(data);
257
258        Self::finish_with(seed, len.into_u64(), &accumulators, data)
259    }
260
261    /// Constructs the hasher with an initial seed.
262    #[must_use]
263    pub const fn with_seed(seed: u64) -> Self {
264        // Step 1. Initialize internal accumulators
265        Self {
266            seed,
267            accumulators: Accumulators::new(seed),
268            buffer: Buffer::new(),
269            length: 0,
270        }
271    }
272
273    /// The seed this hasher was created with.
274    pub const fn seed(&self) -> u64 {
275        self.seed
276    }
277
278    /// The total number of bytes hashed.
279    pub const fn total_len(&self) -> u64 {
280        self.length
281    }
282
283    #[must_use]
284    // RATIONALE: See RATIONALE[inline]
285    #[inline]
286    fn finish_with(seed: u64, len: u64, accumulators: &Accumulators, mut remaining: &[u8]) -> u64 {
287        // Step 3. Accumulator convergence
288        let mut acc = if len < BYTES_IN_LANE.into_u64() {
289            seed.wrapping_add(PRIME64_5)
290        } else {
291            accumulators.finish()
292        };
293
294        // Step 4. Add input length
295        acc += len;
296
297        // Step 5. Consume remaining input
298        while let Some((chunk, rest)) = remaining.split_first_chunk() {
299            let lane = u64::from_ne_bytes(*chunk).to_le();
300
301            acc ^= round(0, lane);
302            acc = acc.rotate_left(27).wrapping_mul(PRIME64_1);
303            acc = acc.wrapping_add(PRIME64_4);
304            remaining = rest;
305        }
306
307        while let Some((chunk, rest)) = remaining.split_first_chunk() {
308            let lane = u32::from_ne_bytes(*chunk).to_le().into_u64();
309
310            acc ^= lane.wrapping_mul(PRIME64_1);
311            acc = acc.rotate_left(23).wrapping_mul(PRIME64_2);
312            acc = acc.wrapping_add(PRIME64_3);
313
314            remaining = rest;
315        }
316
317        for &byte in remaining {
318            let lane = byte.into_u64();
319
320            acc ^= lane.wrapping_mul(PRIME64_5);
321            acc = acc.rotate_left(11).wrapping_mul(PRIME64_1);
322        }
323
324        // Step 6. Final mix (avalanche)
325        acc ^= acc >> 33;
326        acc = acc.wrapping_mul(PRIME64_2);
327        acc ^= acc >> 29;
328        acc = acc.wrapping_mul(PRIME64_3);
329        acc ^= acc >> 32;
330
331        acc
332    }
333}
334
335impl hash::Hasher for Hasher {
336    // RATIONALE: See RATIONALE[inline]
337    #[inline]
338    fn write(&mut self, data: &[u8]) {
339        let len = data.len();
340
341        // Step 2. Process stripes
342        let (buffered_lanes, data) = self.buffer.extend(data);
343
344        if let Some(&lanes) = buffered_lanes {
345            self.accumulators.write(lanes);
346        }
347
348        let data = self.accumulators.write_many(data);
349
350        self.buffer.set(data);
351
352        self.length += len.into_u64();
353    }
354
355    // RATIONALE: See RATIONALE[inline]
356    #[inline]
357    fn finish(&self) -> u64 {
358        Self::finish_with(
359            self.seed,
360            self.length,
361            &self.accumulators,
362            self.buffer.remaining(),
363        )
364    }
365}
366
367// RATIONALE: See RATIONALE[inline]
368#[inline]
369const fn round(mut acc: u64, lane: u64) -> u64 {
370    acc = acc.wrapping_add(lane.wrapping_mul(PRIME64_2));
371    acc = acc.rotate_left(31);
372    acc.wrapping_mul(PRIME64_1)
373}
374
375/// Constructs [`Hasher`][] for multiple hasher instances.
376#[derive(Clone)]
377pub struct State(u64);
378
379impl State {
380    /// Constructs the hasher with an initial seed.
381    pub fn with_seed(seed: u64) -> Self {
382        Self(seed)
383    }
384}
385
386impl BuildHasher for State {
387    type Hasher = Hasher;
388
389    fn build_hasher(&self) -> Self::Hasher {
390        Hasher::with_seed(self.0)
391    }
392}
393
394#[cfg(test)]
395mod test {
396    use core::{
397        array,
398        hash::{BuildHasherDefault, Hasher as _},
399    };
400    use std::collections::HashMap;
401
402    use super::*;
403
404    const _TRAITS: () = {
405        const fn is_clone<T: Clone>() {}
406        is_clone::<Hasher>();
407        is_clone::<State>();
408    };
409
410    const EMPTY_BYTES: [u8; 0] = [];
411
412    #[test]
413    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
414        let bytes = [0x9c; 32];
415
416        let mut byte_by_byte = Hasher::with_seed(0);
417        for byte in bytes.chunks(1) {
418            byte_by_byte.write(byte);
419        }
420        let byte_by_byte = byte_by_byte.finish();
421
422        let mut one_chunk = Hasher::with_seed(0);
423        one_chunk.write(&bytes);
424        let one_chunk = one_chunk.finish();
425
426        assert_eq!(byte_by_byte, one_chunk);
427    }
428
429    #[test]
430    fn hash_of_nothing_matches_c_implementation() {
431        let mut hasher = Hasher::with_seed(0);
432        hasher.write(&EMPTY_BYTES);
433        assert_eq!(hasher.finish(), 0xef46_db37_51d8_e999);
434    }
435
436    #[test]
437    fn hash_of_single_byte_matches_c_implementation() {
438        let mut hasher = Hasher::with_seed(0);
439        hasher.write(&[42]);
440        assert_eq!(hasher.finish(), 0x0a9e_dece_beb0_3ae4);
441    }
442
443    #[test]
444    fn hash_of_multiple_bytes_matches_c_implementation() {
445        let mut hasher = Hasher::with_seed(0);
446        hasher.write(b"Hello, world!\0");
447        assert_eq!(hasher.finish(), 0x7b06_c531_ea43_e89f);
448    }
449
450    #[test]
451    fn hash_of_multiple_chunks_matches_c_implementation() {
452        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
453        let mut hasher = Hasher::with_seed(0);
454        hasher.write(&bytes);
455        assert_eq!(hasher.finish(), 0x6ac1_e580_3216_6597);
456    }
457
458    #[test]
459    fn hash_with_different_seed_matches_c_implementation() {
460        let mut hasher = Hasher::with_seed(0xae05_4331_1b70_2d91);
461        hasher.write(&EMPTY_BYTES);
462        assert_eq!(hasher.finish(), 0x4b6a_04fc_df7a_4672);
463    }
464
465    #[test]
466    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
467        let bytes: [u8; 100] = array::from_fn(|i| i as u8);
468        let mut hasher = Hasher::with_seed(0xae05_4331_1b70_2d91);
469        hasher.write(&bytes);
470        assert_eq!(hasher.finish(), 0x567e_355e_0682_e1f1);
471    }
472
473    #[test]
474    fn hashes_with_different_offsets_are_the_same() {
475        let bytes = [0x7c; 4096];
476        let expected = Hasher::oneshot(0, &[0x7c; 64]);
477
478        let the_same = bytes
479            .windows(64)
480            .map(|w| {
481                let mut hasher = Hasher::with_seed(0);
482                hasher.write(w);
483                hasher.finish()
484            })
485            .all(|h| h == expected);
486        assert!(the_same);
487    }
488
489    #[test]
490    fn can_be_used_in_a_hashmap_with_a_default_seed() {
491        let mut hash: HashMap<_, _, BuildHasherDefault<Hasher>> = Default::default();
492        hash.insert(42, "the answer");
493        assert_eq!(hash.get(&42), Some(&"the answer"));
494    }
495}
496
497#[cfg(feature = "random")]
498#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
499mod random_impl {
500    use super::*;
501
502    /// Constructs a randomized seed and reuses it for multiple hasher
503    /// instances.
504    #[derive(Clone)]
505    pub struct RandomState(State);
506
507    impl Default for RandomState {
508        fn default() -> Self {
509            Self::new()
510        }
511    }
512
513    impl RandomState {
514        fn new() -> Self {
515            Self(State::with_seed(rand::random()))
516        }
517    }
518
519    impl BuildHasher for RandomState {
520        type Hasher = Hasher;
521
522        fn build_hasher(&self) -> Self::Hasher {
523            self.0.build_hasher()
524        }
525    }
526
527    #[cfg(test)]
528    mod test {
529        use std::collections::HashMap;
530
531        use super::*;
532
533        const _TRAITS: () = {
534            const fn is_clone<T: Clone>() {}
535            is_clone::<RandomState>();
536        };
537
538        #[test]
539        fn can_be_used_in_a_hashmap_with_a_random_seed() {
540            let mut hash: HashMap<_, _, RandomState> = Default::default();
541            hash.insert(42, "the answer");
542            assert_eq!(hash.get(&42), Some(&"the answer"));
543        }
544    }
545}
546
547#[cfg(feature = "random")]
548#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
549pub use random_impl::*;
550
551#[cfg(feature = "serialize")]
552#[cfg_attr(docsrs, doc(cfg(feature = "serialize")))]
553mod serialize_impl {
554    use serde::{Deserialize, Serialize};
555
556    use super::*;
557
558    impl<'de> Deserialize<'de> for Hasher {
559        fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
560        where
561            D: serde::Deserializer<'de>,
562        {
563            let shim = Deserialize::deserialize(deserializer)?;
564
565            let Shim {
566                total_len,
567                seed,
568                core,
569                buffer,
570                buffer_usage,
571            } = shim;
572            let Core { v1, v2, v3, v4 } = core;
573
574            let mut buffer_data = BufferData::new();
575            buffer_data.bytes_mut().copy_from_slice(&buffer);
576
577            Ok(Hasher {
578                seed,
579                accumulators: Accumulators([v1, v2, v3, v4]),
580                buffer: Buffer {
581                    offset: buffer_usage,
582                    data: buffer_data,
583                },
584                length: total_len,
585            })
586        }
587    }
588
589    impl Serialize for Hasher {
590        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
591        where
592            S: serde::Serializer,
593        {
594            let Hasher {
595                seed,
596                ref accumulators,
597                ref buffer,
598                length,
599            } = *self;
600            let [v1, v2, v3, v4] = accumulators.0;
601            let Buffer { offset, ref data } = *buffer;
602            let buffer = *data.bytes();
603
604            let shim = Shim {
605                total_len: length,
606                seed,
607                core: Core { v1, v2, v3, v4 },
608                buffer,
609                buffer_usage: offset,
610            };
611
612            shim.serialize(serializer)
613        }
614    }
615
616    #[derive(Serialize, Deserialize)]
617    struct Shim {
618        total_len: u64,
619        seed: u64,
620        core: Core,
621        buffer: [u8; 32],
622        buffer_usage: usize,
623    }
624
625    #[derive(Serialize, Deserialize)]
626    struct Core {
627        v1: u64,
628        v2: u64,
629        v3: u64,
630        v4: u64,
631    }
632
633    #[cfg(test)]
634    mod test {
635        use std::hash::Hasher as _;
636
637        use super::*;
638
639        type Result<T = (), E = serde_json::Error> = core::result::Result<T, E>;
640
641        #[test]
642        fn test_serialization_cycle() -> Result {
643            let mut hasher = Hasher::with_seed(0);
644            hasher.write(b"Hello, world!\0");
645            let _ = hasher.finish();
646
647            let serialized = serde_json::to_string(&hasher)?;
648            let unserialized: Hasher = serde_json::from_str(&serialized)?;
649            assert_eq!(hasher, unserialized);
650            Ok(())
651        }
652
653        #[test]
654        fn test_serialization_stability() -> Result {
655            let mut hasher = Hasher::with_seed(0);
656            hasher.write(b"Hello, world!\0");
657            let _ = hasher.finish();
658
659            let expected_serialized = r#"{
660                "total_len": 14,
661                "seed": 0,
662                "core": {
663                  "v1": 6983438078262162902,
664                  "v2": 14029467366897019727,
665                  "v3": 0,
666                  "v4": 7046029288634856825
667                },
668                "buffer": [
669                  72,  101, 108, 108, 111, 44, 32, 119,
670                  111, 114, 108, 100, 33,  0,  0,  0,
671                  0,   0,   0,   0,   0,   0,  0,  0,
672                  0,   0,   0,   0,   0,   0,  0,  0
673                ],
674                "buffer_usage": 14
675            }"#;
676
677            let unserialized: Hasher = serde_json::from_str(expected_serialized)?;
678            assert_eq!(hasher, unserialized);
679
680            let expected_value: serde_json::Value = serde_json::from_str(expected_serialized)?;
681            let actual_value = serde_json::to_value(&hasher)?;
682            assert_eq!(expected_value, actual_value);
683
684            Ok(())
685        }
686    }
687}