revm_precompile/
blake2.rs

1//! Blake2 precompile. More details in [`run`]
2
3use crate::{
4    crypto, Precompile, PrecompileError, PrecompileId, PrecompileOutput, PrecompileResult,
5};
6
/// Gas charged per round of the `F` compression function (EIP-152).
const F_ROUND: u64 = 1;
/// Exact required input length: 4 (rounds) + 64 (h) + 128 (m) + 8 (t_0) + 8 (t_1) + 1 (f) = 213 bytes.
const INPUT_LENGTH: usize = 213;

/// Blake2 precompile, registered at address `0x09`.
pub const FUN: Precompile = Precompile::new(PrecompileId::Blake2F, crate::u64_to_address(9), run);
12
13/// reference: <https://eips.ethereum.org/EIPS/eip-152>
14/// input format:
15/// [4 bytes for rounds][64 bytes for h][128 bytes for m][8 bytes for t_0][8 bytes for t_1][1 byte for f]
16pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
17    if input.len() != INPUT_LENGTH {
18        return Err(PrecompileError::Blake2WrongLength);
19    }
20
21    // Parse number of rounds (4 bytes)
22    let rounds = u32::from_be_bytes(input[..4].try_into().unwrap());
23    let gas_used = rounds as u64 * F_ROUND;
24    if gas_used > gas_limit {
25        return Err(PrecompileError::OutOfGas);
26    }
27
28    // Parse final block flag
29    let f = match input[212] {
30        0 => false,
31        1 => true,
32        _ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
33    };
34
35    // Parse state vector h (8 × u64)
36    let mut h = [0u64; 8];
37    input[4..68]
38        .chunks_exact(8)
39        .enumerate()
40        .for_each(|(i, chunk)| {
41            h[i] = u64::from_le_bytes(chunk.try_into().unwrap());
42        });
43
44    // Parse message block m (16 × u64)
45    let mut m = [0u64; 16];
46    input[68..196]
47        .chunks_exact(8)
48        .enumerate()
49        .for_each(|(i, chunk)| {
50            m[i] = u64::from_le_bytes(chunk.try_into().unwrap());
51        });
52
53    // Parse offset counters
54    let t_0 = u64::from_le_bytes(input[196..204].try_into().unwrap());
55    let t_1 = u64::from_le_bytes(input[204..212].try_into().unwrap());
56
57    crypto().blake2_compress(rounds, &mut h, m, [t_0, t_1], f);
58
59    let mut out = [0u8; 64];
60    for (i, h) in (0..64).step_by(8).zip(h.iter()) {
61        out[i..i + 8].copy_from_slice(&h.to_le_bytes());
62    }
63
64    Ok(PrecompileOutput::new(gas_used, out.into()))
65}
66
67/// Blake2 algorithm
68pub mod algo {
    /// SIGMA from spec: <https://datatracker.ietf.org/doc/html/rfc7693#section-2.7>
    ///
    /// Message-word selection permutations for the 10 distinct rounds;
    /// round indices beyond 9 wrap around (see `round`, which indexes with `r % 10`).
    pub const SIGMA: [[usize; 16]; 10] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
        [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
        [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
        [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
        [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
        [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
        [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
        [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
        [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
    ];

    /// BLAKE2b initialization vector.
    /// got IV from: <https://en.wikipedia.org/wiki/BLAKE_(hash_function)>
    pub const IV: [u64; 8] = [
        0x6a09e667f3bcc908,
        0xbb67ae8584caa73b,
        0x3c6ef372fe94f82b,
        0xa54ff53a5f1d36f1,
        0x510e527fade682d1,
        0x9b05688c2b3e6c1f,
        0x1f83d9abfb41bd6b,
        0x5be0cd19137e2179,
    ];
94
95    #[inline(always)]
96    #[allow(clippy::many_single_char_names)]
97    /// G function: <https://tools.ietf.org/html/rfc7693#section-3.1>
98    fn g(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
99        let mut va = v[a];
100        let mut vb = v[b];
101        let mut vc = v[c];
102        let mut vd = v[d];
103
104        va = va.wrapping_add(vb).wrapping_add(x);
105        vd = (vd ^ va).rotate_right(32);
106        vc = vc.wrapping_add(vd);
107        vb = (vb ^ vc).rotate_right(24);
108
109        va = va.wrapping_add(vb).wrapping_add(y);
110        vd = (vd ^ va).rotate_right(16);
111        vc = vc.wrapping_add(vd);
112        vb = (vb ^ vc).rotate_right(63);
113
114        v[a] = va;
115        v[b] = vb;
116        v[c] = vc;
117        v[d] = vd;
118    }
119
    /// Compression function F takes as an argument the state vector "h",
    /// message block vector "m" (last block is padded with zeros to full
    /// block size, if required), 2w-bit offset counter "t", and final block
    /// indicator flag "f".  Local vector v[0..15] is used in processing.  F
    /// returns a new state vector.  The number of rounds, "r", is 12 for
    /// BLAKE2b and 10 for BLAKE2s.  Rounds are numbered from 0 to r - 1.
    #[allow(clippy::many_single_char_names)]
    pub fn compress(rounds: usize, h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool) {
        #[cfg(all(target_feature = "avx2", feature = "std"))]
        {
            // Only when compiled with the avx2 target feature under std can we
            // perform the runtime CPU check and take the SIMD path.
            if std::is_x86_feature_detected!("avx2") {
                // avx2 is 1.8x more performant than portable implementation.
                // SAFETY: AVX2 support was verified at runtime just above.
                unsafe {
                    super::avx2::compress_block(
                        rounds,
                        &m,
                        h,
                        // Pack the two 64-bit counter halves into one 128-bit count.
                        ((t[1] as u128) << 64) | (t[0] as u128),
                        if f { !0 } else { 0 },
                        0,
                    );
                }
                return;
            }
        }

        // if avx2 is not available, use the fallback portable implementation

        let mut v = [0u64; 16];
        v[..h.len()].copy_from_slice(h); // First half from state.
        v[h.len()..].copy_from_slice(&IV); // Second half from IV.

        // Mix the 128-bit offset counter into v[12] (low) and v[13] (high).
        v[12] ^= t[0];
        v[13] ^= t[1];

        if f {
            v[14] = !v[14] // Invert all bits if the last-block-flag is set.
        }
        // Run the requested number of rounds; the SIGMA schedule repeats with
        // period 10 (see `round`).
        for i in 0..rounds {
            round(&mut v, &m, i);
        }

        // Feed-forward: XOR both halves of the work vector back into the state.
        for i in 0..8 {
            h[i] ^= v[i] ^ v[i + 8];
        }
    }
167
168    #[inline(always)]
169    fn round(v: &mut [u64; 16], m: &[u64; 16], r: usize) {
170        // Message word selection permutation for this round.
171        let s = &SIGMA[r % 10];
172        // g1
173        g(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
174        g(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
175        g(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
176        g(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
177
178        // g2
179        g(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
180        g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
181        g(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
182        g(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
183    }
184}
185
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
//
// Packs four 2-bit lane selectors into one immediate byte, `$w` in the
// lowest bits, `$z` in the highest.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        $w | ($x << 2) | ($y << 4) | ($z << 6)
    };
}
192
193/// Code adapted from https://github.com/oconnor663/blake2_simd/blob/82b3e2aee4d2384aabbeb146058301ff0dbd453f/blake2b/src/avx2.rs
194#[cfg(all(target_feature = "avx2", feature = "std"))]
195#[allow(clippy::ptr_offset_with_cast)] // From array_refs
196mod avx2 {
197    #[cfg(target_arch = "x86")]
198    use core::arch::x86::*;
199    #[cfg(target_arch = "x86_64")]
200    use core::arch::x86_64::*;
201
202    use super::algo::IV;
203    use arrayref::{array_refs, mut_array_refs};
204
    /// 64-bit word type used by BLAKE2b.
    type Word = u64;
    /// 128-bit block offset counter.
    type Count = u128;
    /// The number of input bytes passed to each call to the compression function. Small benchmarks need
    /// to use an even multiple of `BLOCKBYTES`, or else their apparent throughput will be low.
    const BLOCKBYTES: usize = 16 * size_of::<Word>();

    /// Number of 64-bit lanes processed together in one 256-bit vector.
    const DEGREE: usize = 4;
212
    /// Compress a block of data using the BLAKE2 algorithm.
    ///
    /// The eight state words are kept as two 256-bit rows (`a` = words 0..4,
    /// `b` = words 4..8); `c`/`d` hold the IV halves, with the counter and
    /// finalization flags folded into `d`, mirroring the scalar layout.
    ///
    /// # Safety
    ///
    /// The caller must ensure the CPU supports AVX2 (see the
    /// `is_x86_feature_detected!("avx2")` check at the call site in `compress`).
    #[inline(always)]
    pub(crate) unsafe fn compress_block(
        mut rounds: usize,
        block: &[Word; 16],
        words: &mut [Word; 8],
        count: Count,
        last_block: Word,
        last_node: Word,
    ) {
        // Split state and IV into low/high 4-word halves for the vector rows.
        let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
        let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
        let mut a = loadu(words_low);
        let mut b = loadu(words_high);
        let mut c = loadu(iv_low);
        // Row `d` = IV high half XOR (counter low, counter high, f0, f1 flags).
        let flags = set4(count_low(count), count_high(count), last_block, last_node);
        let mut d = xor(loadu(iv_high), flags);

        // Reinterpret the 16 message words as raw bytes and broadcast each
        // 16-byte pair of words across both 128-bit lanes of a register.
        let block: &[u8; BLOCKBYTES] = std::mem::transmute(block);
        let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
        let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
        let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
        let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
        let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
        let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
        let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
        let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
        let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

        // Original top rows, needed for the final feed-forward XOR.
        let iv0 = a;
        let iv1 = b;
        let mut t0;
        let mut t1;
        let mut b0;

        // The loop body unrolls the 10 distinct SIGMA rounds; the
        // check-and-decrement before each one supports an arbitrary EIP-152
        // round count, and wrapping back to "round 1" matches the `r % 10`
        // schedule of the portable implementation.
        loop {
            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 1
            t0 = _mm256_unpacklo_epi64(m0, m1);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m0, m1);
            t1 = _mm256_unpackhi_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m7, m4);
            t1 = _mm256_unpacklo_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_unpackhi_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 2
            t0 = _mm256_unpacklo_epi64(m7, m2);
            t1 = _mm256_unpackhi_epi64(m4, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_alignr_epi8(m3, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m2, m0);
            t1 = _mm256_blend_epi32(m5, m0, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m6, m1, 8);
            t1 = _mm256_blend_epi32(m3, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 3
            t0 = _mm256_alignr_epi8(m6, m5, 8);
            t1 = _mm256_unpackhi_epi64(m2, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m6, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m5, m4, 8);
            t1 = _mm256_unpackhi_epi64(m1, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m2, m7);
            t1 = _mm256_blend_epi32(m0, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 4
            t0 = _mm256_unpackhi_epi64(m3, m1);
            t1 = _mm256_unpackhi_epi64(m6, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m0);
            t1 = _mm256_unpacklo_epi64(m6, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m1, m7, 8);
            t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m3);
            t1 = _mm256_unpacklo_epi64(m5, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 5
            t0 = _mm256_unpackhi_epi64(m4, m2);
            t1 = _mm256_unpacklo_epi64(m1, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m3, m0, 0x33);
            t1 = _mm256_blend_epi32(m7, m2, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m7, m1, 8);
            t1 = _mm256_alignr_epi8(m3, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m6, m0);
            t1 = _mm256_unpacklo_epi64(m6, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 6
            t0 = _mm256_unpacklo_epi64(m1, m3);
            t1 = _mm256_unpacklo_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m6, m5);
            t1 = _mm256_unpackhi_epi64(m5, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m2, m0, 8);
            t1 = _mm256_unpackhi_epi64(m3, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m6);
            t1 = _mm256_alignr_epi8(m7, m2, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 7
            t0 = _mm256_blend_epi32(m0, m6, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m2);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m2, m7);
            t1 = _mm256_alignr_epi8(m5, m6, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m4, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m5, m3);
            t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;
            // round 8
            t0 = _mm256_unpackhi_epi64(m6, m3);
            t1 = _mm256_blend_epi32(m1, m6, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m7, m5, 8);
            t1 = _mm256_unpackhi_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_blend_epi32(m2, m1, 0x33);
            t1 = _mm256_alignr_epi8(m4, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m0);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 9
            t0 = _mm256_unpacklo_epi64(m3, m7);
            t1 = _mm256_alignr_epi8(m0, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_alignr_epi8(m4, m1, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m5, m6);
            t1 = _mm256_unpackhi_epi64(m6, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m1, m2, 8);
            t1 = _mm256_alignr_epi8(m2, m3, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 10
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_unpackhi_epi64(m3, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m1, m2);
            t1 = _mm256_blend_epi32(m2, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m6, m7);
            t1 = _mm256_unpackhi_epi64(m4, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m5, m0, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            // last two rounds are removed: SIGMA has period 10, so wrapping
            // back to round 1 continues the schedule correctly.
        }
        // Feed-forward: fold the working rows back into the original state.
        a = xor(a, c);
        b = xor(b, d);
        a = xor(a, iv0);
        b = xor(b, iv1);

        storeu(a, words_low);
        storeu(b, words_high);
    }
508
509    #[inline(always)]
510    pub(crate) fn count_low(count: Count) -> Word {
511        count as Word
512    }
513
514    #[inline(always)]
515    pub(crate) fn count_high(count: Count) -> Word {
516        (count >> Word::BITS as usize) as Word
517    }
518
    /// Unaligned 256-bit load of four consecutive `Word`s.
    #[inline(always)]
    unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
        // This is an unaligned load, so the pointer cast is allowed.
        _mm256_loadu_si256(src as *const __m256i)
    }

    /// Unaligned 256-bit store of four `Word`s.
    #[inline(always)]
    unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
        // This is an unaligned store, so the pointer cast is allowed.
        _mm256_storeu_si256(dest as *mut __m256i, src)
    }

    /// Unaligned 128-bit load from a 16-byte array.
    #[inline(always)]
    unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
        _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
    }

    /// Lane-wise wrapping 64-bit addition.
    #[inline(always)]
    unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
        _mm256_add_epi64(a, b)
    }

    /// Bitwise XOR of two 256-bit vectors.
    #[inline(always)]
    unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
        _mm256_xor_si256(a, b)
    }

    /// Build a 256-bit vector from four `u64`s, with `a` in the lowest lane
    /// (`_mm256_setr_epi64x` takes lanes in ascending order).
    #[inline(always)]
    unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
        _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
    }
550
    // These rotations are the "simple version". For the "complicated version", see
    // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2b-common.h#L43-L46.
    // For a discussion of the tradeoffs, see
    // https://github.com/sneves/blake2-avx2/pull/5. In short:
    // - Due to an LLVM bug (https://bugs.llvm.org/show_bug.cgi?id=44379), this
    //   version performs better on recent x86 chips.
    // - LLVM is able to optimize this version to AVX-512 rotation instructions
    //   when those are enabled.
    /// Rotate each 64-bit lane right by 32 bits.
    #[inline(always)]
    unsafe fn rot32(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 32), _mm256_slli_epi64(x, 64 - 32))
    }

    /// Rotate each 64-bit lane right by 24 bits.
    #[inline(always)]
    unsafe fn rot24(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 24), _mm256_slli_epi64(x, 64 - 24))
    }

    /// Rotate each 64-bit lane right by 16 bits.
    #[inline(always)]
    unsafe fn rot16(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 16), _mm256_slli_epi64(x, 64 - 16))
    }

    /// Rotate each 64-bit lane right by 63 bits.
    #[inline(always)]
    unsafe fn rot63(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 63), _mm256_slli_epi64(x, 64 - 63))
    }
578
    /// Vectorized first half of the BLAKE2 G function (the rotate-32 /
    /// rotate-24 step of the scalar `g`), applied to all four columns at once.
    #[inline(always)]
    unsafe fn g1(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot32(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot24(*b);
    }

    /// Vectorized second half of the BLAKE2 G function (the rotate-16 /
    /// rotate-63 step of the scalar `g`).
    #[inline(always)]
    unsafe fn g2(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot16(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot63(*b);
    }
612
    // Note the optimization here of leaving b as the unrotated row, rather than a.
    // All the message loads below are adjusted to compensate for this. See
    // discussion at https://github.com/sneves/blake2-avx2/pull/4
    /// Permute lanes of rows `a`, `c` and `d` so the diagonal step can reuse
    /// the column-oriented `g1`/`g2` functions.
    #[inline(always)]
    unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
    }

    /// Inverse of [`diagonalize`]: restore the original row layout.
    #[inline(always)]
    unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
    }
629}
630
#[cfg(test)]
mod tests {
    use super::*;
    use primitives::hex;
    use std::time::Instant;

    /// Smoke/performance test: runs the precompile 3000 times over three
    /// EIP-152-style inputs (1024, 512 and 64 rounds respectively, from the
    /// leading big-endian rounds field) and prints the total elapsed time.
    #[test]
    fn perfblake2() {
        let input = [hex!("0000040048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616162636465666768696a6b6c6d6e6f700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000020048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000004048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")];

        let time = Instant::now();
        // Cycle through the three fixtures; u64::MAX gas so OOG never triggers.
        for i in 0..3000 {
            let _ = run(&input[i % 3], u64::MAX).unwrap();
        }
        println!("duration: {:?}", time.elapsed());
    }
}