// revm_precompile/blake2.rs

1//! Blake2 precompile. More details in [`run`]
2
3use crate::{PrecompileError, PrecompileOutput, PrecompileResult, PrecompileWithAddress};
4
/// Gas charged per compression-function round (EIP-152: gas = rounds * 1).
const F_ROUND: u64 = 1;
/// Exact required input length: 4 (rounds) + 64 (h) + 128 (m) + 8 (t_0) + 8 (t_1) + 1 (f) = 213.
const INPUT_LENGTH: usize = 213;

/// Blake2 precompile
pub const FUN: PrecompileWithAddress = PrecompileWithAddress(crate::u64_to_address(9), run);
10
11/// reference: <https://eips.ethereum.org/EIPS/eip-152>
12/// input format:
13/// [4 bytes for rounds][64 bytes for h][128 bytes for m][8 bytes for t_0][8 bytes for t_1][1 byte for f]
14pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
15    if input.len() != INPUT_LENGTH {
16        return Err(PrecompileError::Blake2WrongLength);
17    }
18
19    // Rounds 4 bytes
20    let rounds = u32::from_be_bytes(input[..4].try_into().unwrap()) as usize;
21    let gas_used = rounds as u64 * F_ROUND;
22    if gas_used > gas_limit {
23        return Err(PrecompileError::OutOfGas);
24    }
25
26    let f = match input[212] {
27        1 => true,
28        0 => false,
29        _ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
30    };
31
32    let mut h = [0u64; 8];
33    //let mut m = [0u64; 16];
34
35    let t;
36    // Optimized parsing using ptr::read_unaligned for potentially better performance
37
38    let m;
39    unsafe {
40        let ptr = input.as_ptr();
41
42        // Read h values
43        for (i, item) in h.iter_mut().enumerate() {
44            *item = u64::from_le_bytes(core::ptr::read_unaligned(
45                ptr.add(4 + i * 8) as *const [u8; 8]
46            ));
47        }
48
49        m = input[68..68 + 16 * size_of::<u64>()].try_into().unwrap();
50
51        t = [
52            u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(196) as *const [u8; 8])),
53            u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(204) as *const [u8; 8])),
54        ];
55    }
56    algo::compress(rounds, &mut h, m, t, f);
57
58    let mut out = [0u8; 64];
59    for (i, h) in (0..64).step_by(8).zip(h.iter()) {
60        out[i..i + 8].copy_from_slice(&h.to_le_bytes());
61    }
62
63    Ok(PrecompileOutput::new(gas_used, out.into()))
64}
65
66/// Blake2 algorithm
/// Blake2 algorithm
pub mod algo {
    /// SIGMA from spec: <https://datatracker.ietf.org/doc/html/rfc7693#section-2.7>
    ///
    /// Message-word selection permutations; round `r` uses `SIGMA[r % 10]`.
    pub const SIGMA: [[usize; 16]; 10] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
        [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
        [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
        [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
        [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
        [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
        [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
        [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
        [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
    ];

    /// got IV from: <https://en.wikipedia.org/wiki/BLAKE_(hash_function)>
    pub const IV: [u64; 8] = [
        0x6a09e667f3bcc908,
        0xbb67ae8584caa73b,
        0x3c6ef372fe94f82b,
        0xa54ff53a5f1d36f1,
        0x510e527fade682d1,
        0x9b05688c2b3e6c1f,
        0x1f83d9abfb41bd6b,
        0x5be0cd19137e2179,
    ];

    #[inline(always)]
    #[allow(clippy::many_single_char_names)]
    /// G function: <https://tools.ietf.org/html/rfc7693#section-3.1>
    ///
    /// Mixes the two message words `x` and `y` into the four state words of
    /// `v` selected by indices `a`, `b`, `c`, `d`.
    pub fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
        v[a] = v[a].wrapping_add(v[b]);
        v[a] = v[a].wrapping_add(x);
        v[d] ^= v[a];
        v[d] = v[d].rotate_right(32);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] ^= v[c];
        v[b] = v[b].rotate_right(24);

        v[a] = v[a].wrapping_add(v[b]);
        v[a] = v[a].wrapping_add(y);
        v[d] ^= v[a];
        v[d] = v[d].rotate_right(16);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] ^= v[c];
        v[b] = v[b].rotate_right(63);
    }

    /// Compression function F takes as an argument the state vector "h",
    /// message block vector "m" (last block is padded with zeros to full
    /// block size, if required), 2w-bit offset counter "t", and final block
    /// indicator flag "f".  Local vector v[0..15] is used in processing.  F
    /// returns a new state vector.  The number of rounds, "r", is 12 for
    /// BLAKE2b and 10 for BLAKE2s.  Rounds are numbered from 0 to r - 1.
    /// (For EIP-152 the round count is caller-supplied and may be any value,
    /// including 0.)
    #[allow(clippy::many_single_char_names)]
    pub fn compress(
        rounds: usize,
        h: &mut [u64; 8],
        m_slice: &[u8; 16 * core::mem::size_of::<u64>()],
        t: [u64; 2],
        f: bool,
    ) {
        #[cfg(all(target_feature = "avx2", feature = "std"))]
        {
            // only if it is compiled with avx2 flag and it is std, we can use avx2.
            if std::is_x86_feature_detected!("avx2") {
                // avx2 is 1.8x more performant than portable implementation.
                unsafe {
                    super::avx2::compress_block(
                        rounds,
                        m_slice,
                        h,
                        ((t[1] as u128) << 64) | (t[0] as u128),
                        if f { !0 } else { 0 },
                        0,
                    );
                }
                return;
            }
        }

        // if avx2 is not available, use the fallback portable implementation

        // Split the 128-byte block into 16 little-endian u64 message words.
        // `chunks_exact(8)` yields exactly 16 chunks here, so every
        // `try_into` is infallible and no unsafe pointer reads are needed.
        let mut m = [0u64; 16];
        for (word, chunk) in m.iter_mut().zip(m_slice.chunks_exact(8)) {
            *word = u64::from_le_bytes(chunk.try_into().unwrap());
        }

        let mut v = [0u64; 16];
        v[..h.len()].copy_from_slice(h); // First half from state.
        v[h.len()..].copy_from_slice(&IV); // Second half from IV.

        // Fold the block offset counter into the work vector.
        v[12] ^= t[0];
        v[13] ^= t[1];

        if f {
            v[14] = !v[14] // Invert all bits if the last-block-flag is set.
        }
        for i in 0..rounds {
            round(&mut v, &m, i);
        }

        // Finalization: xor both halves of the work vector into the state.
        for i in 0..8 {
            h[i] ^= v[i] ^ v[i + 8];
        }
    }

    /// One round of F: applies G to the four columns, then the four
    /// diagonals, selecting message words via `SIGMA[r % 10]`.
    #[inline(always)]
    fn round(v: &mut [u64; 16], m: &[u64; 16], r: usize) {
        // Message word selection permutation for this round.
        let s = &SIGMA[r % 10];
        // g1
        g(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
        g(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
        g(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
        g(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);

        // g2
        g(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
        g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
        g(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
        g(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
    }
}
196
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
// Packs four 2-bit lane selectors (z, y, x, w) into the 8-bit immediate
// expected by the `_mm256_shuffle_epi32` / `_mm256_permute4x64_epi64`
// intrinsics used in the `avx2` module below.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
203
/// Code adapted from https://github.com/oconnor663/blake2_simd/blob/82b3e2aee4d2384aabbeb146058301ff0dbd453f/blake2b/src/avx2.rs
#[cfg(all(target_feature = "avx2", feature = "std"))]
mod avx2 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    use super::algo::IV;
    use arrayref::{array_refs, mut_array_refs};

    // 64-bit word type of BLAKE2b.
    type Word = u64;
    // 128-bit block offset counter (low half is t_0, high half is t_1).
    type Count = u128;
    /// The number input bytes passed to each call to the compression function. Small benchmarks need
    /// to use an even multiple of `BLOCKBYTES`, or else their apparent throughput will be low.
    const BLOCKBYTES: usize = 16 * size_of::<Word>();

    // Number of 64-bit lanes in one 256-bit vector.
    const DEGREE: usize = 4;

    /// Compress a block of data using the BLAKE2 algorithm.
    ///
    /// Row layout: `a` holds h[0..4], `b` holds h[4..8], `c` holds IV[0..4],
    /// and `d` holds IV[4..8] xored with the counter and finalization flags.
    ///
    /// # Safety
    ///
    /// The caller must ensure the running CPU supports AVX2 before calling;
    /// executing the AVX2 intrinsics below without it is undefined behavior.
    #[inline(always)]
    pub(crate) unsafe fn compress_block(
        mut rounds: usize,
        block: &[u8; BLOCKBYTES],
        words: &mut [Word; 8],
        count: Count,
        last_block: Word,
        last_node: Word,
    ) {
        let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
        let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
        let mut a = loadu(words_low);
        let mut b = loadu(words_high);
        let mut c = loadu(iv_low);
        // Lane 0/1: counter halves; lane 2: last-block flag; lane 3: last-node flag.
        let flags = set4(count_low(count), count_high(count), last_block, last_node);
        let mut d = xor(loadu(iv_high), flags);

        // Broadcast each 16-byte message chunk to both 128-bit halves so the
        // blend/unpack word-selection below can pick from either half.
        let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
        let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
        let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
        let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
        let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
        let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
        let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
        let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
        let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

        let iv0 = a;
        let iv1 = b;
        let mut t0;
        let mut t1;
        let mut b0;

        // The SIGMA schedule repeats with period 10, so the 10 hard-coded
        // sub-rounds below sit in an outer loop; `rounds` is decremented
        // before each sub-round and the loop exits the moment it reaches
        // zero, supporting any EIP-152 round count (including 0).
        loop {
            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 1
            t0 = _mm256_unpacklo_epi64(m0, m1);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m0, m1);
            t1 = _mm256_unpackhi_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m7, m4);
            t1 = _mm256_unpacklo_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_unpackhi_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 2
            t0 = _mm256_unpacklo_epi64(m7, m2);
            t1 = _mm256_unpackhi_epi64(m4, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_alignr_epi8(m3, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m2, m0);
            t1 = _mm256_blend_epi32(m5, m0, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m6, m1, 8);
            t1 = _mm256_blend_epi32(m3, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 3
            t0 = _mm256_alignr_epi8(m6, m5, 8);
            t1 = _mm256_unpackhi_epi64(m2, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m6, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m5, m4, 8);
            t1 = _mm256_unpackhi_epi64(m1, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m2, m7);
            t1 = _mm256_blend_epi32(m0, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 4
            t0 = _mm256_unpackhi_epi64(m3, m1);
            t1 = _mm256_unpackhi_epi64(m6, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m0);
            t1 = _mm256_unpacklo_epi64(m6, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m1, m7, 8);
            t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m3);
            t1 = _mm256_unpacklo_epi64(m5, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 5
            t0 = _mm256_unpackhi_epi64(m4, m2);
            t1 = _mm256_unpacklo_epi64(m1, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m3, m0, 0x33);
            t1 = _mm256_blend_epi32(m7, m2, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m7, m1, 8);
            t1 = _mm256_alignr_epi8(m3, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m6, m0);
            t1 = _mm256_unpacklo_epi64(m6, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 6
            t0 = _mm256_unpacklo_epi64(m1, m3);
            t1 = _mm256_unpacklo_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m6, m5);
            t1 = _mm256_unpackhi_epi64(m5, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m2, m0, 8);
            t1 = _mm256_unpackhi_epi64(m3, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m6);
            t1 = _mm256_alignr_epi8(m7, m2, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 7
            t0 = _mm256_blend_epi32(m0, m6, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m2);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m2, m7);
            t1 = _mm256_alignr_epi8(m5, m6, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m4, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m5, m3);
            t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;
            // round 8
            t0 = _mm256_unpackhi_epi64(m6, m3);
            t1 = _mm256_blend_epi32(m1, m6, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m7, m5, 8);
            t1 = _mm256_unpackhi_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_blend_epi32(m2, m1, 0x33);
            t1 = _mm256_alignr_epi8(m4, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m0);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 9
            t0 = _mm256_unpacklo_epi64(m3, m7);
            t1 = _mm256_alignr_epi8(m0, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_alignr_epi8(m4, m1, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m5, m6);
            t1 = _mm256_unpackhi_epi64(m6, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m1, m2, 8);
            t1 = _mm256_alignr_epi8(m2, m3, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // round 10
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_unpackhi_epi64(m3, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m1, m2);
            t1 = _mm256_blend_epi32(m2, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m6, m7);
            t1 = _mm256_unpackhi_epi64(m4, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m5, m0, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            // last two rounds are removed
        }
        // Finalization: xor the two halves of the work vector into the state.
        a = xor(a, c);
        b = xor(b, d);
        a = xor(a, iv0);
        b = xor(b, iv1);

        storeu(a, words_low);
        storeu(b, words_high);
    }

    /// Low 64 bits of the offset counter (t_0).
    #[inline(always)]
    pub(crate) fn count_low(count: Count) -> Word {
        count as Word
    }

    /// High 64 bits of the offset counter (t_1).
    /// `*` binds tighter than `>>`, so this shifts by 8 * 8 = 64 bits.
    #[inline(always)]
    pub(crate) fn count_high(count: Count) -> Word {
        (count >> 8 * size_of::<Word>()) as Word
    }

    #[inline(always)]
    unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
        // This is an unaligned load, so the pointer cast is allowed.
        _mm256_loadu_si256(src as *const __m256i)
    }

    #[inline(always)]
    unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
        // This is an unaligned store, so the pointer cast is allowed.
        _mm256_storeu_si256(dest as *mut __m256i, src)
    }

    // Unaligned 128-bit load of one 16-byte message chunk.
    #[inline(always)]
    unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
        _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
    }

    // Lane-wise wrapping 64-bit addition.
    #[inline(always)]
    unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
        _mm256_add_epi64(a, b)
    }

    #[inline(always)]
    unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
        _mm256_xor_si256(a, b)
    }

    // Builds a vector from four scalars, lowest lane first.
    #[inline(always)]
    unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
        _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
    }

    // These rotations are the "simple version". For the "complicated version", see
    // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2b-common.h#L43-L46.
    // For a discussion of the tradeoffs, see
    // https://github.com/sneves/blake2-avx2/pull/5. In short:
    // - Due to an LLVM bug (https://bugs.llvm.org/show_bug.cgi?id=44379), this
    //   version performs better on recent x86 chips.
    // - LLVM is able to optimize this version to AVX-512 rotation instructions
    //   when those are enabled.
    #[inline(always)]
    unsafe fn rot32(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 32), _mm256_slli_epi64(x, 64 - 32))
    }

    #[inline(always)]
    unsafe fn rot24(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 24), _mm256_slli_epi64(x, 64 - 24))
    }

    #[inline(always)]
    unsafe fn rot16(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 16), _mm256_slli_epi64(x, 64 - 16))
    }

    #[inline(always)]
    unsafe fn rot63(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 63), _mm256_slli_epi64(x, 63))
    }

    // First half of the G function, applied to all four columns at once
    // (rotations by 32 then 24).
    #[inline(always)]
    unsafe fn g1(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot32(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot24(*b);
    }

    // Second half of the G function (rotations by 16 then 63).
    #[inline(always)]
    unsafe fn g2(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot16(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot63(*b);
    }

    // Note the optimization here of leaving b as the unrotated row, rather than a.
    // All the message loads below are adjusted to compensate for this. See
    // discussion at https://github.com/sneves/blake2-avx2/pull/4
    #[inline(always)]
    unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
    }

    #[inline(always)]
    unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
    }
}
639
#[cfg(test)]
mod tests {
    use super::*;
    use primitives::hex;
    use std::time::Instant;

    /// EIP-152 test vector 5: 12 rounds over the padded "abc" block with
    /// t_0 = 3 and the final-block flag set must reproduce the canonical
    /// BLAKE2b-512("abc") state via the portable/AVX2 `compress` dispatch.
    #[test]
    fn compress_eip152_vector_5() {
        let mut h = [
            0x6a09e667f2bdc948u64, // IV[0] ^ 0x01010040 (BLAKE2b-512 param word)
            0xbb67ae8584caa73b,
            0x3c6ef372fe94f82b,
            0xa54ff53a5f1d36f1,
            0x510e527fade682d1,
            0x9b05688c2b3e6c1f,
            0x1f83d9abfb41bd6b,
            0x5be0cd19137e2179,
        ];
        let mut m = [0u8; 128];
        m[..3].copy_from_slice(b"abc");
        algo::compress(12, &mut h, &m, [3, 0], true);
        assert_eq!(
            h,
            [
                0x0d4d1c983fa580ba,
                0xe9f6129fb697276a,
                0xb7c45a68142f214c,
                0xd1a2ffdb6fbb124b,
                0x2d79ab2a39c5877d,
                0x95cc3345ded552c2,
                0x5a92f1dba88ad318,
                0x239900d4ed8623b9,
            ]
        );
    }

    /// Inputs that are not exactly 213 bytes must be rejected.
    #[test]
    fn wrong_length_rejected() {
        assert!(run(&[0u8; 212], u64::MAX).is_err());
        assert!(run(&[0u8; 214], u64::MAX).is_err());
    }

    /// A final-indicator byte other than 0 or 1 must be rejected.
    #[test]
    fn bad_final_flag_rejected() {
        let mut input = [0u8; 213];
        input[212] = 2;
        assert!(run(&input, u64::MAX).is_err());
    }

    /// Throughput smoke test over three representative inputs; only checks
    /// that every call succeeds and reports the elapsed time.
    #[test]
    fn perfblake2() {
        let input = [hex!("0000040048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616162636465666768696a6b6c6d6e6f700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000020048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000004048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")];

        let time = Instant::now();
        for i in 0..3000 {
            let _ = run(&input[i % 3], u64::MAX).unwrap();
        }
        println!("duration: {:?}", time.elapsed());
    }
}