1use crate::{
4 crypto, Precompile, PrecompileError, PrecompileId, PrecompileOutput, PrecompileResult,
5};
6
/// Gas charged per compression round (EIP-152 `GFROUND = 1`).
const F_ROUND: u64 = 1;
/// Exact required input length: 4 (rounds) + 64 (h) + 128 (m) + 16 (t) + 1 (f) = 213 bytes.
const INPUT_LENGTH: usize = 213;

/// The BLAKE2 `F` compression-function precompile, registered at address `0x09`.
pub const FUN: Precompile = Precompile::new(PrecompileId::Blake2F, crate::u64_to_address(9), run);
12
13pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
17 if input.len() != INPUT_LENGTH {
18 return Err(PrecompileError::Blake2WrongLength);
19 }
20
21 let rounds = u32::from_be_bytes(input[..4].try_into().unwrap());
23 let gas_used = rounds as u64 * F_ROUND;
24 if gas_used > gas_limit {
25 return Err(PrecompileError::OutOfGas);
26 }
27
28 let f = match input[212] {
30 0 => false,
31 1 => true,
32 _ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
33 };
34
35 let mut h = [0u64; 8];
37 input[4..68]
38 .chunks_exact(8)
39 .enumerate()
40 .for_each(|(i, chunk)| {
41 h[i] = u64::from_le_bytes(chunk.try_into().unwrap());
42 });
43
44 let mut m = [0u64; 16];
46 input[68..196]
47 .chunks_exact(8)
48 .enumerate()
49 .for_each(|(i, chunk)| {
50 m[i] = u64::from_le_bytes(chunk.try_into().unwrap());
51 });
52
53 let t_0 = u64::from_le_bytes(input[196..204].try_into().unwrap());
55 let t_1 = u64::from_le_bytes(input[204..212].try_into().unwrap());
56
57 crypto().blake2_compress(rounds, &mut h, m, [t_0, t_1], f);
58
59 let mut out = [0u8; 64];
60 for (i, h) in (0..64).step_by(8).zip(h.iter()) {
61 out[i..i + 8].copy_from_slice(&h.to_le_bytes());
62 }
63
64 Ok(PrecompileOutput::new(gas_used, out.into()))
65}
66
67pub mod algo {
69 pub const SIGMA: [[usize; 16]; 10] = [
71 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
72 [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
73 [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
74 [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
75 [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
76 [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
77 [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
78 [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
79 [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
80 [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
81 ];
82
83 pub const IV: [u64; 8] = [
85 0x6a09e667f3bcc908,
86 0xbb67ae8584caa73b,
87 0x3c6ef372fe94f82b,
88 0xa54ff53a5f1d36f1,
89 0x510e527fade682d1,
90 0x9b05688c2b3e6c1f,
91 0x1f83d9abfb41bd6b,
92 0x5be0cd19137e2179,
93 ];
94
95 #[inline(always)]
96 #[allow(clippy::many_single_char_names)]
97 fn g(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
99 let mut va = v[a];
100 let mut vb = v[b];
101 let mut vc = v[c];
102 let mut vd = v[d];
103
104 va = va.wrapping_add(vb).wrapping_add(x);
105 vd = (vd ^ va).rotate_right(32);
106 vc = vc.wrapping_add(vd);
107 vb = (vb ^ vc).rotate_right(24);
108
109 va = va.wrapping_add(vb).wrapping_add(y);
110 vd = (vd ^ va).rotate_right(16);
111 vc = vc.wrapping_add(vd);
112 vb = (vb ^ vc).rotate_right(63);
113
114 v[a] = va;
115 v[b] = vb;
116 v[c] = vc;
117 v[d] = vd;
118 }
119
120 #[allow(clippy::many_single_char_names)]
127 pub fn compress(rounds: usize, h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool) {
128 #[cfg(all(target_feature = "avx2", feature = "std"))]
129 {
130 if std::is_x86_feature_detected!("avx2") {
132 unsafe {
134 super::avx2::compress_block(
135 rounds,
136 &m,
137 h,
138 ((t[1] as u128) << 64) | (t[0] as u128),
139 if f { !0 } else { 0 },
140 0,
141 );
142 }
143 return;
144 }
145 }
146
147 let mut v = [0u64; 16];
150 v[..h.len()].copy_from_slice(h); v[h.len()..].copy_from_slice(&IV); v[12] ^= t[0];
154 v[13] ^= t[1];
155
156 if f {
157 v[14] = !v[14] }
159 for i in 0..rounds {
160 round(&mut v, &m, i);
161 }
162
163 for i in 0..8 {
164 h[i] ^= v[i] ^ v[i + 8];
165 }
166 }
167
168 #[inline(always)]
169 fn round(v: &mut [u64; 16], m: &[u64; 16], r: usize) {
170 let s = &SIGMA[r % 10];
172 g(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
174 g(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
175 g(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
176 g(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
177
178 g(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
180 g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
181 g(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
182 g(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
183 }
184}
185
// Builds the 8-bit immediate for SSE/AVX shuffle intrinsics from four 2-bit
// lane selectors, mirroring the C `_MM_SHUFFLE(z, y, x, w)` macro.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
192
// AVX2 implementation of the BLAKE2 round function, in the style of the
// blake2b_simd single-block kernel. Compiled only when the build target
// already enables AVX2 and the `std` feature is on; the caller
// (`algo::compress`) additionally performs a runtime feature check.
#[cfg(all(target_feature = "avx2", feature = "std"))]
#[allow(clippy::ptr_offset_with_cast)]
mod avx2 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    use super::algo::IV;
    use arrayref::{array_refs, mut_array_refs};

    // BLAKE2b operates on 64-bit words with a 128-bit byte counter.
    type Word = u64;
    type Count = u128;
    // One message block: sixteen 64-bit words (128 bytes).
    const BLOCKBYTES: usize = 16 * size_of::<Word>();

    // Four 64-bit lanes fit in one 256-bit AVX2 register.
    const DEGREE: usize = 4;

    /// Applies `rounds` BLAKE2 rounds over one message `block`, updating the
    /// 8-word state `words` in place. `count` is the byte-offset counter
    /// (t0 in the low 64 bits, t1 in the high); `last_block` / `last_node`
    /// are the finalization words (all-ones when set, zero otherwise).
    ///
    /// # Safety
    ///
    /// The caller must guarantee AVX2 is available on the running CPU.
    #[inline(always)]
    pub(crate) unsafe fn compress_block(
        mut rounds: usize,
        block: &[Word; 16],
        words: &mut [Word; 8],
        count: Count,
        last_block: Word,
        last_node: Word,
    ) {
        // State rows: a = h[0..4], b = h[4..8], c = IV[0..4],
        // d = IV[4..8] ^ (counter, finalization flags).
        let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
        let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
        let mut a = loadu(words_low);
        let mut b = loadu(words_high);
        let mut c = loadu(iv_low);
        let flags = set4(count_low(count), count_high(count), last_block, last_node);
        let mut d = xor(loadu(iv_high), flags);

        // Reinterpret the message as raw bytes and broadcast each 16-byte
        // pair of words into both 128-bit halves of a 256-bit register.
        // NOTE(review): relies on little-endian u64 layout — holds on
        // x86/x86_64, the only targets this module compiles for.
        let block: &[u8; BLOCKBYTES] = std::mem::transmute(block);
        let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
        let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
        let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
        let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
        let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
        let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
        let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
        let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
        let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

        // Despite the names, iv0/iv1 capture the *input state* for the
        // final feed-forward, not the IV.
        let iv0 = a;
        let iv1 = b;
        let mut t0;
        let mut t1;
        let mut b0;

        // Each loop iteration performs up to ten rounds, one segment per
        // SIGMA row, checking `rounds` before each segment so arbitrary
        // round counts (EIP-152) are supported. Counts beyond ten wrap
        // around, matching the scalar schedule's `r % 10`.
        loop {
            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 0.
            t0 = _mm256_unpacklo_epi64(m0, m1);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m0, m1);
            t1 = _mm256_unpackhi_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m7, m4);
            t1 = _mm256_unpacklo_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_unpackhi_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 1.
            t0 = _mm256_unpacklo_epi64(m7, m2);
            t1 = _mm256_unpackhi_epi64(m4, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_alignr_epi8(m3, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m2, m0);
            t1 = _mm256_blend_epi32(m5, m0, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m6, m1, 8);
            t1 = _mm256_blend_epi32(m3, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 2.
            t0 = _mm256_alignr_epi8(m6, m5, 8);
            t1 = _mm256_unpackhi_epi64(m2, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m6, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m5, m4, 8);
            t1 = _mm256_unpackhi_epi64(m1, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m2, m7);
            t1 = _mm256_blend_epi32(m0, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 3.
            t0 = _mm256_unpackhi_epi64(m3, m1);
            t1 = _mm256_unpackhi_epi64(m6, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m0);
            t1 = _mm256_unpacklo_epi64(m6, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m1, m7, 8);
            t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m3);
            t1 = _mm256_unpacklo_epi64(m5, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 4.
            t0 = _mm256_unpackhi_epi64(m4, m2);
            t1 = _mm256_unpacklo_epi64(m1, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m3, m0, 0x33);
            t1 = _mm256_blend_epi32(m7, m2, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m7, m1, 8);
            t1 = _mm256_alignr_epi8(m3, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m6, m0);
            t1 = _mm256_unpacklo_epi64(m6, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 5.
            t0 = _mm256_unpacklo_epi64(m1, m3);
            t1 = _mm256_unpacklo_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m6, m5);
            t1 = _mm256_unpackhi_epi64(m5, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m2, m0, 8);
            t1 = _mm256_unpackhi_epi64(m3, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m6);
            t1 = _mm256_alignr_epi8(m7, m2, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 6.
            t0 = _mm256_blend_epi32(m0, m6, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m2);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m2, m7);
            t1 = _mm256_alignr_epi8(m5, m6, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m4, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m5, m3);
            t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 7.
            t0 = _mm256_unpackhi_epi64(m6, m3);
            t1 = _mm256_blend_epi32(m1, m6, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m7, m5, 8);
            t1 = _mm256_unpackhi_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_blend_epi32(m2, m1, 0x33);
            t1 = _mm256_alignr_epi8(m4, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m0);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 8.
            t0 = _mm256_unpacklo_epi64(m3, m7);
            t1 = _mm256_alignr_epi8(m0, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_alignr_epi8(m4, m1, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m5, m6);
            t1 = _mm256_unpackhi_epi64(m6, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m1, m2, 8);
            t1 = _mm256_alignr_epi8(m2, m3, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            // SIGMA row 9; the loop then wraps back to row 0.
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_unpackhi_epi64(m3, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m1, m2);
            t1 = _mm256_blend_epi32(m2, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m6, m7);
            t1 = _mm256_unpackhi_epi64(m4, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m5, m0, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);
        }

        // Feed-forward: h' = h_in ^ v_low ^ v_high (iv0/iv1 hold the input
        // state captured before the rounds).
        a = xor(a, c);
        b = xor(b, d);
        a = xor(a, iv0);
        b = xor(b, iv1);

        storeu(a, words_low);
        storeu(b, words_high);
    }

    /// Low 64 bits of the byte counter.
    #[inline(always)]
    pub(crate) fn count_low(count: Count) -> Word {
        count as Word
    }

    /// High 64 bits of the byte counter.
    #[inline(always)]
    pub(crate) fn count_high(count: Count) -> Word {
        (count >> Word::BITS as usize) as Word
    }

    /// Unaligned 256-bit load of four state words.
    #[inline(always)]
    unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
        _mm256_loadu_si256(src as *const __m256i)
    }

    /// Unaligned 256-bit store of four state words.
    #[inline(always)]
    unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
        _mm256_storeu_si256(dest as *mut __m256i, src)
    }

    /// Unaligned 128-bit load of 16 message bytes.
    #[inline(always)]
    unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
        _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
    }

    /// Lane-wise 64-bit wrapping addition.
    #[inline(always)]
    unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
        _mm256_add_epi64(a, b)
    }

    /// Bitwise xor of two registers.
    #[inline(always)]
    unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
        _mm256_xor_si256(a, b)
    }

    /// Packs four words into one register, lowest lane first.
    #[inline(always)]
    unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
        _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
    }

    /// Lane-wise rotate right by 32 bits.
    #[inline(always)]
    unsafe fn rot32(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 32), _mm256_slli_epi64(x, 64 - 32))
    }

    /// Lane-wise rotate right by 24 bits.
    #[inline(always)]
    unsafe fn rot24(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 24), _mm256_slli_epi64(x, 64 - 24))
    }

    /// Lane-wise rotate right by 16 bits.
    #[inline(always)]
    unsafe fn rot16(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 16), _mm256_slli_epi64(x, 64 - 16))
    }

    /// Lane-wise rotate right by 63 bits.
    #[inline(always)]
    unsafe fn rot63(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 63), _mm256_slli_epi64(x, 64 - 63))
    }

    /// First half of the `G` function: add message word, rotate `d` by 32,
    /// then `b` by 24.
    #[inline(always)]
    unsafe fn g1(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot32(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot24(*b);
    }

    /// Second half of the `G` function: add message word, rotate `d` by 16,
    /// then `b` by 63.
    #[inline(always)]
    unsafe fn g2(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot16(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot63(*b);
    }

    /// Rotates rows so the next `g1`/`g2` pass mixes the diagonals. This
    /// variant keeps `b` fixed and rotates `a`, `c`, `d` instead — an
    /// equivalent formulation of the usual diagonalization.
    #[inline(always)]
    unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
    }

    /// Inverse of `diagonalize`.
    #[inline(always)]
    unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
    }
}
630
#[cfg(test)]
mod tests {
    use super::*;
    use primitives::hex;
    use std::time::Instant;

    // Throughput smoke test: runs the precompile 3000 times over three
    // inputs with different round counts (0x400, 0x200, 0x40) and prints
    // the elapsed time. It exercises the AVX2 path when available and only
    // asserts that `run` does not error — it is not a correctness test.
    #[test]
    fn perfblake2() {
        let input = [hex!("0000040048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616162636465666768696a6b6c6d6e6f700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000020048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")
        ,hex!("0000004048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001")];

        let time = Instant::now();
        for i in 0..3000 {
            let _ = run(&input[i % 3], u64::MAX).unwrap();
        }
        println!("duration: {:?}", time.elapsed());
    }
}