use crate::{PrecompileError, PrecompileOutput, PrecompileResult, PrecompileWithAddress};

/// Gas charged per round of the compression function (EIP-152: `GFROUND = 1`).
const F_ROUND: u64 = 1;
/// Required length of the precompile input, in bytes.
const INPUT_LENGTH: usize = 213;

/// The BLAKE2b `F` compression function precompile, at address 0x09.
pub const FUN: PrecompileWithAddress = PrecompileWithAddress(crate::u64_to_address(9), run);

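/// Implements the BLAKE2b `F` compression function precompile from
/// [EIP-152](https://eips.ethereum.org/EIPS/eip-152).
///
/// The 213-byte input is laid out as:
/// - bytes 0..4:     `rounds`, big-endian `u32`
/// - bytes 4..68:    `h`, the 8-word state, as little-endian `u64`s
/// - bytes 68..196:  `m`, the 16-word message block, as little-endian `u64`s
/// - bytes 196..212: `t`, the 2-word offset counter, as little-endian `u64`s
/// - byte 212:       `f`, the final-block flag, which must be 0 or 1
///
/// The gas charged is `rounds * F_ROUND`.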
pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
    if input.len() != INPUT_LENGTH {
        return Err(PrecompileError::Blake2WrongLength);
    }

    // The number of rounds is encoded as a big-endian 32-bit integer.
    let rounds = u32::from_be_bytes(input[..4].try_into().unwrap()) as usize;
    let gas_used = rounds as u64 * F_ROUND;
    if gas_used > gas_limit {
        return Err(PrecompileError::OutOfGas);
    }

    // The final-block indicator must be exactly 0 or 1.
    let f = match input[212] {
        1 => true,
        0 => false,
        _ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
    };

    let mut h = [0u64; 8];
    let t;
    let m;
    unsafe {
        let ptr = input.as_ptr();

        // State vector `h`: 8 little-endian words starting at byte 4.
        for (i, item) in h.iter_mut().enumerate() {
            *item = u64::from_le_bytes(core::ptr::read_unaligned(
                ptr.add(4 + i * 8) as *const [u8; 8]
            ));
        }

        // Message block `m`: 128 bytes starting at byte 68.
        m = input[68..68 + 16 * size_of::<u64>()].try_into().unwrap();

        // Offset counter `t`: 2 little-endian words starting at byte 196.
        t = [
            u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(196) as *const [u8; 8])),
            u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(204) as *const [u8; 8])),
        ];
    }
    algo::compress(rounds, &mut h, m, t, f);

    let mut out = [0u8; 64];
    for (i, h) in (0..64).step_by(8).zip(h.iter()) {
        out[i..i + 8].copy_from_slice(&h.to_le_bytes());
    }

    Ok(PrecompileOutput::new(gas_used, out.into()))
}

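/// Portable implementation of the BLAKE2b compression function `F`, plus the
/// constants shared with the AVX2 path.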
pub mod algo {
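    /// Message word schedule: round `i` reads the 16 message words in the
    /// order given by `SIGMA[i % 10]`.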
    pub const SIGMA: [[usize; 16]; 10] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
        [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
        [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
        [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
        [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
        [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
        [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
        [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
        [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
    ];

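    /// BLAKE2b initialization vector (the same constants as the SHA-512 IV).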
    pub const IV: [u64; 8] = [
        0x6a09e667f3bcc908,
        0xbb67ae8584caa73b,
        0x3c6ef372fe94f82b,
        0xa54ff53a5f1d36f1,
        0x510e527fade682d1,
        0x9b05688c2b3e6c1f,
        0x1f83d9abfb41bd6b,
        0x5be0cd19137e2179,
    ];

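    /// The BLAKE2b mixing function `G`: mixes message words `x` and `y` into
    /// the work-vector entries at indices `a`, `b`, `c` and `d`.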
    #[inline(always)]
    #[allow(clippy::many_single_char_names)]
    pub fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
        v[a] = v[a].wrapping_add(v[b]);
        v[a] = v[a].wrapping_add(x);
        v[d] ^= v[a];
        v[d] = v[d].rotate_right(32);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] ^= v[c];
        v[b] = v[b].rotate_right(24);

        v[a] = v[a].wrapping_add(v[b]);
        v[a] = v[a].wrapping_add(y);
        v[d] ^= v[a];
        v[d] = v[d].rotate_right(16);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] ^= v[c];
        v[b] = v[b].rotate_right(63);
    }

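    /// The BLAKE2b compression function `F`: folds the 128-byte message block
    /// `m_slice` and the offset counter `t` into the state `h` over `rounds`
    /// rounds, with `f` marking the final block. Uses the AVX2 path when it is
    /// compiled in and detected at runtime, otherwise the scalar code below.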
    #[allow(clippy::many_single_char_names)]
    pub fn compress(
        rounds: usize,
        h: &mut [u64; 8],
        m_slice: &[u8; 16 * size_of::<u64>()],
        t: [u64; 2],
        f: bool,
    ) {
        assert!(m_slice.len() == 16 * size_of::<u64>());

        #[cfg(all(target_feature = "avx2", feature = "std"))]
        {
            if std::is_x86_feature_detected!("avx2") {
                unsafe {
                    super::avx2::compress_block(
                        rounds,
                        m_slice,
                        h,
                        ((t[1] as u128) << 64) | (t[0] as u128),
                        if f { !0 } else { 0 },
                        0,
                    );
                }
                return;
            }
        }

        // Scalar fallback: decode the message block into 16 little-endian words.
        let mut m = [0u64; 16];
        for (i, item) in m.iter_mut().enumerate() {
            *item = u64::from_le_bytes(unsafe {
                core::ptr::read_unaligned(m_slice.as_ptr().add(i * 8) as *const [u8; 8])
            });
        }

        // Initialize the 16-word work vector from the state and the IV, then
        // fold in the offset counter and the final-block flag.
        let mut v = [0u64; 16];
        v[..h.len()].copy_from_slice(h);
        v[h.len()..].copy_from_slice(&IV);
        v[12] ^= t[0];
        v[13] ^= t[1];

        if f {
            v[14] = !v[14];
        }

        for i in 0..rounds {
            round(&mut v, &m, i);
        }

        for i in 0..8 {
            h[i] ^= v[i] ^ v[i + 8];
        }
    }

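    /// One round of BLAKE2b: four column mixes followed by four diagonal
    /// mixes, with the message words selected by the round's permutation.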
    #[inline(always)]
    fn round(v: &mut [u64; 16], m: &[u64; 16], r: usize) {
        let s = &SIGMA[r % 10];
        g(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
        g(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
        g(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
        g(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);

        g(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
        g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
        g(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
        g(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
    }
}

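/// Builds the 8-bit shuffle-control immediate used by the x86 shuffle and
/// permute intrinsics, mirroring the C `_MM_SHUFFLE` macro.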
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}

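/// AVX2 implementation of the compression function, compiled only when both
/// the `avx2` target feature and the `std` feature are enabled.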
#[cfg(all(target_feature = "avx2", feature = "std"))]
mod avx2 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    use super::algo::IV;
    use arrayref::{array_refs, mut_array_refs};

    type Word = u64;
    type Count = u128;
    const BLOCKBYTES: usize = 16 * size_of::<Word>();

    const DEGREE: usize = 4;

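    /// Runs `rounds` rounds of the compression function over a single message
    /// block, holding the 4x4 work matrix in four 256-bit row vectors. `count`
    /// is the 128-bit offset counter; `last_block` and `last_node` are the
    /// finalization words (all ones when set, zero otherwise).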
    #[inline(always)]
    pub(crate) unsafe fn compress_block(
        mut rounds: usize,
        block: &[u8; BLOCKBYTES],
        words: &mut [Word; 8],
        count: Count,
        last_block: Word,
        last_node: Word,
    ) {
        let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
        let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
        let mut a = loadu(words_low);
        let mut b = loadu(words_high);
        let mut c = loadu(iv_low);
        let flags = set4(count_low(count), count_high(count), last_block, last_node);
        let mut d = xor(loadu(iv_high), flags);

        let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
        let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
        let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
        let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
        let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
        let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
        let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
        let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
        let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

        let iv0 = a;
        let iv1 = b;
        let mut t0;
        let mut t1;
        let mut b0;

        // Each block below applies one full round (column step, then diagonal
        // step) with the message words pre-arranged for that round's SIGMA
        // permutation; after ten rounds the schedule repeats from the top.
        loop {
            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpacklo_epi64(m0, m1);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m0, m1);
            t1 = _mm256_unpackhi_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m7, m4);
            t1 = _mm256_unpacklo_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_unpackhi_epi64(m5, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpacklo_epi64(m7, m2);
            t1 = _mm256_unpackhi_epi64(m4, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_alignr_epi8(m3, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m2, m0);
            t1 = _mm256_blend_epi32(m5, m0, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m6, m1, 8);
            t1 = _mm256_blend_epi32(m3, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_alignr_epi8(m6, m5, 8);
            t1 = _mm256_unpackhi_epi64(m2, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m6, m1, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m5, m4, 8);
            t1 = _mm256_unpackhi_epi64(m1, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m2, m7);
            t1 = _mm256_blend_epi32(m0, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpackhi_epi64(m3, m1);
            t1 = _mm256_unpackhi_epi64(m6, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m0);
            t1 = _mm256_unpacklo_epi64(m6, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m1, m7, 8);
            t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m4, m3);
            t1 = _mm256_unpacklo_epi64(m5, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpackhi_epi64(m4, m2);
            t1 = _mm256_unpacklo_epi64(m1, m5);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m3, m0, 0x33);
            t1 = _mm256_blend_epi32(m7, m2, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m7, m1, 8);
            t1 = _mm256_alignr_epi8(m3, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m6, m0);
            t1 = _mm256_unpacklo_epi64(m6, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpacklo_epi64(m1, m3);
            t1 = _mm256_unpacklo_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m6, m5);
            t1 = _mm256_unpackhi_epi64(m5, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_alignr_epi8(m2, m0, 8);
            t1 = _mm256_unpackhi_epi64(m3, m7);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m4, m6);
            t1 = _mm256_alignr_epi8(m7, m2, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_blend_epi32(m0, m6, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m2);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m2, m7);
            t1 = _mm256_alignr_epi8(m5, m6, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m4, m0);
            t1 = _mm256_blend_epi32(m4, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m5, m3);
            t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpackhi_epi64(m6, m3);
            t1 = _mm256_blend_epi32(m1, m6, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m7, m5, 8);
            t1 = _mm256_unpackhi_epi64(m0, m4);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_blend_epi32(m2, m1, 0x33);
            t1 = _mm256_alignr_epi8(m4, m7, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m5, m0);
            t1 = _mm256_unpacklo_epi64(m2, m3);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpacklo_epi64(m3, m7);
            t1 = _mm256_alignr_epi8(m0, m5, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpackhi_epi64(m7, m4);
            t1 = _mm256_alignr_epi8(m4, m1, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpacklo_epi64(m5, m6);
            t1 = _mm256_unpackhi_epi64(m6, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_alignr_epi8(m1, m2, 8);
            t1 = _mm256_alignr_epi8(m2, m3, 8);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);

            if rounds == 0 {
                break;
            }
            rounds -= 1;

            t0 = _mm256_unpacklo_epi64(m5, m4);
            t1 = _mm256_unpackhi_epi64(m3, m0);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_unpacklo_epi64(m1, m2);
            t1 = _mm256_blend_epi32(m2, m3, 0x33);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            diagonalize(&mut a, &mut b, &mut c, &mut d);
            t0 = _mm256_unpackhi_epi64(m6, m7);
            t1 = _mm256_unpackhi_epi64(m4, m1);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
            t0 = _mm256_blend_epi32(m5, m0, 0x33);
            t1 = _mm256_unpacklo_epi64(m7, m6);
            b0 = _mm256_blend_epi32(t0, t1, 0xF0);
            g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
            undiagonalize(&mut a, &mut b, &mut c, &mut d);
        }
        // Feed-forward: h' = h ^ v[0..8] ^ v[8..16].
        a = xor(a, c);
        b = xor(b, d);
        a = xor(a, iv0);
        b = xor(b, iv1);

        storeu(a, words_low);
        storeu(b, words_high);
    }

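    // Split the 128-bit offset counter into its low and high 64-bit words.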
    #[inline(always)]
    pub(crate) fn count_low(count: Count) -> Word {
        count as Word
    }

    #[inline(always)]
    pub(crate) fn count_high(count: Count) -> Word {
        (count >> (8 * size_of::<Word>())) as Word
    }

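    // Thin wrappers over the AVX2 load/store and 64-bit lane arithmetic
    // intrinsics used by the round functions below.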
    #[inline(always)]
    unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
        _mm256_loadu_si256(src as *const __m256i)
    }

    #[inline(always)]
    unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
        _mm256_storeu_si256(dest as *mut __m256i, src)
    }

    #[inline(always)]
    unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
        _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
    }

    #[inline(always)]
    unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
        _mm256_add_epi64(a, b)
    }

    #[inline(always)]
    unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
        _mm256_xor_si256(a, b)
    }

    #[inline(always)]
    unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
        _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
    }

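    // Right-rotations of every 64-bit lane by the four BLAKE2b rotation
    // constants (32, 24, 16 and 63).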
    #[inline(always)]
    unsafe fn rot32(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 32), _mm256_slli_epi64(x, 64 - 32))
    }

    #[inline(always)]
    unsafe fn rot24(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 24), _mm256_slli_epi64(x, 64 - 24))
    }

    #[inline(always)]
    unsafe fn rot16(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 16), _mm256_slli_epi64(x, 64 - 16))
    }

    #[inline(always)]
    unsafe fn rot63(x: __m256i) -> __m256i {
        _mm256_or_si256(_mm256_srli_epi64(x, 63), _mm256_slli_epi64(x, 64 - 63))
    }

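    /// First half of the vectorized `G` function, applied to all four columns
    /// (or diagonals) at once: add, xor, then rotations by 32 and 24.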
    #[inline(always)]
    unsafe fn g1(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot32(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot24(*b);
    }

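    /// Second half of the vectorized `G` function: rotations by 16 and 63.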
    #[inline(always)]
    unsafe fn g2(
        a: &mut __m256i,
        b: &mut __m256i,
        c: &mut __m256i,
        d: &mut __m256i,
        m: &mut __m256i,
    ) {
        *a = add(*a, *m);
        *a = add(*a, *b);
        *d = xor(*d, *a);
        *d = rot16(*d);
        *c = add(*c, *d);
        *b = xor(*b, *c);
        *b = rot63(*b);
    }

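    /// Permutes the `a`, `c` and `d` rows so the diagonal step of a round can
    /// reuse the column-wise `g1`/`g2`; `undiagonalize` applies the inverse
    /// permutation.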
    #[inline(always)]
    unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
    }

    #[inline(always)]
    unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
        *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
        *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
        *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use primitives::hex;
    use std::time::Instant;

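    // Rough timing check: repeatedly runs the precompile over inputs using
    // 1024, 512 and 64 rounds.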
    #[test]
    fn perfblake2() {
        let input = [
            hex!("0000040048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616162636465666768696a6b6c6d6e6f700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"),
            hex!("0000020048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"),
            hex!("0000004048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"),
        ];

        let time = Instant::now();
        for i in 0..3000 {
            let _ = run(&input[i % 3], u64::MAX).unwrap();
        }
        println!("duration: {:?}", time.elapsed());
    }
}