# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and higher overall performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# AESNI code is woven into it. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
#		AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Bulldozer	5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is omitted from the table above; the gain there was not
#	estimated high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
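#
# To illustrate the stitching idea itself, here is a simplified sketch
# (with hypothetical helpers, not the actual code layout below): the
# SHA256 rounds are emitted as usual, and each round body additionally
# swallows one AES-NI instruction string from a side table, e.g.
#
##	for my $i (0..63) {			# 64 SHA256 rounds
##	    emit(sha256_round_body($i));	# plenty of independent ALU work
##	    emit(@aesni_cbc_block[$aesni_cbc_idx++] // '');
##	}
#
# The real body_00_15 below achieves the same effect by concatenating
# the instruction strings directly.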
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}
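# $avx thus ends up 0, 1 or 2: non-zero enables the XOP/AVX1 code
# paths below (the 'if ($avx)' guards), while 2 additionally enables
# the AVX2 path (the 'if ($avx>1)' guards).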
open OUT,"| \"$^X\" $xlate $flavour $output";
$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char iv[16],
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
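#
# The resulting stack frame, as a sketch (with $SZ==4, 16*$SZ is the
# 64 bytes of pre-added X[i]+K[i] working data):
#
#	0(%rsp)			X[0..15]+K[0..15] working data
#	16*$SZ+0*8(%rsp)	saved $inp
#	16*$SZ+1*8(%rsp)	saved $out
#	16*$SZ+2*8(%rsp)	end-of-input pointer
#	16*$SZ+3*8(%rsp)	saved $key
#	16*$SZ+4*8(%rsp)	saved $ivp
#	16*$SZ+5*8(%rsp)	saved $ctx
#	16*$SZ+6*8(%rsp)	saved $in0
#	16*$SZ+7*8(%rsp)	saved %rsp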
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
$code.=<<___ if ($avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%eax			# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d		# mask AVX+SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
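# The all-zero/all-ones rows just above are not SHA256 constants but
# byte masks: they are loaded as $mask14/$mask12/$mask10 at an offset
# scaled by the AES round count (the vmovdqa 0x00/0x10/0x20(,%r14,8)
# triplets below), presumably so that the vaesenclast outputs taken at
# the 10-, 12- and 14-round positions can be blended into one result
# and a single code path serves all three key lengths.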
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
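# For example, an unresolved call like &ror($a0,$Sigma1[0]) lands
# here and appends "\tror\t\$6,%r13d\n" to $code: the popped literal
# gets a '$' prefix and the operands come out in AT&T order.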
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
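#
# One pass through body_00_15 is the textbook SHA256 round
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1
#	h  = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Maj(a,b,c) computed as ((b^c)&(a^b))^b, which reuses the b^c
# value of the previous round; hence the $a2/$a3 swap in the last
# line.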
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++
	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lxop_00_47");
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };
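# With this alias in effect &ror($a0,$Sigma1[0]) emits
# "shrd \$6,%r13d,%r13d": a double-precision shift with both operands
# equal is the same rotate, and is presumably a better fit for the
# AVX-capable CPUs this path targets.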
.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
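#
# The sequence above is the SHA256 message schedule, four words at a
# time:
#
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
#
# where sigma0(x) = ror(x,7)^ror(x,18)^(x>>3) and
# sigma1(x) = ror(x,17)^ror(x,19)^(x>>10). AVX1 has no vector rotate,
# so each rotation is synthesized from vpslld/vpsrld/vpsrlq and vpxor,
# and sigma1 of the just-computed words is done in two 2-lane passes,
# hence the vpshufd shuffles.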
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++
	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";			# zap $inp
# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',			# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',			# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',			# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',		# d+=h
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',			# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.		# h+=Maj(a,b,c)
	'&mov	($a4,$e)',			# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to execute $a+=$a1
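#
# In other words, Sigma0(a) computed in round i is parked in $a1 and
# only folded in at the top of round i+1 (the '&lea ($a,"($a,$a1)")'
# above), keeping it off the round's critical path; the very last
# Sigma0 must therefore be flushed by hand.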
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13			# inp++, size optimization
	xor	%r12,%r12			# borrow $a0
	cmp	$len,%r13			# $_end
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	shl	\$`log(16*$SZ)/log(2)`,%r12
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	add	%rsi,%r12			# next or same input block
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12,%r13),@X[0],@X[0]
	vinserti128	\$1,16(%r12,%r13),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12,%r13),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12,%r13),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
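# Note the trick above: $offload doubles as a pointer spill slot, with
# the input pointer in qword 0 and the output pointer in qword 1
# (vpinsrq to store, vmovq/vpextrq to retrieve), freeing two
# general-purpose registers for the rounds.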
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&vmovq		("%r13",$offload);	# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand		($temp,$temp,$mask14);
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea		("%r13","16(%r13)");	# inp++
	&lea		($Tbl,16*2*$SZ."($Tbl)");
	&cmpb		(($SZ-1)."($Tbl)",0);
	&jne		(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$code.=<<___ if ($avx);
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax		# pull context->Rax
	mov	248($context),%rbx		# pull context->Rip
	mov	8($disp),%rsi			# disp->ImageBase
	mov	56($disp),%r11			# disp->HandlerData
	mov	0(%r11),%r10d			# HandlerData[0]
	lea	(%rsi,%r10),%r10		# prologue label
	cmp	%r10,%rbx			# context->Rip<prologue label
	mov	152($context),%rax		# pull context->Rsp
	mov	4(%r11),%r10d			# HandlerData[1]
	lea	(%rsi,%r10),%r10		# epilogue label
	cmp	%r10,%rbx			# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx			# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi			# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax		# pull $_rsp
	mov	%rbx,144($context)		# restore context->Rbx
	mov	%rbp,160($context)		# restore context->Rbp
	mov	%r12,216($context)		# restore context->R12
	mov	%r13,224($context)		# restore context->R13
	mov	%r14,232($context)		# restore context->R14
	mov	%r15,240($context)		# restore context->R15
	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue			# non-AVX code
	lea	16*$SZ+8*8(%rsi),%rsi		# Xmm6-Xmm15 save area
	lea	512($context),%rdi		# &context.Xmm6
	.long	0xa548f3fc			# cld; rep movsq
	mov	%rax,152($context)		# restore context->Rsp
	mov	%rsi,168($context)		# restore context->Rsi
	mov	%rdi,176($context)		# restore context->Rdi
	mov	40($disp),%rdi			# disp->ContextRecord
	mov	$context,%rsi			# context
	mov	\$154,%ecx			# sizeof(CONTEXT)
	.long	0xa548f3fc			# cld; rep movsq
	xor	%rcx,%rcx			# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx			# arg2, disp->ImageBase
	mov	0(%rsi),%r8			# arg3, disp->ControlPc
	mov	16(%rsi),%r9			# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10			# disp->ContextRecord
	lea	56(%rsi),%r11			# &disp->HandlerData
	lea	24(%rsi),%r12			# &disp->EstablisherFrame
	mov	%r10,32(%rsp)			# arg5
	mov	%r11,40(%rsp)			# arg6
	mov	%r12,48(%rsp)			# arg7
	mov	%rcx,56(%rsp)			# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax			# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($avx);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop	# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx	# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2	# HandlerData[]

$code =~ s/\`([^\`]*)\`/eval $1/gem;