3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-byte instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In attempt to address deterioration sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
91 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
92 # suboptimal, but XTS is meant to be used with larger blocks...
98 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
103 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
104 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
105 die "can't locate x86_64-xlate.pl";
107 open STDOUT,"| $^X $xlate $flavour $output";
109 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
110 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
111 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
118 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
123 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
124 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
128 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
129 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
151 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
152 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
172 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
173 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
177 &InvInBasisChange (@b);
178 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
179 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182 sub InvInBasisChange { # OutBasisChange in reverse
183 my @b=@_[5,1,2,6,3,7,0,4];
201 sub InvOutBasisChange { # InBasisChange in reverse
202 my @b=@_[2,5,7,3,6,1,0,4];
223 #;*************************************************************
224 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
225 #;*************************************************************
226 my ($x0,$x1,$y0,$y1,$t0)=@_;
239 sub Mul_GF4_N { # not used, see next subroutine
240 # multiply and scale by N
241 my ($x0,$x1,$y0,$y1,$t0)=@_;
255 # interleaved Mul_GF4_N and Mul_GF4
256 my ($x0,$x1,$y0,$y1,$t0,
257 $x2,$x3,$y2,$y3,$t1)=@_;
285 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
292 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
293 @x[2], @x[3], @y[2], @y[3], @t[2]);
305 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
306 @x[6], @x[7], @y[2], @y[3], @t[2]);
311 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
320 #;********************************************************************
321 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
322 #;********************************************************************
326 # direct optimizations from hardware
381 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383 # new smaller inversion
417 # output in s3, s2, s1, t1
419 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
422 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427 # AES linear components
433 pxor 0x00($key),@x[0]
434 pxor 0x10($key),@x[1]
436 pxor 0x20($key),@x[2]
438 pxor 0x30($key),@x[3]
440 pxor 0x40($key),@x[4]
442 pxor 0x50($key),@x[5]
444 pxor 0x60($key),@x[6]
446 pxor 0x70($key),@x[7]
454 # modified to emit output in order suitable for feeding back to aesenc[last]
458 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
459 pshufd \$0x93, @x[1], @t[1]
460 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
461 pshufd \$0x93, @x[2], @t[2]
463 pshufd \$0x93, @x[3], @t[3]
465 pshufd \$0x93, @x[4], @t[4]
467 pshufd \$0x93, @x[5], @t[5]
469 pshufd \$0x93, @x[6], @t[6]
471 pshufd \$0x93, @x[7], @t[7]
478 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
480 pshufd \$0x4E, @x[1], @x[1]
486 pshufd \$0x4E, @x[4], @t[0]
488 pshufd \$0x4E, @x[5], @t[1]
490 pshufd \$0x4E, @x[3], @x[4]
492 pshufd \$0x4E, @x[7], @x[5]
494 pshufd \$0x4E, @x[6], @x[3]
496 pshufd \$0x4E, @x[2], @x[6]
513 # multiplication by 0x0e
514 pshufd \$0x93, @x[7], @t[7]
516 pxor @x[5], @x[7] # 7 5
517 pxor @x[5], @x[2] # 2 5
518 pshufd \$0x93, @x[0], @t[0]
520 pxor @x[0], @x[5] # 5 0 [1]
521 pxor @x[1], @x[0] # 0 1
522 pshufd \$0x93, @x[1], @t[1]
523 pxor @x[2], @x[1] # 1 25
524 pxor @x[6], @x[0] # 01 6 [2]
525 pxor @x[3], @x[1] # 125 3 [4]
526 pshufd \$0x93, @x[3], @t[3]
527 pxor @x[0], @x[2] # 25 016 [3]
528 pxor @x[7], @x[3] # 3 75
529 pxor @x[6], @x[7] # 75 6 [0]
530 pshufd \$0x93, @x[6], @t[6]
532 pxor @x[4], @x[6] # 6 4
533 pxor @x[3], @x[4] # 4 375 [6]
534 pxor @x[7], @x[3] # 375 756=36
535 pxor @t[5], @x[6] # 64 5 [7]
536 pxor @t[2], @x[3] # 36 2
537 pxor @t[4], @x[3] # 362 4 [5]
538 pshufd \$0x93, @t[5], @t[5]
540 my @y = @x[7,5,0,2,1,3,4,6];
542 # multiplication by 0x0b
546 pshufd \$0x93, @t[2], @t[2]
550 pshufd \$0x93, @t[4], @t[4]
551 pxor @t[6], @t[7] # clobber t[7]
555 pshufd \$0x93, @t[0], @t[0]
559 pshufd \$0x93, @t[1], @t[1]
563 pshufd \$0x93, @t[2], @t[2]
567 pshufd \$0x93, @t[3], @t[3]
573 pxor @t[5], @t[7] # clobber t[7] even more
576 pshufd \$0x93, @t[4], @t[4]
581 pshufd \$0x93, @t[5], @t[5]
582 pxor @t[6], @t[7] # restore t[7]
584 # multiplication by 0x0d
587 pshufd \$0x93, @t[6], @t[6]
591 pshufd \$0x93, @t[7], @t[7]
600 pshufd \$0x93, @t[0], @t[0]
604 pshufd \$0x93, @t[1], @t[1]
609 pshufd \$0x93, @t[2], @t[2]
611 pxor @t[3], @t[6] # clobber t[6]
618 pshufd \$0x93, @t[4], @t[4]
621 pxor @t[3], @t[6] # restore t[6]
623 pshufd \$0x93, @t[5], @t[5]
624 pshufd \$0x93, @t[6], @t[6]
625 pshufd \$0x93, @t[7], @t[7]
626 pshufd \$0x93, @t[3], @t[3]
628 # multiplication by 0x09
630 pxor @y[1], @t[1] # t[1]=y[1]
631 pxor @t[5], @t[0] # clobber t[0]
634 pxor @y[0], @t[0] # t[0]=y[0]
636 pxor @t[7], @t[6] # clobber t[6]
639 pxor @y[4], @t[4] # t[4]=y[4]
641 pxor @y[3], @t[3] # t[3]=y[3]
643 pxor @y[2], @t[2] # t[2]=y[2]
645 pxor @y[5], @t[5] # t[5]=y[5]
648 pxor @y[6], @t[6] # t[6]=y[6]
649 pxor @y[7], @t[7] # t[7]=y[7]
662 sub aesenc { # not used
666 movdqa 0x30($const),@t[0] # .LSR
668 &ShiftRows (@b,@t[0]);
670 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
673 sub aesenclast { # not used
677 movdqa 0x40($const),@t[0] # .LSRM0
679 &ShiftRows (@b,@t[0]);
682 pxor 0x00($key),@b[0]
683 pxor 0x10($key),@b[1]
684 pxor 0x20($key),@b[4]
685 pxor 0x30($key),@b[6]
686 pxor 0x40($key),@b[3]
687 pxor 0x50($key),@b[7]
688 pxor 0x60($key),@b[2]
689 pxor 0x70($key),@b[5]
694 my ($a,$b,$n,$mask,$t)=@_;
706 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
726 my @x=reverse(@_[0..7]);
727 my ($t0,$t1,$t2,$t3)=@_[8..11];
729 movdqa 0x00($const),$t0 # .LBS0
730 movdqa 0x10($const),$t1 # .LBS1
732 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
733 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735 movdqa 0x20($const),$t0 # .LBS2
737 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
738 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
741 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
747 .extern asm_AES_encrypt
748 .extern asm_AES_decrypt
750 .type _bsaes_encrypt8,\@abi-omnipotent
753 lea .LBS0(%rip), $const # constants table
755 movdqa ($key), @XMM[9] # round 0 key
757 movdqa 0x60($const), @XMM[8] # .LM0SR
758 pxor @XMM[9], @XMM[0] # xor with round0 key
759 pxor @XMM[9], @XMM[1]
760 pshufb @XMM[8], @XMM[0]
761 pxor @XMM[9], @XMM[2]
762 pshufb @XMM[8], @XMM[1]
763 pxor @XMM[9], @XMM[3]
764 pshufb @XMM[8], @XMM[2]
765 pxor @XMM[9], @XMM[4]
766 pshufb @XMM[8], @XMM[3]
767 pxor @XMM[9], @XMM[5]
768 pshufb @XMM[8], @XMM[4]
769 pxor @XMM[9], @XMM[6]
770 pshufb @XMM[8], @XMM[5]
771 pxor @XMM[9], @XMM[7]
772 pshufb @XMM[8], @XMM[6]
773 pshufb @XMM[8], @XMM[7]
774 _bsaes_encrypt8_bitslice:
776 &bitslice (@XMM[0..7, 8..11]);
783 &ShiftRows (@XMM[0..7, 8]);
784 $code.=".Lenc_sbox:\n";
785 &Sbox (@XMM[0..7, 8..15]);
790 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
792 movdqa 0x30($const), @XMM[8] # .LSR
794 movdqa 0x40($const), @XMM[8] # .LSRM0
799 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
800 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
802 movdqa ($key), @XMM[8] # last round key
803 pxor @XMM[8], @XMM[4]
804 pxor @XMM[8], @XMM[6]
805 pxor @XMM[8], @XMM[3]
806 pxor @XMM[8], @XMM[7]
807 pxor @XMM[8], @XMM[2]
808 pxor @XMM[8], @XMM[5]
809 pxor @XMM[8], @XMM[0]
810 pxor @XMM[8], @XMM[1]
812 .size _bsaes_encrypt8,.-_bsaes_encrypt8
814 .type _bsaes_decrypt8,\@abi-omnipotent
817 lea .LBS0(%rip), $const # constants table
819 movdqa ($key), @XMM[9] # round 0 key
821 movdqa -0x30($const), @XMM[8] # .LM0ISR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
839 &bitslice (@XMM[0..7, 8..11]);
846 &ShiftRows (@XMM[0..7, 8]);
847 $code.=".Ldec_sbox:\n";
848 &InvSbox (@XMM[0..7, 8..15]);
853 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
855 movdqa -0x10($const), @XMM[8] # .LISR
857 movdqa -0x20($const), @XMM[8] # .LISRM0
862 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
864 movdqa ($key), @XMM[8] # last round key
865 pxor @XMM[8], @XMM[6]
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[2]
868 pxor @XMM[8], @XMM[7]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[5]
871 pxor @XMM[8], @XMM[0]
872 pxor @XMM[8], @XMM[1]
874 .size _bsaes_decrypt8,.-_bsaes_decrypt8
878 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
881 my @x=reverse(@_[0..7]);
882 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
886 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
890 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
894 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
900 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
901 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
905 .type _bsaes_key_convert,\@abi-omnipotent
908 lea .LBS1(%rip), $const
909 movdqu ($inp), %xmm7 # load round 0 key
910 movdqa -0x10($const), %xmm8 # .LBS0
911 movdqa 0x00($const), %xmm9 # .LBS1
912 movdqa 0x10($const), %xmm10 # .LBS2
913 movdqa 0x40($const), %xmm13 # .LM0
914 movdqa 0x60($const), %xmm14 # .LNOT
916 movdqu 0x10($inp), %xmm6 # load round 1 key
918 movdqa %xmm7, ($out) # save round 0 key
924 pshufb %xmm13, %xmm6 # .LM0
927 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
929 pxor %xmm14, %xmm5 # "pnot"
934 movdqa %xmm0, 0x00($out) # write bit-sliced round key
935 movdqa %xmm1, 0x10($out)
936 movdqa %xmm2, 0x20($out)
937 movdqa %xmm3, 0x30($out)
938 movdqa %xmm4, 0x40($out)
939 movdqa %xmm5, 0x50($out)
940 movdqa %xmm6, 0x60($out)
941 movdqa %xmm7, 0x70($out)
943 movdqu ($inp), %xmm6 # load next round key
947 movdqa 0x70($const), %xmm7 # .L63
948 #movdqa %xmm6, ($out) # don't save last round key
950 .size _bsaes_key_convert,.-_bsaes_key_convert
954 if (0 && !$win64) { # following four functions are unsupported interface
955 # used for benchmarking...
957 .globl bsaes_enc_key_convert
958 .type bsaes_enc_key_convert,\@function,2
960 bsaes_enc_key_convert:
961 mov 240($inp),%r10d # pass rounds
962 mov $inp,%rcx # pass key
963 mov $out,%rax # pass key schedule
964 call _bsaes_key_convert
965 pxor %xmm6,%xmm7 # fix up last round key
966 movdqa %xmm7,(%rax) # save last round key
968 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
970 .globl bsaes_encrypt_128
971 .type bsaes_encrypt_128,\@function,4
975 movdqu 0x00($inp), @XMM[0] # load input
976 movdqu 0x10($inp), @XMM[1]
977 movdqu 0x20($inp), @XMM[2]
978 movdqu 0x30($inp), @XMM[3]
979 movdqu 0x40($inp), @XMM[4]
980 movdqu 0x50($inp), @XMM[5]
981 movdqu 0x60($inp), @XMM[6]
982 movdqu 0x70($inp), @XMM[7]
983 mov $key, %rax # pass the $key
989 movdqu @XMM[0], 0x00($out) # write output
990 movdqu @XMM[1], 0x10($out)
991 movdqu @XMM[4], 0x20($out)
992 movdqu @XMM[6], 0x30($out)
993 movdqu @XMM[3], 0x40($out)
994 movdqu @XMM[7], 0x50($out)
995 movdqu @XMM[2], 0x60($out)
996 movdqu @XMM[5], 0x70($out)
1001 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1003 .globl bsaes_dec_key_convert
1004 .type bsaes_dec_key_convert,\@function,2
1006 bsaes_dec_key_convert:
1007 mov 240($inp),%r10d # pass rounds
1008 mov $inp,%rcx # pass key
1009 mov $out,%rax # pass key schedule
1010 call _bsaes_key_convert
1011 pxor ($out),%xmm7 # fix up round 0 key
1012 movdqa %xmm6,(%rax) # save last round key
1015 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1017 .globl bsaes_decrypt_128
1018 .type bsaes_decrypt_128,\@function,4
1022 movdqu 0x00($inp), @XMM[0] # load input
1023 movdqu 0x10($inp), @XMM[1]
1024 movdqu 0x20($inp), @XMM[2]
1025 movdqu 0x30($inp), @XMM[3]
1026 movdqu 0x40($inp), @XMM[4]
1027 movdqu 0x50($inp), @XMM[5]
1028 movdqu 0x60($inp), @XMM[6]
1029 movdqu 0x70($inp), @XMM[7]
1030 mov $key, %rax # pass the $key
1031 lea 0x80($inp), $inp
1034 call _bsaes_decrypt8
1036 movdqu @XMM[0], 0x00($out) # write output
1037 movdqu @XMM[1], 0x10($out)
1038 movdqu @XMM[6], 0x20($out)
1039 movdqu @XMM[4], 0x30($out)
1040 movdqu @XMM[2], 0x40($out)
1041 movdqu @XMM[7], 0x50($out)
1042 movdqu @XMM[3], 0x60($out)
1043 movdqu @XMM[5], 0x70($out)
1044 lea 0x80($out), $out
1048 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1052 ######################################################################
1056 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1057 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1058 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1062 .globl bsaes_ecb_encrypt_blocks
1063 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1065 bsaes_ecb_encrypt_blocks:
1074 lea -0x48(%rsp),%rsp
1076 $code.=<<___ if ($win64);
1077 lea -0xa0(%rsp), %rsp
1078 movaps %xmm6, 0x40(%rsp)
1079 movaps %xmm7, 0x50(%rsp)
1080 movaps %xmm8, 0x60(%rsp)
1081 movaps %xmm9, 0x70(%rsp)
1082 movaps %xmm10, 0x80(%rsp)
1083 movaps %xmm11, 0x90(%rsp)
1084 movaps %xmm12, 0xa0(%rsp)
1085 movaps %xmm13, 0xb0(%rsp)
1086 movaps %xmm14, 0xc0(%rsp)
1087 movaps %xmm15, 0xd0(%rsp)
1091 mov %rsp,%rbp # backup %rsp
1092 mov 240($arg4),%eax # rounds
1093 mov $arg1,$inp # backup arguments
1100 mov %eax,%ebx # backup rounds
1101 shl \$7,%rax # 128 bytes per inner round key
1102 sub \$`128-32`,%rax # size of bit-sliced key schedule
1104 mov %rsp,%rax # pass key schedule
1105 mov $key,%rcx # pass key
1106 mov %ebx,%r10d # pass rounds
1107 call _bsaes_key_convert
1108 pxor %xmm6,%xmm7 # fix up last round key
1109 movdqa %xmm7,(%rax) # save last round key
1113 movdqu 0x00($inp), @XMM[0] # load input
1114 movdqu 0x10($inp), @XMM[1]
1115 movdqu 0x20($inp), @XMM[2]
1116 movdqu 0x30($inp), @XMM[3]
1117 movdqu 0x40($inp), @XMM[4]
1118 movdqu 0x50($inp), @XMM[5]
1119 mov %rsp, %rax # pass key schedule
1120 movdqu 0x60($inp), @XMM[6]
1121 mov %ebx,%r10d # pass rounds
1122 movdqu 0x70($inp), @XMM[7]
1123 lea 0x80($inp), $inp
1125 call _bsaes_encrypt8
1127 movdqu @XMM[0], 0x00($out) # write output
1128 movdqu @XMM[1], 0x10($out)
1129 movdqu @XMM[4], 0x20($out)
1130 movdqu @XMM[6], 0x30($out)
1131 movdqu @XMM[3], 0x40($out)
1132 movdqu @XMM[7], 0x50($out)
1133 movdqu @XMM[2], 0x60($out)
1134 movdqu @XMM[5], 0x70($out)
1135 lea 0x80($out), $out
1142 movdqu 0x00($inp), @XMM[0] # load input
1143 mov %rsp, %rax # pass key schedule
1144 mov %ebx,%r10d # pass rounds
1147 movdqu 0x10($inp), @XMM[1]
1149 movdqu 0x20($inp), @XMM[2]
1152 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1157 movdqu 0x50($inp), @XMM[5]
1159 movdqu 0x60($inp), @XMM[6]
1160 call _bsaes_encrypt8
1161 movdqu @XMM[0], 0x00($out) # write output
1162 movdqu @XMM[1], 0x10($out)
1163 movdqu @XMM[4], 0x20($out)
1164 movdqu @XMM[6], 0x30($out)
1165 movdqu @XMM[3], 0x40($out)
1166 movdqu @XMM[7], 0x50($out)
1167 movdqu @XMM[2], 0x60($out)
1171 call _bsaes_encrypt8
1172 movdqu @XMM[0], 0x00($out) # write output
1173 movdqu @XMM[1], 0x10($out)
1174 movdqu @XMM[4], 0x20($out)
1175 movdqu @XMM[6], 0x30($out)
1176 movdqu @XMM[3], 0x40($out)
1177 movdqu @XMM[7], 0x50($out)
1181 call _bsaes_encrypt8
1182 movdqu @XMM[0], 0x00($out) # write output
1183 movdqu @XMM[1], 0x10($out)
1184 movdqu @XMM[4], 0x20($out)
1185 movdqu @XMM[6], 0x30($out)
1186 movdqu @XMM[3], 0x40($out)
1190 call _bsaes_encrypt8
1191 movdqu @XMM[0], 0x00($out) # write output
1192 movdqu @XMM[1], 0x10($out)
1193 movdqu @XMM[4], 0x20($out)
1194 movdqu @XMM[6], 0x30($out)
1198 call _bsaes_encrypt8
1199 movdqu @XMM[0], 0x00($out) # write output
1200 movdqu @XMM[1], 0x10($out)
1201 movdqu @XMM[4], 0x20($out)
1205 call _bsaes_encrypt8
1206 movdqu @XMM[0], 0x00($out) # write output
1207 movdqu @XMM[1], 0x10($out)
1211 call _bsaes_encrypt8
1212 movdqu @XMM[0], 0x00($out) # write output
1219 call asm_AES_encrypt
1228 .Lecb_enc_bzero: # wipe key schedule [if any]
1229 movdqa %xmm0, 0x00(%rax)
1230 movdqa %xmm0, 0x10(%rax)
1231 lea 0x20(%rax), %rax
1235 lea (%rbp),%rsp # restore %rsp
1237 $code.=<<___ if ($win64);
1238 movaps 0x40(%rbp), %xmm6
1239 movaps 0x50(%rbp), %xmm7
1240 movaps 0x60(%rbp), %xmm8
1241 movaps 0x70(%rbp), %xmm9
1242 movaps 0x80(%rbp), %xmm10
1243 movaps 0x90(%rbp), %xmm11
1244 movaps 0xa0(%rbp), %xmm12
1245 movaps 0xb0(%rbp), %xmm13
1246 movaps 0xc0(%rbp), %xmm14
1247 movaps 0xd0(%rbp), %xmm15
1248 lea 0xa0(%rbp), %rsp
1251 mov 0x48(%rsp), %r15
1252 mov 0x50(%rsp), %r14
1253 mov 0x58(%rsp), %r13
1254 mov 0x60(%rsp), %r12
1255 mov 0x68(%rsp), %rbx
1256 mov 0x70(%rsp), %rax
1257 lea 0x78(%rsp), %rsp
1261 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1263 .globl bsaes_ecb_decrypt_blocks
1264 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1266 bsaes_ecb_decrypt_blocks:
1275 lea -0x48(%rsp),%rsp
1277 $code.=<<___ if ($win64);
1278 lea -0xa0(%rsp), %rsp
1279 movaps %xmm6, 0x40(%rsp)
1280 movaps %xmm7, 0x50(%rsp)
1281 movaps %xmm8, 0x60(%rsp)
1282 movaps %xmm9, 0x70(%rsp)
1283 movaps %xmm10, 0x80(%rsp)
1284 movaps %xmm11, 0x90(%rsp)
1285 movaps %xmm12, 0xa0(%rsp)
1286 movaps %xmm13, 0xb0(%rsp)
1287 movaps %xmm14, 0xc0(%rsp)
1288 movaps %xmm15, 0xd0(%rsp)
1292 mov %rsp,%rbp # backup %rsp
1293 mov 240($arg4),%eax # rounds
1294 mov $arg1,$inp # backup arguments
1301 mov %eax,%ebx # backup rounds
1302 shl \$7,%rax # 128 bytes per inner round key
1303 sub \$`128-32`,%rax # size of bit-sliced key schedule
1305 mov %rsp,%rax # pass key schedule
1306 mov $key,%rcx # pass key
1307 mov %ebx,%r10d # pass rounds
1308 call _bsaes_key_convert
1309 pxor (%rsp),%xmm7 # fix up 0 round key
1310 movdqa %xmm6,(%rax) # save last round key
1315 movdqu 0x00($inp), @XMM[0] # load input
1316 movdqu 0x10($inp), @XMM[1]
1317 movdqu 0x20($inp), @XMM[2]
1318 movdqu 0x30($inp), @XMM[3]
1319 movdqu 0x40($inp), @XMM[4]
1320 movdqu 0x50($inp), @XMM[5]
1321 mov %rsp, %rax # pass key schedule
1322 movdqu 0x60($inp), @XMM[6]
1323 mov %ebx,%r10d # pass rounds
1324 movdqu 0x70($inp), @XMM[7]
1325 lea 0x80($inp), $inp
1327 call _bsaes_decrypt8
1329 movdqu @XMM[0], 0x00($out) # write output
1330 movdqu @XMM[1], 0x10($out)
1331 movdqu @XMM[6], 0x20($out)
1332 movdqu @XMM[4], 0x30($out)
1333 movdqu @XMM[2], 0x40($out)
1334 movdqu @XMM[7], 0x50($out)
1335 movdqu @XMM[3], 0x60($out)
1336 movdqu @XMM[5], 0x70($out)
1337 lea 0x80($out), $out
1344 movdqu 0x00($inp), @XMM[0] # load input
1345 mov %rsp, %rax # pass key schedule
1346 mov %ebx,%r10d # pass rounds
1349 movdqu 0x10($inp), @XMM[1]
1351 movdqu 0x20($inp), @XMM[2]
1354 movdqu 0x30($inp), @XMM[3]
1356 movdqu 0x40($inp), @XMM[4]
1359 movdqu 0x50($inp), @XMM[5]
1361 movdqu 0x60($inp), @XMM[6]
1362 call _bsaes_decrypt8
1363 movdqu @XMM[0], 0x00($out) # write output
1364 movdqu @XMM[1], 0x10($out)
1365 movdqu @XMM[6], 0x20($out)
1366 movdqu @XMM[4], 0x30($out)
1367 movdqu @XMM[2], 0x40($out)
1368 movdqu @XMM[7], 0x50($out)
1369 movdqu @XMM[3], 0x60($out)
1373 call _bsaes_decrypt8
1374 movdqu @XMM[0], 0x00($out) # write output
1375 movdqu @XMM[1], 0x10($out)
1376 movdqu @XMM[6], 0x20($out)
1377 movdqu @XMM[4], 0x30($out)
1378 movdqu @XMM[2], 0x40($out)
1379 movdqu @XMM[7], 0x50($out)
1383 call _bsaes_decrypt8
1384 movdqu @XMM[0], 0x00($out) # write output
1385 movdqu @XMM[1], 0x10($out)
1386 movdqu @XMM[6], 0x20($out)
1387 movdqu @XMM[4], 0x30($out)
1388 movdqu @XMM[2], 0x40($out)
1392 call _bsaes_decrypt8
1393 movdqu @XMM[0], 0x00($out) # write output
1394 movdqu @XMM[1], 0x10($out)
1395 movdqu @XMM[6], 0x20($out)
1396 movdqu @XMM[4], 0x30($out)
1400 call _bsaes_decrypt8
1401 movdqu @XMM[0], 0x00($out) # write output
1402 movdqu @XMM[1], 0x10($out)
1403 movdqu @XMM[6], 0x20($out)
1407 call _bsaes_decrypt8
1408 movdqu @XMM[0], 0x00($out) # write output
1409 movdqu @XMM[1], 0x10($out)
1413 call _bsaes_decrypt8
1414 movdqu @XMM[0], 0x00($out) # write output
1421 call asm_AES_decrypt
1430 .Lecb_dec_bzero: # wipe key schedule [if any]
1431 movdqa %xmm0, 0x00(%rax)
1432 movdqa %xmm0, 0x10(%rax)
1433 lea 0x20(%rax), %rax
1437 lea (%rbp),%rsp # restore %rsp
1439 $code.=<<___ if ($win64);
1440 movaps 0x40(%rbp), %xmm6
1441 movaps 0x50(%rbp), %xmm7
1442 movaps 0x60(%rbp), %xmm8
1443 movaps 0x70(%rbp), %xmm9
1444 movaps 0x80(%rbp), %xmm10
1445 movaps 0x90(%rbp), %xmm11
1446 movaps 0xa0(%rbp), %xmm12
1447 movaps 0xb0(%rbp), %xmm13
1448 movaps 0xc0(%rbp), %xmm14
1449 movaps 0xd0(%rbp), %xmm15
1450 lea 0xa0(%rbp), %rsp
1453 mov 0x48(%rsp), %r15
1454 mov 0x50(%rsp), %r14
1455 mov 0x58(%rsp), %r13
1456 mov 0x60(%rsp), %r12
1457 mov 0x68(%rsp), %rbx
1458 mov 0x70(%rsp), %rax
1459 lea 0x78(%rsp), %rsp
1463 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1467 .extern asm_AES_cbc_encrypt
1468 .globl bsaes_cbc_encrypt
1469 .type bsaes_cbc_encrypt,\@abi-omnipotent
1473 $code.=<<___ if ($win64);
1474 mov 48(%rsp),$arg6 # pull direction flag
1478 jne asm_AES_cbc_encrypt
1480 jb asm_AES_cbc_encrypt
1490 lea -0x48(%rsp), %rsp
1492 $code.=<<___ if ($win64);
1493 mov 0xa0(%rsp),$arg5 # pull ivp
1494 lea -0xa0(%rsp), %rsp
1495 movaps %xmm6, 0x40(%rsp)
1496 movaps %xmm7, 0x50(%rsp)
1497 movaps %xmm8, 0x60(%rsp)
1498 movaps %xmm9, 0x70(%rsp)
1499 movaps %xmm10, 0x80(%rsp)
1500 movaps %xmm11, 0x90(%rsp)
1501 movaps %xmm12, 0xa0(%rsp)
1502 movaps %xmm13, 0xb0(%rsp)
1503 movaps %xmm14, 0xc0(%rsp)
1504 movaps %xmm15, 0xd0(%rsp)
1508 mov %rsp, %rbp # backup %rsp
1509 mov 240($arg4), %eax # rounds
1510 mov $arg1, $inp # backup arguments
1515 shr \$4, $len # bytes to blocks
1517 mov %eax, %edx # rounds
1518 shl \$7, %rax # 128 bytes per inner round key
1519 sub \$`128-32`, %rax # size of bit-sliced key schedule
1522 mov %rsp, %rax # pass key schedule
1523 mov $key, %rcx # pass key
1524 mov %edx, %r10d # pass rounds
1525 call _bsaes_key_convert
1526 pxor (%rsp),%xmm7 # fix up 0 round key
1527 movdqa %xmm6,(%rax) # save last round key
1530 movdqu (%rbx), @XMM[15] # load IV
1533 movdqu 0x00($inp), @XMM[0] # load input
1534 movdqu 0x10($inp), @XMM[1]
1535 movdqu 0x20($inp), @XMM[2]
1536 movdqu 0x30($inp), @XMM[3]
1537 movdqu 0x40($inp), @XMM[4]
1538 movdqu 0x50($inp), @XMM[5]
1539 mov %rsp, %rax # pass key schedule
1540 movdqu 0x60($inp), @XMM[6]
1541 mov %edx,%r10d # pass rounds
1542 movdqu 0x70($inp), @XMM[7]
1543 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1545 call _bsaes_decrypt8
1547 pxor 0x20(%rbp), @XMM[0] # ^= IV
1548 movdqu 0x00($inp), @XMM[8] # re-load input
1549 movdqu 0x10($inp), @XMM[9]
1550 pxor @XMM[8], @XMM[1]
1551 movdqu 0x20($inp), @XMM[10]
1552 pxor @XMM[9], @XMM[6]
1553 movdqu 0x30($inp), @XMM[11]
1554 pxor @XMM[10], @XMM[4]
1555 movdqu 0x40($inp), @XMM[12]
1556 pxor @XMM[11], @XMM[2]
1557 movdqu 0x50($inp), @XMM[13]
1558 pxor @XMM[12], @XMM[7]
1559 movdqu 0x60($inp), @XMM[14]
1560 pxor @XMM[13], @XMM[3]
1561 movdqu 0x70($inp), @XMM[15] # IV
1562 pxor @XMM[14], @XMM[5]
1563 movdqu @XMM[0], 0x00($out) # write output
1564 lea 0x80($inp), $inp
1565 movdqu @XMM[1], 0x10($out)
1566 movdqu @XMM[6], 0x20($out)
1567 movdqu @XMM[4], 0x30($out)
1568 movdqu @XMM[2], 0x40($out)
1569 movdqu @XMM[7], 0x50($out)
1570 movdqu @XMM[3], 0x60($out)
1571 movdqu @XMM[5], 0x70($out)
1572 lea 0x80($out), $out
1579 movdqu 0x00($inp), @XMM[0] # load input
1580 mov %rsp, %rax # pass key schedule
1581 mov %edx, %r10d # pass rounds
1584 movdqu 0x10($inp), @XMM[1]
1586 movdqu 0x20($inp), @XMM[2]
1589 movdqu 0x30($inp), @XMM[3]
1591 movdqu 0x40($inp), @XMM[4]
1594 movdqu 0x50($inp), @XMM[5]
1596 movdqu 0x60($inp), @XMM[6]
1597 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1598 call _bsaes_decrypt8
1599 pxor 0x20(%rbp), @XMM[0] # ^= IV
1600 movdqu 0x00($inp), @XMM[8] # re-load input
1601 movdqu 0x10($inp), @XMM[9]
1602 pxor @XMM[8], @XMM[1]
1603 movdqu 0x20($inp), @XMM[10]
1604 pxor @XMM[9], @XMM[6]
1605 movdqu 0x30($inp), @XMM[11]
1606 pxor @XMM[10], @XMM[4]
1607 movdqu 0x40($inp), @XMM[12]
1608 pxor @XMM[11], @XMM[2]
1609 movdqu 0x50($inp), @XMM[13]
1610 pxor @XMM[12], @XMM[7]
1611 movdqu 0x60($inp), @XMM[15] # IV
1612 pxor @XMM[13], @XMM[3]
1613 movdqu @XMM[0], 0x00($out) # write output
1614 movdqu @XMM[1], 0x10($out)
1615 movdqu @XMM[6], 0x20($out)
1616 movdqu @XMM[4], 0x30($out)
1617 movdqu @XMM[2], 0x40($out)
1618 movdqu @XMM[7], 0x50($out)
1619 movdqu @XMM[3], 0x60($out)
1623 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1624 call _bsaes_decrypt8
1625 pxor 0x20(%rbp), @XMM[0] # ^= IV
1626 movdqu 0x00($inp), @XMM[8] # re-load input
1627 movdqu 0x10($inp), @XMM[9]
1628 pxor @XMM[8], @XMM[1]
1629 movdqu 0x20($inp), @XMM[10]
1630 pxor @XMM[9], @XMM[6]
1631 movdqu 0x30($inp), @XMM[11]
1632 pxor @XMM[10], @XMM[4]
1633 movdqu 0x40($inp), @XMM[12]
1634 pxor @XMM[11], @XMM[2]
1635 movdqu 0x50($inp), @XMM[15] # IV
1636 pxor @XMM[12], @XMM[7]
1637 movdqu @XMM[0], 0x00($out) # write output
1638 movdqu @XMM[1], 0x10($out)
1639 movdqu @XMM[6], 0x20($out)
1640 movdqu @XMM[4], 0x30($out)
1641 movdqu @XMM[2], 0x40($out)
1642 movdqu @XMM[7], 0x50($out)
1646 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1647 call _bsaes_decrypt8
1648 pxor 0x20(%rbp), @XMM[0] # ^= IV
1649 movdqu 0x00($inp), @XMM[8] # re-load input
1650 movdqu 0x10($inp), @XMM[9]
1651 pxor @XMM[8], @XMM[1]
1652 movdqu 0x20($inp), @XMM[10]
1653 pxor @XMM[9], @XMM[6]
1654 movdqu 0x30($inp), @XMM[11]
1655 pxor @XMM[10], @XMM[4]
1656 movdqu 0x40($inp), @XMM[15] # IV
1657 pxor @XMM[11], @XMM[2]
1658 movdqu @XMM[0], 0x00($out) # write output
1659 movdqu @XMM[1], 0x10($out)
1660 movdqu @XMM[6], 0x20($out)
1661 movdqu @XMM[4], 0x30($out)
1662 movdqu @XMM[2], 0x40($out)
1666 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1667 call _bsaes_decrypt8
1668 pxor 0x20(%rbp), @XMM[0] # ^= IV
1669 movdqu 0x00($inp), @XMM[8] # re-load input
1670 movdqu 0x10($inp), @XMM[9]
1671 pxor @XMM[8], @XMM[1]
1672 movdqu 0x20($inp), @XMM[10]
1673 pxor @XMM[9], @XMM[6]
1674 movdqu 0x30($inp), @XMM[15] # IV
1675 pxor @XMM[10], @XMM[4]
1676 movdqu @XMM[0], 0x00($out) # write output
1677 movdqu @XMM[1], 0x10($out)
1678 movdqu @XMM[6], 0x20($out)
1679 movdqu @XMM[4], 0x30($out)
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[15] # IV
1690 pxor @XMM[9], @XMM[6]
1691 movdqu @XMM[0], 0x00($out) # write output
1692 movdqu @XMM[1], 0x10($out)
1693 movdqu @XMM[6], 0x20($out)
1697 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1699 pxor 0x20(%rbp), @XMM[0] # ^= IV
1700 movdqu 0x00($inp), @XMM[8] # re-load input
1701 movdqu 0x10($inp), @XMM[15] # IV
1702 pxor @XMM[8], @XMM[1]
1703 movdqu @XMM[0], 0x00($out) # write output
1704 movdqu @XMM[1], 0x10($out)
1709 lea 0x20(%rbp), $arg2 # buffer output
1711 call asm_AES_decrypt # doesn't touch %xmm
1712 pxor 0x20(%rbp), @XMM[15] # ^= IV
1713 movdqu @XMM[15], ($out) # write output
1714 movdqa @XMM[0], @XMM[15] # IV
1717 movdqu @XMM[15], (%rbx) # return IV
1720 .Lcbc_dec_bzero: # wipe key schedule [if any]
1721 movdqa %xmm0, 0x00(%rax)
1722 movdqa %xmm0, 0x10(%rax)
1723 lea 0x20(%rax), %rax
1727 lea (%rbp),%rsp # restore %rsp
1729 $code.=<<___ if ($win64);
1730 movaps 0x40(%rbp), %xmm6
1731 movaps 0x50(%rbp), %xmm7
1732 movaps 0x60(%rbp), %xmm8
1733 movaps 0x70(%rbp), %xmm9
1734 movaps 0x80(%rbp), %xmm10
1735 movaps 0x90(%rbp), %xmm11
1736 movaps 0xa0(%rbp), %xmm12
1737 movaps 0xb0(%rbp), %xmm13
1738 movaps 0xc0(%rbp), %xmm14
1739 movaps 0xd0(%rbp), %xmm15
1740 lea 0xa0(%rbp), %rsp
1743 mov 0x48(%rsp), %r15
1744 mov 0x50(%rsp), %r14
1745 mov 0x58(%rsp), %r13
1746 mov 0x60(%rsp), %r12
1747 mov 0x68(%rsp), %rbx
1748 mov 0x70(%rsp), %rax
1749 lea 0x78(%rsp), %rsp
1753 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1755 .globl bsaes_ctr32_encrypt_blocks
1756 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1758 bsaes_ctr32_encrypt_blocks:
1767 lea -0x48(%rsp), %rsp
1769 $code.=<<___ if ($win64);
1770 mov 0xa0(%rsp),$arg5 # pull ivp
1771 lea -0xa0(%rsp), %rsp
1772 movaps %xmm6, 0x40(%rsp)
1773 movaps %xmm7, 0x50(%rsp)
1774 movaps %xmm8, 0x60(%rsp)
1775 movaps %xmm9, 0x70(%rsp)
1776 movaps %xmm10, 0x80(%rsp)
1777 movaps %xmm11, 0x90(%rsp)
1778 movaps %xmm12, 0xa0(%rsp)
1779 movaps %xmm13, 0xb0(%rsp)
1780 movaps %xmm14, 0xc0(%rsp)
1781 movaps %xmm15, 0xd0(%rsp)
1785 mov %rsp, %rbp # backup %rsp
1786 movdqu ($arg5), %xmm0 # load counter
1787 mov 240($arg4), %eax # rounds
1788 mov $arg1, $inp # backup arguments
1792 movdqa %xmm0, 0x20(%rbp) # copy counter
1796 mov %eax, %ebx # rounds
1797 shl \$7, %rax # 128 bytes per inner round key
1798 sub \$`128-32`, %rax # size of bit-sliced key schedule
1801 mov %rsp, %rax # pass key schedule
1802 mov $key, %rcx # pass key
1803 mov %ebx, %r10d # pass rounds
1804 call _bsaes_key_convert
# Post-process the schedule produced by _bsaes_key_convert.
# NOTE(review): %xmm6 appears to hold the fix-up constant left behind by
# _bsaes_key_convert and %xmm7 the raw last round key — confirm against
# that routine's definition (not visible in this chunk).
1805 pxor %xmm6,%xmm7 # fix up last round key
1806 movdqa %xmm7,(%rax) # save last round key
# Pre-byte-swap both the round-0 key and the counter copy with .LSWPUP
# (located 0x20 below .LADD1, hence the -0x20(%r11) load), so the inner
# loop can add .LADDx vectors to the counter without swapping each time.
1808 movdqa (%rsp), @XMM[9] # load round0 key
1809 lea .LADD1(%rip), %r11
1810 movdqa 0x20(%rbp), @XMM[0] # counter copy
1811 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1812 pshufb @XMM[8], @XMM[9] # byte swap upper part
1813 pshufb @XMM[8], @XMM[0]
1814 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1818 movdqa @XMM[0], 0x20(%rbp) # save counter
1819 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1820 movdqa @XMM[0], @XMM[2]
1821 paddd 0x00(%r11), @XMM[1] # .LADD1
1822 movdqa @XMM[0], @XMM[3]
1823 paddd 0x10(%r11), @XMM[2] # .LADD2
1824 movdqa @XMM[0], @XMM[4]
1825 paddd 0x20(%r11), @XMM[3] # .LADD3
1826 movdqa @XMM[0], @XMM[5]
1827 paddd 0x30(%r11), @XMM[4] # .LADD4
1828 movdqa @XMM[0], @XMM[6]
1829 paddd 0x40(%r11), @XMM[5] # .LADD5
1830 movdqa @XMM[0], @XMM[7]
1831 paddd 0x50(%r11), @XMM[6] # .LADD6
1832 paddd 0x60(%r11), @XMM[7] # .LADD7
1834 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1835 # to flip byte order in 32-bit counter
1836 movdqa (%rsp), @XMM[9] # round 0 key
1837 lea 0x10(%rsp), %rax # pass key schedule
1838 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1839 pxor @XMM[9], @XMM[0] # xor with round0 key
1840 pxor @XMM[9], @XMM[1]
1841 pshufb @XMM[8], @XMM[0]
1842 pxor @XMM[9], @XMM[2]
1843 pshufb @XMM[8], @XMM[1]
1844 pxor @XMM[9], @XMM[3]
1845 pshufb @XMM[8], @XMM[2]
1846 pxor @XMM[9], @XMM[4]
1847 pshufb @XMM[8], @XMM[3]
1848 pxor @XMM[9], @XMM[5]
1849 pshufb @XMM[8], @XMM[4]
1850 pxor @XMM[9], @XMM[6]
1851 pshufb @XMM[8], @XMM[5]
1852 pxor @XMM[9], @XMM[7]
1853 pshufb @XMM[8], @XMM[6]
1854 lea .LBS0(%rip), %r11 # constants table
1855 pshufb @XMM[8], @XMM[7]
1856 mov %ebx,%r10d # pass rounds
1858 call _bsaes_encrypt8_bitslice
1861 jc .Lctr_enc_loop_done
1863 movdqu 0x00($inp), @XMM[8] # load input
1864 movdqu 0x10($inp), @XMM[9]
1865 movdqu 0x20($inp), @XMM[10]
1866 movdqu 0x30($inp), @XMM[11]
1867 movdqu 0x40($inp), @XMM[12]
1868 movdqu 0x50($inp), @XMM[13]
1869 movdqu 0x60($inp), @XMM[14]
1870 movdqu 0x70($inp), @XMM[15]
1872 pxor @XMM[0], @XMM[8]
1873 movdqa 0x20(%rbp), @XMM[0] # load counter
1874 pxor @XMM[9], @XMM[1]
1875 movdqu @XMM[8], 0x00($out) # write output
1876 pxor @XMM[10], @XMM[4]
1877 movdqu @XMM[1], 0x10($out)
1878 pxor @XMM[11], @XMM[6]
1879 movdqu @XMM[4], 0x20($out)
1880 pxor @XMM[12], @XMM[3]
1881 movdqu @XMM[6], 0x30($out)
1882 pxor @XMM[13], @XMM[7]
1883 movdqu @XMM[3], 0x40($out)
1884 pxor @XMM[14], @XMM[2]
1885 movdqu @XMM[7], 0x50($out)
1886 pxor @XMM[15], @XMM[5]
1887 movdqu @XMM[2], 0x60($out)
1888 lea .LADD1(%rip), %r11
1889 movdqu @XMM[5], 0x70($out)
1890 lea 0x80($out), $out
1891 paddd 0x70(%r11), @XMM[0] # .LADD8
1896 .Lctr_enc_loop_done:
1897 movdqu 0x00($inp), @XMM[8] # load input
1898 pxor @XMM[8], @XMM[0]
1899 movdqu @XMM[0], 0x00($out) # write output
1902 movdqu 0x10($inp), @XMM[9]
1903 pxor @XMM[9], @XMM[1]
1904 movdqu @XMM[1], 0x10($out)
1906 movdqu 0x20($inp), @XMM[10]
1907 pxor @XMM[10], @XMM[4]
1908 movdqu @XMM[4], 0x20($out)
1911 movdqu 0x30($inp), @XMM[11]
1912 pxor @XMM[11], @XMM[6]
1913 movdqu @XMM[6], 0x30($out)
1915 movdqu 0x40($inp), @XMM[12]
1916 pxor @XMM[12], @XMM[3]
1917 movdqu @XMM[3], 0x40($out)
1920 movdqu 0x50($inp), @XMM[13]
1921 pxor @XMM[13], @XMM[7]
1922 movdqu @XMM[7], 0x50($out)
1924 movdqu 0x60($inp), @XMM[14]
1925 pxor @XMM[14], @XMM[2]
1926 movdqu @XMM[2], 0x60($out)
# Short-input CTR path: encrypt the counter block copy kept at 0x20(%rbp)
# through table-based asm_AES_encrypt (keystream written to 0x30(%rbp)),
# xor the keystream into one 16-byte input block, then bump the 32-bit
# counter word in place.  Plain `inc` is correct here because the copy at
# 0x20(%rbp) was byte-swapped via .LSWPUP earlier (1812-1818), so the
# counter dword at offset 0x2c is little-endian.
1931 lea 0x20(%rbp), $arg1
1932 lea 0x30(%rbp), $arg2
1934 call asm_AES_encrypt
1935 movdqu ($inp), @XMM[1]
1937 mov 0x2c(%rbp), %eax # load 32-bit counter
1939 pxor 0x30(%rbp), @XMM[1]
1940 inc %eax # increment
1941 movdqu @XMM[1], ($out)
# BUGFIX: the incremented counter must be stored back to the slot it was
# loaded from, 0x2c(%rbp) (last dword of the counter block at 0x20(%rbp)).
# The original stored to 0x2c(%rsp), i.e. into the bit-sliced key-schedule
# region below %rbp, so the counter was never advanced between
# single-block iterations of this loop.
1944 mov %eax, 0x2c(%rbp) # save 32-bit counter
1951 .Lctr_enc_bzero: # wipe key schedule [if any]
1952 movdqa %xmm0, 0x00(%rax)
1953 movdqa %xmm0, 0x10(%rax)
1954 lea 0x20(%rax), %rax
1958 lea (%rbp),%rsp # restore %rsp
1960 $code.=<<___ if ($win64);
1961 movaps 0x40(%rbp), %xmm6
1962 movaps 0x50(%rbp), %xmm7
1963 movaps 0x60(%rbp), %xmm8
1964 movaps 0x70(%rbp), %xmm9
1965 movaps 0x80(%rbp), %xmm10
1966 movaps 0x90(%rbp), %xmm11
1967 movaps 0xa0(%rbp), %xmm12
1968 movaps 0xb0(%rbp), %xmm13
1969 movaps 0xc0(%rbp), %xmm14
1970 movaps 0xd0(%rbp), %xmm15
1971 lea 0xa0(%rbp), %rsp
1974 mov 0x48(%rsp), %r15
1975 mov 0x50(%rsp), %r14
1976 mov 0x58(%rsp), %r13
1977 mov 0x60(%rsp), %r12
1978 mov 0x68(%rsp), %rbx
1979 mov 0x70(%rsp), %rax
1980 lea 0x78(%rsp), %rsp
1984 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1986 ######################################################################
1987 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1988 # const AES_KEY *key1, const AES_KEY *key2,
1989 # const unsigned char iv[16]);
1991 my ($twmask,$twres,$twtmp)=@XMM[13..15];
1993 .globl bsaes_xts_encrypt
1994 .type bsaes_xts_encrypt,\@abi-omnipotent
2005 lea -0x48(%rsp), %rsp
2007 $code.=<<___ if ($win64);
2008 mov 0xa0(%rsp),$arg5 # pull key2
2009 mov 0xa8(%rsp),$arg6 # pull ivp
2010 lea -0xa0(%rsp), %rsp
2011 movaps %xmm6, 0x40(%rsp)
2012 movaps %xmm7, 0x50(%rsp)
2013 movaps %xmm8, 0x60(%rsp)
2014 movaps %xmm9, 0x70(%rsp)
2015 movaps %xmm10, 0x80(%rsp)
2016 movaps %xmm11, 0x90(%rsp)
2017 movaps %xmm12, 0xa0(%rsp)
2018 movaps %xmm13, 0xb0(%rsp)
2019 movaps %xmm14, 0xc0(%rsp)
2020 movaps %xmm15, 0xd0(%rsp)
2024 mov %rsp, %rbp # backup %rsp
2025 mov $arg1, $inp # backup arguments
2031 lea 0x20(%rbp), $arg2
2033 call asm_AES_encrypt # generate initial tweak
2035 mov 240($key), %eax # rounds
2036 mov $len, %rbx # backup $len
2038 mov %eax, %edx # rounds
2039 shl \$7, %rax # 128 bytes per inner round key
2040 sub \$`128-32`, %rax # size of bit-sliced key schedule
2043 mov %rsp, %rax # pass key schedule
2044 mov $key, %rcx # pass key
2045 mov %edx, %r10d # pass rounds
2046 call _bsaes_key_convert
2047 pxor %xmm6, %xmm7 # fix up last round key
2048 movdqa %xmm7, (%rax) # save last round key
2051 sub \$0x80, %rsp # place for tweak[8]
2052 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2055 movdqa .Lxts_magic(%rip), $twmask
2056 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2065 for ($i=0;$i<7;$i++) {
2067 pshufd \$0x13, $twtmp, $twres
2069 movdqa @XMM[7], @XMM[$i]
2070 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2071 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2072 pand $twmask, $twres # isolate carry and residue
2073 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2074 pxor $twres, @XMM[7]
2076 $code.=<<___ if ($i>=1);
2077 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2079 $code.=<<___ if ($i>=2);
2080 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2084 movdqu 0x60($inp), @XMM[8+6]
2085 pxor @XMM[8+5], @XMM[5]
2086 movdqu 0x70($inp), @XMM[8+7]
2087 lea 0x80($inp), $inp
2088 movdqa @XMM[7], 0x70(%rsp)
2089 pxor @XMM[8+6], @XMM[6]
2090 lea 0x80(%rsp), %rax # pass key schedule
2091 pxor @XMM[8+7], @XMM[7]
2092 mov %edx, %r10d # pass rounds
2094 call _bsaes_encrypt8
2096 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2097 pxor 0x10(%rsp), @XMM[1]
2098 movdqu @XMM[0], 0x00($out) # write output
2099 pxor 0x20(%rsp), @XMM[4]
2100 movdqu @XMM[1], 0x10($out)
2101 pxor 0x30(%rsp), @XMM[6]
2102 movdqu @XMM[4], 0x20($out)
2103 pxor 0x40(%rsp), @XMM[3]
2104 movdqu @XMM[6], 0x30($out)
2105 pxor 0x50(%rsp), @XMM[7]
2106 movdqu @XMM[3], 0x40($out)
2107 pxor 0x60(%rsp), @XMM[2]
2108 movdqu @XMM[7], 0x50($out)
2109 pxor 0x70(%rsp), @XMM[5]
2110 movdqu @XMM[2], 0x60($out)
2111 movdqu @XMM[5], 0x70($out)
2112 lea 0x80($out), $out
2114 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2116 movdqa .Lxts_magic(%rip), $twmask
2117 pcmpgtd @XMM[7], $twtmp
2118 pshufd \$0x13, $twtmp, $twres
2120 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2121 pand $twmask, $twres # isolate carry and residue
2122 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2123 pxor $twres, @XMM[7]
2132 for ($i=0;$i<7;$i++) {
2134 pshufd \$0x13, $twtmp, $twres
2136 movdqa @XMM[7], @XMM[$i]
2137 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2138 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2139 pand $twmask, $twres # isolate carry and residue
2140 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2141 pxor $twres, @XMM[7]
2143 $code.=<<___ if ($i>=1);
2144 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2145 cmp \$`0x10*$i`,$len
2148 $code.=<<___ if ($i>=2);
2149 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2153 movdqu 0x60($inp), @XMM[8+6]
2154 pxor @XMM[8+5], @XMM[5]
2155 movdqa @XMM[7], 0x70(%rsp)
2156 lea 0x70($inp), $inp
2157 pxor @XMM[8+6], @XMM[6]
2158 lea 0x80(%rsp), %rax # pass key schedule
2159 mov %edx, %r10d # pass rounds
2161 call _bsaes_encrypt8
2163 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2164 pxor 0x10(%rsp), @XMM[1]
2165 movdqu @XMM[0], 0x00($out) # write output
2166 pxor 0x20(%rsp), @XMM[4]
2167 movdqu @XMM[1], 0x10($out)
2168 pxor 0x30(%rsp), @XMM[6]
2169 movdqu @XMM[4], 0x20($out)
2170 pxor 0x40(%rsp), @XMM[3]
2171 movdqu @XMM[6], 0x30($out)
2172 pxor 0x50(%rsp), @XMM[7]
2173 movdqu @XMM[3], 0x40($out)
2174 pxor 0x60(%rsp), @XMM[2]
2175 movdqu @XMM[7], 0x50($out)
2176 movdqu @XMM[2], 0x60($out)
2177 lea 0x70($out), $out
2179 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2183 pxor @XMM[8+4], @XMM[4]
2184 lea 0x60($inp), $inp
2185 pxor @XMM[8+5], @XMM[5]
2186 lea 0x80(%rsp), %rax # pass key schedule
2187 mov %edx, %r10d # pass rounds
2189 call _bsaes_encrypt8
2191 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2192 pxor 0x10(%rsp), @XMM[1]
2193 movdqu @XMM[0], 0x00($out) # write output
2194 pxor 0x20(%rsp), @XMM[4]
2195 movdqu @XMM[1], 0x10($out)
2196 pxor 0x30(%rsp), @XMM[6]
2197 movdqu @XMM[4], 0x20($out)
2198 pxor 0x40(%rsp), @XMM[3]
2199 movdqu @XMM[6], 0x30($out)
2200 pxor 0x50(%rsp), @XMM[7]
2201 movdqu @XMM[3], 0x40($out)
2202 movdqu @XMM[7], 0x50($out)
2203 lea 0x60($out), $out
2205 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2209 pxor @XMM[8+3], @XMM[3]
2210 lea 0x50($inp), $inp
2211 pxor @XMM[8+4], @XMM[4]
2212 lea 0x80(%rsp), %rax # pass key schedule
2213 mov %edx, %r10d # pass rounds
2215 call _bsaes_encrypt8
2217 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2218 pxor 0x10(%rsp), @XMM[1]
2219 movdqu @XMM[0], 0x00($out) # write output
2220 pxor 0x20(%rsp), @XMM[4]
2221 movdqu @XMM[1], 0x10($out)
2222 pxor 0x30(%rsp), @XMM[6]
2223 movdqu @XMM[4], 0x20($out)
2224 pxor 0x40(%rsp), @XMM[3]
2225 movdqu @XMM[6], 0x30($out)
2226 movdqu @XMM[3], 0x40($out)
2227 lea 0x50($out), $out
2229 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2233 pxor @XMM[8+2], @XMM[2]
2234 lea 0x40($inp), $inp
2235 pxor @XMM[8+3], @XMM[3]
2236 lea 0x80(%rsp), %rax # pass key schedule
2237 mov %edx, %r10d # pass rounds
2239 call _bsaes_encrypt8
2241 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2242 pxor 0x10(%rsp), @XMM[1]
2243 movdqu @XMM[0], 0x00($out) # write output
2244 pxor 0x20(%rsp), @XMM[4]
2245 movdqu @XMM[1], 0x10($out)
2246 pxor 0x30(%rsp), @XMM[6]
2247 movdqu @XMM[4], 0x20($out)
2248 movdqu @XMM[6], 0x30($out)
2249 lea 0x40($out), $out
2251 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2255 pxor @XMM[8+1], @XMM[1]
2256 lea 0x30($inp), $inp
2257 pxor @XMM[8+2], @XMM[2]
2258 lea 0x80(%rsp), %rax # pass key schedule
2259 mov %edx, %r10d # pass rounds
2261 call _bsaes_encrypt8
2263 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2264 pxor 0x10(%rsp), @XMM[1]
2265 movdqu @XMM[0], 0x00($out) # write output
2266 pxor 0x20(%rsp), @XMM[4]
2267 movdqu @XMM[1], 0x10($out)
2268 movdqu @XMM[4], 0x20($out)
2269 lea 0x30($out), $out
2271 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2275 pxor @XMM[8+0], @XMM[0]
2276 lea 0x20($inp), $inp
2277 pxor @XMM[8+1], @XMM[1]
2278 lea 0x80(%rsp), %rax # pass key schedule
2279 mov %edx, %r10d # pass rounds
2281 call _bsaes_encrypt8
2283 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2284 pxor 0x10(%rsp), @XMM[1]
2285 movdqu @XMM[0], 0x00($out) # write output
2286 movdqu @XMM[1], 0x10($out)
2287 lea 0x20($out), $out
2289 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2293 pxor @XMM[0], @XMM[8]
2294 lea 0x10($inp), $inp
2295 movdqa @XMM[8], 0x20(%rbp)
2296 lea 0x20(%rbp), $arg1
2297 lea 0x20(%rbp), $arg2
2299 call asm_AES_encrypt # doesn't touch %xmm
2300 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2301 #pxor @XMM[8], @XMM[0]
2302 #lea 0x80(%rsp), %rax # pass key schedule
2303 #mov %edx, %r10d # pass rounds
2304 #call _bsaes_encrypt8
2305 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2306 movdqu @XMM[0], 0x00($out) # write output
2307 lea 0x10($out), $out
2309 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2318 movzb -16(%rdx), %ecx
2326 movdqu -16($out), @XMM[0]
2327 lea 0x20(%rbp), $arg1
2328 pxor @XMM[7], @XMM[0]
2329 lea 0x20(%rbp), $arg2
2330 movdqa @XMM[0], 0x20(%rbp)
2332 call asm_AES_encrypt # doesn't touch %xmm
2333 pxor 0x20(%rbp), @XMM[7]
2334 movdqu @XMM[7], -16($out)
2339 .Lxts_enc_bzero: # wipe key schedule [if any]
2340 movdqa %xmm0, 0x00(%rax)
2341 movdqa %xmm0, 0x10(%rax)
2342 lea 0x20(%rax), %rax
2346 lea (%rbp),%rsp # restore %rsp
2348 $code.=<<___ if ($win64);
2349 movaps 0x40(%rbp), %xmm6
2350 movaps 0x50(%rbp), %xmm7
2351 movaps 0x60(%rbp), %xmm8
2352 movaps 0x70(%rbp), %xmm9
2353 movaps 0x80(%rbp), %xmm10
2354 movaps 0x90(%rbp), %xmm11
2355 movaps 0xa0(%rbp), %xmm12
2356 movaps 0xb0(%rbp), %xmm13
2357 movaps 0xc0(%rbp), %xmm14
2358 movaps 0xd0(%rbp), %xmm15
2359 lea 0xa0(%rbp), %rsp
2362 mov 0x48(%rsp), %r15
2363 mov 0x50(%rsp), %r14
2364 mov 0x58(%rsp), %r13
2365 mov 0x60(%rsp), %r12
2366 mov 0x68(%rsp), %rbx
2367 mov 0x70(%rsp), %rax
2368 lea 0x78(%rsp), %rsp
2372 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2374 .globl bsaes_xts_decrypt
2375 .type bsaes_xts_decrypt,\@abi-omnipotent
2386 lea -0x48(%rsp), %rsp
2388 $code.=<<___ if ($win64);
2389 mov 0xa0(%rsp),$arg5 # pull key2
2390 mov 0xa8(%rsp),$arg6 # pull ivp
2391 lea -0xa0(%rsp), %rsp
2392 movaps %xmm6, 0x40(%rsp)
2393 movaps %xmm7, 0x50(%rsp)
2394 movaps %xmm8, 0x60(%rsp)
2395 movaps %xmm9, 0x70(%rsp)
2396 movaps %xmm10, 0x80(%rsp)
2397 movaps %xmm11, 0x90(%rsp)
2398 movaps %xmm12, 0xa0(%rsp)
2399 movaps %xmm13, 0xb0(%rsp)
2400 movaps %xmm14, 0xc0(%rsp)
2401 movaps %xmm15, 0xd0(%rsp)
2405 mov %rsp, %rbp # backup %rsp
2406 mov $arg1, $inp # backup arguments
2412 lea 0x20(%rbp), $arg2
2414 call asm_AES_encrypt # generate initial tweak
2416 mov 240($key), %eax # rounds
2417 mov $len, %rbx # backup $len
2419 mov %eax, %edx # rounds
2420 shl \$7, %rax # 128 bytes per inner round key
2421 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert the AES key schedule to bit-sliced form for decryption.
# Unlike the encrypt path (2043-2048), where the fix-up constant in %xmm6
# is xored into the LAST round key, decryption walks the schedule
# backwards: here the leftover %xmm7 is folded into the ROUND-0 key at
# (%rsp) and %xmm6 becomes the last round key instead.
# NOTE(review): exact meaning of %xmm6/%xmm7 on return is defined by
# _bsaes_key_convert, which is outside this chunk — confirm there.
2424 mov %rsp, %rax # pass key schedule
2425 mov $key, %rcx # pass key
2426 mov %edx, %r10d # pass rounds
2427 call _bsaes_key_convert
2428 pxor (%rsp), %xmm7 # fix up round 0 key
2429 movdqa %xmm6, (%rax) # save last round key
2430 movdqa %xmm7, (%rsp)
# %eax cleared for the "len is not a multiple of 16" adjustment that
# follows (partially elided here).
2432 xor %eax, %eax # if ($len%16) len-=16;
2439 sub \$0x80, %rsp # place for tweak[8]
2440 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2443 movdqa .Lxts_magic(%rip), $twmask
2444 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2453 for ($i=0;$i<7;$i++) {
2455 pshufd \$0x13, $twtmp, $twres
2457 movdqa @XMM[7], @XMM[$i]
2458 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2459 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2460 pand $twmask, $twres # isolate carry and residue
2461 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2462 pxor $twres, @XMM[7]
2464 $code.=<<___ if ($i>=1);
2465 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2467 $code.=<<___ if ($i>=2);
2468 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2472 movdqu 0x60($inp), @XMM[8+6]
2473 pxor @XMM[8+5], @XMM[5]
2474 movdqu 0x70($inp), @XMM[8+7]
2475 lea 0x80($inp), $inp
2476 movdqa @XMM[7], 0x70(%rsp)
2477 pxor @XMM[8+6], @XMM[6]
2478 lea 0x80(%rsp), %rax # pass key schedule
2479 pxor @XMM[8+7], @XMM[7]
2480 mov %edx, %r10d # pass rounds
2482 call _bsaes_decrypt8
2484 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2485 pxor 0x10(%rsp), @XMM[1]
2486 movdqu @XMM[0], 0x00($out) # write output
2487 pxor 0x20(%rsp), @XMM[6]
2488 movdqu @XMM[1], 0x10($out)
2489 pxor 0x30(%rsp), @XMM[4]
2490 movdqu @XMM[6], 0x20($out)
2491 pxor 0x40(%rsp), @XMM[2]
2492 movdqu @XMM[4], 0x30($out)
2493 pxor 0x50(%rsp), @XMM[7]
2494 movdqu @XMM[2], 0x40($out)
2495 pxor 0x60(%rsp), @XMM[3]
2496 movdqu @XMM[7], 0x50($out)
2497 pxor 0x70(%rsp), @XMM[5]
2498 movdqu @XMM[3], 0x60($out)
2499 movdqu @XMM[5], 0x70($out)
2500 lea 0x80($out), $out
2502 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2504 movdqa .Lxts_magic(%rip), $twmask
2505 pcmpgtd @XMM[7], $twtmp
2506 pshufd \$0x13, $twtmp, $twres
2508 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2509 pand $twmask, $twres # isolate carry and residue
2510 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2511 pxor $twres, @XMM[7]
2520 for ($i=0;$i<7;$i++) {
2522 pshufd \$0x13, $twtmp, $twres
2524 movdqa @XMM[7], @XMM[$i]
2525 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2526 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2527 pand $twmask, $twres # isolate carry and residue
2528 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2529 pxor $twres, @XMM[7]
2531 $code.=<<___ if ($i>=1);
2532 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2533 cmp \$`0x10*$i`,$len
2536 $code.=<<___ if ($i>=2);
2537 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2541 movdqu 0x60($inp), @XMM[8+6]
2542 pxor @XMM[8+5], @XMM[5]
2543 movdqa @XMM[7], 0x70(%rsp)
2544 lea 0x70($inp), $inp
2545 pxor @XMM[8+6], @XMM[6]
2546 lea 0x80(%rsp), %rax # pass key schedule
2547 mov %edx, %r10d # pass rounds
2549 call _bsaes_decrypt8
2551 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2552 pxor 0x10(%rsp), @XMM[1]
2553 movdqu @XMM[0], 0x00($out) # write output
2554 pxor 0x20(%rsp), @XMM[6]
2555 movdqu @XMM[1], 0x10($out)
2556 pxor 0x30(%rsp), @XMM[4]
2557 movdqu @XMM[6], 0x20($out)
2558 pxor 0x40(%rsp), @XMM[2]
2559 movdqu @XMM[4], 0x30($out)
2560 pxor 0x50(%rsp), @XMM[7]
2561 movdqu @XMM[2], 0x40($out)
2562 pxor 0x60(%rsp), @XMM[3]
2563 movdqu @XMM[7], 0x50($out)
2564 movdqu @XMM[3], 0x60($out)
2565 lea 0x70($out), $out
2567 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2571 pxor @XMM[8+4], @XMM[4]
2572 lea 0x60($inp), $inp
2573 pxor @XMM[8+5], @XMM[5]
2574 lea 0x80(%rsp), %rax # pass key schedule
2575 mov %edx, %r10d # pass rounds
2577 call _bsaes_decrypt8
2579 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2580 pxor 0x10(%rsp), @XMM[1]
2581 movdqu @XMM[0], 0x00($out) # write output
2582 pxor 0x20(%rsp), @XMM[6]
2583 movdqu @XMM[1], 0x10($out)
2584 pxor 0x30(%rsp), @XMM[4]
2585 movdqu @XMM[6], 0x20($out)
2586 pxor 0x40(%rsp), @XMM[2]
2587 movdqu @XMM[4], 0x30($out)
2588 pxor 0x50(%rsp), @XMM[7]
2589 movdqu @XMM[2], 0x40($out)
2590 movdqu @XMM[7], 0x50($out)
2591 lea 0x60($out), $out
2593 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2597 pxor @XMM[8+3], @XMM[3]
2598 lea 0x50($inp), $inp
2599 pxor @XMM[8+4], @XMM[4]
2600 lea 0x80(%rsp), %rax # pass key schedule
2601 mov %edx, %r10d # pass rounds
2603 call _bsaes_decrypt8
2605 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2606 pxor 0x10(%rsp), @XMM[1]
2607 movdqu @XMM[0], 0x00($out) # write output
2608 pxor 0x20(%rsp), @XMM[6]
2609 movdqu @XMM[1], 0x10($out)
2610 pxor 0x30(%rsp), @XMM[4]
2611 movdqu @XMM[6], 0x20($out)
2612 pxor 0x40(%rsp), @XMM[2]
2613 movdqu @XMM[4], 0x30($out)
2614 movdqu @XMM[2], 0x40($out)
2615 lea 0x50($out), $out
2617 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2621 pxor @XMM[8+2], @XMM[2]
2622 lea 0x40($inp), $inp
2623 pxor @XMM[8+3], @XMM[3]
2624 lea 0x80(%rsp), %rax # pass key schedule
2625 mov %edx, %r10d # pass rounds
2627 call _bsaes_decrypt8
2629 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2630 pxor 0x10(%rsp), @XMM[1]
2631 movdqu @XMM[0], 0x00($out) # write output
2632 pxor 0x20(%rsp), @XMM[6]
2633 movdqu @XMM[1], 0x10($out)
2634 pxor 0x30(%rsp), @XMM[4]
2635 movdqu @XMM[6], 0x20($out)
2636 movdqu @XMM[4], 0x30($out)
2637 lea 0x40($out), $out
2639 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2643 pxor @XMM[8+1], @XMM[1]
2644 lea 0x30($inp), $inp
2645 pxor @XMM[8+2], @XMM[2]
2646 lea 0x80(%rsp), %rax # pass key schedule
2647 mov %edx, %r10d # pass rounds
2649 call _bsaes_decrypt8
2651 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2652 pxor 0x10(%rsp), @XMM[1]
2653 movdqu @XMM[0], 0x00($out) # write output
2654 pxor 0x20(%rsp), @XMM[6]
2655 movdqu @XMM[1], 0x10($out)
2656 movdqu @XMM[6], 0x20($out)
2657 lea 0x30($out), $out
2659 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2663 pxor @XMM[8+0], @XMM[0]
2664 lea 0x20($inp), $inp
2665 pxor @XMM[8+1], @XMM[1]
2666 lea 0x80(%rsp), %rax # pass key schedule
2667 mov %edx, %r10d # pass rounds
2669 call _bsaes_decrypt8
2671 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2672 pxor 0x10(%rsp), @XMM[1]
2673 movdqu @XMM[0], 0x00($out) # write output
2674 movdqu @XMM[1], 0x10($out)
2675 lea 0x20($out), $out
2677 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2681 pxor @XMM[0], @XMM[8]
2682 lea 0x10($inp), $inp
2683 movdqa @XMM[8], 0x20(%rbp)
2684 lea 0x20(%rbp), $arg1
2685 lea 0x20(%rbp), $arg2
2687 call asm_AES_decrypt # doesn't touch %xmm
2688 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2689 #pxor @XMM[8], @XMM[0]
2690 #lea 0x80(%rsp), %rax # pass key schedule
2691 #mov %edx, %r10d # pass rounds
2692 #call _bsaes_decrypt8
2693 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2694 movdqu @XMM[0], 0x00($out) # write output
2695 lea 0x10($out), $out
2697 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# XTS-decrypt tail handling for a partial last block: advance the tweak
# once more in GF(2^128) using the usual trick — pcmpgtd broadcasts the
# top bit's sign, pshufd 0x13 moves the carry into place, pand isolates
# carry+residue, and paddq doubles both 64-bit halves.
2704 movdqa .Lxts_magic(%rip), $twmask
2705 pcmpgtd @XMM[7], $twtmp
2706 pshufd \$0x13, $twtmp, $twres
# Keep the CURRENT tweak in @XMM[6]: it is consumed later (2735) when the
# stolen-ciphertext block is decrypted — presumably the ciphertext-stealing
# step; confirm against the elided .Lxts_dec_* labels.
2707 movdqa @XMM[7], @XMM[6]
2708 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2709 pand $twmask, $twres # isolate carry and residue
2710 movdqu ($inp), @XMM[0]
2711 pxor $twres, @XMM[7]
# Decrypt one block with the ADVANCED tweak @XMM[7]: xor tweak in, run it
# through table-based asm_AES_decrypt via the 0x20(%rbp) bounce buffer,
# xor the tweak back out, and store.
2713 lea 0x20(%rbp), $arg1
2714 pxor @XMM[7], @XMM[0]
2715 lea 0x20(%rbp), $arg2
2716 movdqa @XMM[0], 0x20(%rbp)
2718 call asm_AES_decrypt # doesn't touch %xmm
2719 pxor 0x20(%rbp), @XMM[7]
2721 movdqu @XMM[7], ($out)
2724 movzb 16($inp), %eax
2733 movdqu ($out), @XMM[0]
2734 lea 0x20(%rbp), $arg1
2735 pxor @XMM[6], @XMM[0]
2736 lea 0x20(%rbp), $arg2
2737 movdqa @XMM[0], 0x20(%rbp)
2739 call asm_AES_decrypt # doesn't touch %xmm
2740 pxor 0x20(%rbp), @XMM[6]
2741 movdqu @XMM[6], ($out)
2746 .Lxts_dec_bzero: # wipe key schedule [if any]
2747 movdqa %xmm0, 0x00(%rax)
2748 movdqa %xmm0, 0x10(%rax)
2749 lea 0x20(%rax), %rax
2753 lea (%rbp),%rsp # restore %rsp
2755 $code.=<<___ if ($win64);
2756 movaps 0x40(%rbp), %xmm6
2757 movaps 0x50(%rbp), %xmm7
2758 movaps 0x60(%rbp), %xmm8
2759 movaps 0x70(%rbp), %xmm9
2760 movaps 0x80(%rbp), %xmm10
2761 movaps 0x90(%rbp), %xmm11
2762 movaps 0xa0(%rbp), %xmm12
2763 movaps 0xb0(%rbp), %xmm13
2764 movaps 0xc0(%rbp), %xmm14
2765 movaps 0xd0(%rbp), %xmm15
2766 lea 0xa0(%rbp), %rsp
2769 mov 0x48(%rsp), %r15
2770 mov 0x50(%rsp), %r14
2771 mov 0x58(%rsp), %r13
2772 mov 0x60(%rsp), %r12
2773 mov 0x68(%rsp), %rbx
2774 mov 0x70(%rsp), %rax
2775 lea 0x78(%rsp), %rsp
2779 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2783 .type _bsaes_const,\@object
2786 .LM0ISR: # InvShiftRows constants
2787 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2789 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2791 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2792 .LBS0: # bit-slice constants
2793 .quad 0x5555555555555555, 0x5555555555555555
2795 .quad 0x3333333333333333, 0x3333333333333333
2797 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2798 .LSR: # shiftrows constants
2799 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2801 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2803 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2805 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2806 .LNOT: # magic constants
2807 .quad 0xffffffffffffffff, 0xffffffffffffffff
2809 .quad 0x6363636363636363, 0x6363636363636363
2810 .LSWPUP: # byte-swap upper dword
2811 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2813 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2814 .LADD1: # counter increment constants
2815 .quad 0x0000000000000000, 0x0000000100000000
2817 .quad 0x0000000000000000, 0x0000000200000000
2819 .quad 0x0000000000000000, 0x0000000300000000
2821 .quad 0x0000000000000000, 0x0000000400000000
2823 .quad 0x0000000000000000, 0x0000000500000000
2825 .quad 0x0000000000000000, 0x0000000600000000
2827 .quad 0x0000000000000000, 0x0000000700000000
2829 .quad 0x0000000000000000, 0x0000000800000000
2832 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2834 .size _bsaes_const,.-_bsaes_const
2837 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2838 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2846 .extern __imp_RtlVirtualUnwind
2847 .type se_handler,\@abi-omnipotent
2861 mov 120($context),%rax # pull context->Rax
2862 mov 248($context),%rbx # pull context->Rip
2864 mov 8($disp),%rsi # disp->ImageBase
2865 mov 56($disp),%r11 # disp->HandlerData
2867 mov 0(%r11),%r10d # HandlerData[0]
2868 lea (%rsi,%r10),%r10 # prologue label
2869 cmp %r10,%rbx # context->Rip<prologue label
2872 mov 152($context),%rax # pull context->Rsp
2874 mov 4(%r11),%r10d # HandlerData[1]
2875 lea (%rsi,%r10),%r10 # epilogue label
2876 cmp %r10,%rbx # context->Rip>=epilogue label
2879 mov 160($context),%rax # pull context->Rbp
2881 lea 0x40(%rax),%rsi # %xmm save area
2882 lea 512($context),%rdi # &context.Xmm6
2883 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2884 .long 0xa548f3fc # cld; rep movsq
2885 lea 0xa0(%rax),%rax # adjust stack pointer
2893 lea 0x78(%rax),%rax # adjust stack pointer
2894 mov %rbx,144($context) # restore context->Rbx
2895 mov %rbp,160($context) # restore context->Rbp
2896 mov %r12,216($context) # restore context->R12
2897 mov %r13,224($context) # restore context->R13
2898 mov %r14,232($context) # restore context->R14
2899 mov %r15,240($context) # restore context->R15
2902 mov %rax,152($context) # restore context->Rsp
2904 mov 40($disp),%rdi # disp->ContextRecord
2905 mov $context,%rsi # context
2906 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2907 .long 0xa548f3fc # cld; rep movsq
2910 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2911 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2912 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2913 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2914 mov 40(%rsi),%r10 # disp->ContextRecord
2915 lea 56(%rsi),%r11 # &disp->HandlerData
2916 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2917 mov %r10,32(%rsp) # arg5
2918 mov %r11,40(%rsp) # arg6
2919 mov %r12,48(%rsp) # arg7
2920 mov %rcx,56(%rsp) # arg8, (NULL)
2921 call *__imp_RtlVirtualUnwind(%rip)
2923 mov \$1,%eax # ExceptionContinueSearch
2935 .size se_handler,.-se_handler
2940 $code.=<<___ if ($ecb);
2941 .rva .Lecb_enc_prologue
2942 .rva .Lecb_enc_epilogue
2945 .rva .Lecb_dec_prologue
2946 .rva .Lecb_dec_epilogue
2950 .rva .Lcbc_dec_prologue
2951 .rva .Lcbc_dec_epilogue
2954 .rva .Lctr_enc_prologue
2955 .rva .Lctr_enc_epilogue
2958 .rva .Lxts_enc_prologue
2959 .rva .Lxts_enc_epilogue
2962 .rva .Lxts_dec_prologue
2963 .rva .Lxts_dec_epilogue
2969 $code.=<<___ if ($ecb);
2973 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
2977 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
2983 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
2987 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
2991 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
2995 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
2999 $code =~ s/\`([^\`]*)\`/eval($1)/gem;