3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In an attempt to address the deterioration, sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially the shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
88 # Atom 18.9 (estimated, not measured yet)
92 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93 # suboptimal, but XTS is meant to be used with larger blocks...
99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106 die "can't locate x86_64-xlate.pl";
108 open OUT,"| \"$^X\" $xlate $flavour $output";
111 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
116 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
119 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
130 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
153 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
174 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
184 sub InvInBasisChange { # OutBasisChange in reverse
185 my @b=@_[5,1,2,6,3,7,0,4];
203 sub InvOutBasisChange { # InBasisChange in reverse
204 my @b=@_[2,5,7,3,6,1,0,4];
225 #;*************************************************************
226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227 #;*************************************************************
228 my ($x0,$x1,$y0,$y1,$t0)=@_;
241 sub Mul_GF4_N { # not used, see next subroutine
242 # multiply and scale by N
243 my ($x0,$x1,$y0,$y1,$t0)=@_;
257 # interleaved Mul_GF4_N and Mul_GF4
258 my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
322 #;********************************************************************
323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324 #;********************************************************************
328 # direct optimizations from hardware
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
385 # new smaller inversion
419 # output in s3, s2, s1, t1
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
429 # AES linear components
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
438 pxor 0x20($key),@x[2]
440 pxor 0x30($key),@x[3]
442 pxor 0x40($key),@x[4]
444 pxor 0x50($key),@x[5]
446 pxor 0x60($key),@x[6]
448 pxor 0x70($key),@x[7]
456 # modified to emit output in order suitable for feeding back to aesenc[last]
459 my $inv=@_[16]; # optional
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
466 pshufd \$0x93, @x[3], @t[3]
468 pshufd \$0x93, @x[4], @t[4]
470 pshufd \$0x93, @x[5], @t[5]
472 pshufd \$0x93, @x[6], @t[6]
474 pshufd \$0x93, @x[7], @t[7]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
483 pshufd \$0x4E, @x[1], @x[1]
489 pshufd \$0x4E, @x[4], @t[0]
491 pshufd \$0x4E, @x[5], @t[1]
493 pshufd \$0x4E, @x[3], @x[4]
495 pshufd \$0x4E, @x[7], @x[5]
497 pshufd \$0x4E, @x[6], @x[3]
499 pshufd \$0x4E, @x[2], @x[6]
502 $code.=<<___ if (!$inv);
510 $code.=<<___ if ($inv);
523 sub InvMixColumns_orig {
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
555 my @y = @x[7,5,0,2,1,3,4,6];
557 # multiplication by 0x0b
561 pshufd \$0x93, @t[2], @t[2]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
570 pshufd \$0x93, @t[0], @t[0]
574 pshufd \$0x93, @t[1], @t[1]
578 pshufd \$0x93, @t[2], @t[2]
582 pshufd \$0x93, @t[3], @t[3]
588 pxor @t[5], @t[7] # clobber t[7] even more
591 pshufd \$0x93, @t[4], @t[4]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
599 # multiplication by 0x0d
602 pshufd \$0x93, @t[6], @t[6]
606 pshufd \$0x93, @t[7], @t[7]
615 pshufd \$0x93, @t[0], @t[0]
619 pshufd \$0x93, @t[1], @t[1]
624 pshufd \$0x93, @t[2], @t[2]
626 pxor @t[3], @t[6] # clobber t[6]
633 pshufd \$0x93, @t[4], @t[4]
636 pxor @t[3], @t[6] # restore t[6]
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
643 # multiplication by 0x09
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
649 pxor @y[0], @t[0] # t[0]=y[0]
651 pxor @t[7], @t[6] # clobber t[6]
654 pxor @y[4], @t[4] # t[4]=y[4]
656 pxor @y[3], @t[3] # t[3]=y[3]
658 pxor @y[2], @t[2] # t[2]=y[2]
660 pxor @y[5], @t[5] # t[5]=y[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
681 # Thanks to Jussi Kivilinna for providing pointer to
683 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
693 pshufd \$0x4E, @x[7], @t[7]
695 pshufd \$0x4E, @x[1], @t[1]
697 pshufd \$0x4E, @x[2], @t[2]
699 pshufd \$0x4E, @x[3], @t[3]
703 pshufd \$0x4E, @x[4], @t[4]
707 pshufd \$0x4E, @x[5], @t[5]
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
725 sub aesenc { # not used
729 movdqa 0x30($const),@t[0] # .LSR
731 &ShiftRows (@b,@t[0]);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
736 sub aesenclast { # not used
740 movdqa 0x40($const),@t[0] # .LSRM0
742 &ShiftRows (@b,@t[0]);
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
757 my ($a,$b,$n,$mask,$t)=@_;
769 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
789 my @x=reverse(@_[0..7]);
790 my ($t0,$t1,$t2,$t3)=@_[8..11];
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
798 movdqa 0x20($const),$t0 # .LBS2
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
810 .extern asm_AES_encrypt
811 .extern asm_AES_decrypt
813 .type _bsaes_encrypt8,\@abi-omnipotent
816 lea .LBS0(%rip), $const # constants table
818 movdqa ($key), @XMM[9] # round 0 key
820 movdqa 0x50($const), @XMM[8] # .LM0SR
821 pxor @XMM[9], @XMM[0] # xor with round0 key
822 pxor @XMM[9], @XMM[1]
823 pshufb @XMM[8], @XMM[0]
824 pxor @XMM[9], @XMM[2]
825 pshufb @XMM[8], @XMM[1]
826 pxor @XMM[9], @XMM[3]
827 pshufb @XMM[8], @XMM[2]
828 pxor @XMM[9], @XMM[4]
829 pshufb @XMM[8], @XMM[3]
830 pxor @XMM[9], @XMM[5]
831 pshufb @XMM[8], @XMM[4]
832 pxor @XMM[9], @XMM[6]
833 pshufb @XMM[8], @XMM[5]
834 pxor @XMM[9], @XMM[7]
835 pshufb @XMM[8], @XMM[6]
836 pshufb @XMM[8], @XMM[7]
837 _bsaes_encrypt8_bitslice:
839 &bitslice (@XMM[0..7, 8..11]);
846 &ShiftRows (@XMM[0..7, 8]);
847 $code.=".Lenc_sbox:\n";
848 &Sbox (@XMM[0..7, 8..15]);
853 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855 movdqa 0x30($const), @XMM[8] # .LSR
857 movdqa 0x40($const), @XMM[8] # .LSRM0
862 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
863 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[6]
868 pxor @XMM[8], @XMM[3]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[2]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
875 .size _bsaes_encrypt8,.-_bsaes_encrypt8
877 .type _bsaes_decrypt8,\@abi-omnipotent
880 lea .LBS0(%rip), $const # constants table
882 movdqa ($key), @XMM[9] # round 0 key
884 movdqa -0x30($const), @XMM[8] # .LM0ISR
885 pxor @XMM[9], @XMM[0] # xor with round0 key
886 pxor @XMM[9], @XMM[1]
887 pshufb @XMM[8], @XMM[0]
888 pxor @XMM[9], @XMM[2]
889 pshufb @XMM[8], @XMM[1]
890 pxor @XMM[9], @XMM[3]
891 pshufb @XMM[8], @XMM[2]
892 pxor @XMM[9], @XMM[4]
893 pshufb @XMM[8], @XMM[3]
894 pxor @XMM[9], @XMM[5]
895 pshufb @XMM[8], @XMM[4]
896 pxor @XMM[9], @XMM[6]
897 pshufb @XMM[8], @XMM[5]
898 pxor @XMM[9], @XMM[7]
899 pshufb @XMM[8], @XMM[6]
900 pshufb @XMM[8], @XMM[7]
902 &bitslice (@XMM[0..7, 8..11]);
909 &ShiftRows (@XMM[0..7, 8]);
910 $code.=".Ldec_sbox:\n";
911 &InvSbox (@XMM[0..7, 8..15]);
916 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
918 movdqa -0x10($const), @XMM[8] # .LISR
920 movdqa -0x20($const), @XMM[8] # .LISRM0
925 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
927 movdqa ($key), @XMM[8] # last round key
928 pxor @XMM[8], @XMM[6]
929 pxor @XMM[8], @XMM[4]
930 pxor @XMM[8], @XMM[2]
931 pxor @XMM[8], @XMM[7]
932 pxor @XMM[8], @XMM[3]
933 pxor @XMM[8], @XMM[5]
934 pxor @XMM[8], @XMM[0]
935 pxor @XMM[8], @XMM[1]
937 .size _bsaes_decrypt8,.-_bsaes_decrypt8
941 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944 my @x=reverse(@_[0..7]);
945 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
947 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
949 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
953 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
955 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
957 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
963 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
964 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
968 .type _bsaes_key_convert,\@abi-omnipotent
971 lea .Lmasks(%rip), $const
972 movdqu ($inp), %xmm7 # load round 0 key
974 movdqa 0x00($const), %xmm0 # 0x01...
975 movdqa 0x10($const), %xmm1 # 0x02...
976 movdqa 0x20($const), %xmm2 # 0x04...
977 movdqa 0x30($const), %xmm3 # 0x08...
978 movdqa 0x40($const), %xmm4 # .LM0
979 pcmpeqd %xmm5, %xmm5 # .LNOT
981 movdqu ($inp), %xmm6 # load round 1 key
982 movdqa %xmm7, ($out) # save round 0 key
988 pshufb %xmm4, %xmm6 # .LM0
997 psllq \$4, %xmm0 # 0x10...
1000 psllq \$4, %xmm1 # 0x20...
1004 movdqa %xmm0, %xmm12
1005 pcmpeqb %xmm2, %xmm10
1006 psllq \$4, %xmm2 # 0x40...
1007 movdqa %xmm1, %xmm13
1008 pcmpeqb %xmm3, %xmm11
1009 psllq \$4, %xmm3 # 0x80...
1011 movdqa %xmm2, %xmm14
1012 movdqa %xmm3, %xmm15
1013 pxor %xmm5, %xmm8 # "pnot"
1018 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1019 pcmpeqb %xmm0, %xmm12
1020 psrlq \$4, %xmm0 # 0x01...
1021 movdqa %xmm9, 0x10($out)
1022 pcmpeqb %xmm1, %xmm13
1023 psrlq \$4, %xmm1 # 0x02...
1024 lea 0x10($inp), $inp
1028 movdqa %xmm10, 0x20($out)
1029 pcmpeqb %xmm2, %xmm14
1030 psrlq \$4, %xmm2 # 0x04...
1031 movdqa %xmm11, 0x30($out)
1032 pcmpeqb %xmm3, %xmm15
1033 psrlq \$4, %xmm3 # 0x08...
1034 movdqu ($inp), %xmm6 # load next round key
1036 pxor %xmm5, %xmm13 # "pnot"
1038 movdqa %xmm12, 0x40($out)
1039 movdqa %xmm13, 0x50($out)
1040 movdqa %xmm14, 0x60($out)
1041 movdqa %xmm15, 0x70($out)
1046 movdqa 0x50($const), %xmm7 # .L63
1047 #movdqa %xmm6, ($out) # don't save last round key
1049 .size _bsaes_key_convert,.-_bsaes_key_convert
1053 if (0 && !$win64) { # following four functions are unsupported interface
1054 # used for benchmarking...
1056 .globl bsaes_enc_key_convert
1057 .type bsaes_enc_key_convert,\@function,2
1059 bsaes_enc_key_convert:
1060 mov 240($inp),%r10d # pass rounds
1061 mov $inp,%rcx # pass key
1062 mov $out,%rax # pass key schedule
1063 call _bsaes_key_convert
1064 pxor %xmm6,%xmm7 # fix up last round key
1065 movdqa %xmm7,(%rax) # save last round key
1067 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1069 .globl bsaes_encrypt_128
1070 .type bsaes_encrypt_128,\@function,4
1074 movdqu 0x00($inp), @XMM[0] # load input
1075 movdqu 0x10($inp), @XMM[1]
1076 movdqu 0x20($inp), @XMM[2]
1077 movdqu 0x30($inp), @XMM[3]
1078 movdqu 0x40($inp), @XMM[4]
1079 movdqu 0x50($inp), @XMM[5]
1080 movdqu 0x60($inp), @XMM[6]
1081 movdqu 0x70($inp), @XMM[7]
1082 mov $key, %rax # pass the $key
1083 lea 0x80($inp), $inp
1086 call _bsaes_encrypt8
1088 movdqu @XMM[0], 0x00($out) # write output
1089 movdqu @XMM[1], 0x10($out)
1090 movdqu @XMM[4], 0x20($out)
1091 movdqu @XMM[6], 0x30($out)
1092 movdqu @XMM[3], 0x40($out)
1093 movdqu @XMM[7], 0x50($out)
1094 movdqu @XMM[2], 0x60($out)
1095 movdqu @XMM[5], 0x70($out)
1096 lea 0x80($out), $out
1100 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1102 .globl bsaes_dec_key_convert
1103 .type bsaes_dec_key_convert,\@function,2
1105 bsaes_dec_key_convert:
1106 mov 240($inp),%r10d # pass rounds
1107 mov $inp,%rcx # pass key
1108 mov $out,%rax # pass key schedule
1109 call _bsaes_key_convert
1110 pxor ($out),%xmm7 # fix up round 0 key
1111 movdqa %xmm6,(%rax) # save last round key
1114 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1116 .globl bsaes_decrypt_128
1117 .type bsaes_decrypt_128,\@function,4
1121 movdqu 0x00($inp), @XMM[0] # load input
1122 movdqu 0x10($inp), @XMM[1]
1123 movdqu 0x20($inp), @XMM[2]
1124 movdqu 0x30($inp), @XMM[3]
1125 movdqu 0x40($inp), @XMM[4]
1126 movdqu 0x50($inp), @XMM[5]
1127 movdqu 0x60($inp), @XMM[6]
1128 movdqu 0x70($inp), @XMM[7]
1129 mov $key, %rax # pass the $key
1130 lea 0x80($inp), $inp
1133 call _bsaes_decrypt8
1135 movdqu @XMM[0], 0x00($out) # write output
1136 movdqu @XMM[1], 0x10($out)
1137 movdqu @XMM[6], 0x20($out)
1138 movdqu @XMM[4], 0x30($out)
1139 movdqu @XMM[2], 0x40($out)
1140 movdqu @XMM[7], 0x50($out)
1141 movdqu @XMM[3], 0x60($out)
1142 movdqu @XMM[5], 0x70($out)
1143 lea 0x80($out), $out
1147 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1151 ######################################################################
1155 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1157 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1161 .globl bsaes_ecb_encrypt_blocks
1162 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1164 bsaes_ecb_encrypt_blocks:
1173 lea -0x48(%rsp),%rsp
1175 $code.=<<___ if ($win64);
1176 lea -0xa0(%rsp), %rsp
1177 movaps %xmm6, 0x40(%rsp)
1178 movaps %xmm7, 0x50(%rsp)
1179 movaps %xmm8, 0x60(%rsp)
1180 movaps %xmm9, 0x70(%rsp)
1181 movaps %xmm10, 0x80(%rsp)
1182 movaps %xmm11, 0x90(%rsp)
1183 movaps %xmm12, 0xa0(%rsp)
1184 movaps %xmm13, 0xb0(%rsp)
1185 movaps %xmm14, 0xc0(%rsp)
1186 movaps %xmm15, 0xd0(%rsp)
1190 mov %rsp,%rbp # backup %rsp
1191 mov 240($arg4),%eax # rounds
1192 mov $arg1,$inp # backup arguments
1199 mov %eax,%ebx # backup rounds
1200 shl \$7,%rax # 128 bytes per inner round key
1201 sub \$`128-32`,%rax # size of bit-sliced key schedule
1203 mov %rsp,%rax # pass key schedule
1204 mov $key,%rcx # pass key
1205 mov %ebx,%r10d # pass rounds
1206 call _bsaes_key_convert
1207 pxor %xmm6,%xmm7 # fix up last round key
1208 movdqa %xmm7,(%rax) # save last round key
1212 movdqu 0x00($inp), @XMM[0] # load input
1213 movdqu 0x10($inp), @XMM[1]
1214 movdqu 0x20($inp), @XMM[2]
1215 movdqu 0x30($inp), @XMM[3]
1216 movdqu 0x40($inp), @XMM[4]
1217 movdqu 0x50($inp), @XMM[5]
1218 mov %rsp, %rax # pass key schedule
1219 movdqu 0x60($inp), @XMM[6]
1220 mov %ebx,%r10d # pass rounds
1221 movdqu 0x70($inp), @XMM[7]
1222 lea 0x80($inp), $inp
1224 call _bsaes_encrypt8
1226 movdqu @XMM[0], 0x00($out) # write output
1227 movdqu @XMM[1], 0x10($out)
1228 movdqu @XMM[4], 0x20($out)
1229 movdqu @XMM[6], 0x30($out)
1230 movdqu @XMM[3], 0x40($out)
1231 movdqu @XMM[7], 0x50($out)
1232 movdqu @XMM[2], 0x60($out)
1233 movdqu @XMM[5], 0x70($out)
1234 lea 0x80($out), $out
1241 movdqu 0x00($inp), @XMM[0] # load input
1242 mov %rsp, %rax # pass key schedule
1243 mov %ebx,%r10d # pass rounds
1246 movdqu 0x10($inp), @XMM[1]
1248 movdqu 0x20($inp), @XMM[2]
1251 movdqu 0x30($inp), @XMM[3]
1253 movdqu 0x40($inp), @XMM[4]
1256 movdqu 0x50($inp), @XMM[5]
1258 movdqu 0x60($inp), @XMM[6]
1259 call _bsaes_encrypt8
1260 movdqu @XMM[0], 0x00($out) # write output
1261 movdqu @XMM[1], 0x10($out)
1262 movdqu @XMM[4], 0x20($out)
1263 movdqu @XMM[6], 0x30($out)
1264 movdqu @XMM[3], 0x40($out)
1265 movdqu @XMM[7], 0x50($out)
1266 movdqu @XMM[2], 0x60($out)
1270 call _bsaes_encrypt8
1271 movdqu @XMM[0], 0x00($out) # write output
1272 movdqu @XMM[1], 0x10($out)
1273 movdqu @XMM[4], 0x20($out)
1274 movdqu @XMM[6], 0x30($out)
1275 movdqu @XMM[3], 0x40($out)
1276 movdqu @XMM[7], 0x50($out)
1280 call _bsaes_encrypt8
1281 movdqu @XMM[0], 0x00($out) # write output
1282 movdqu @XMM[1], 0x10($out)
1283 movdqu @XMM[4], 0x20($out)
1284 movdqu @XMM[6], 0x30($out)
1285 movdqu @XMM[3], 0x40($out)
1289 call _bsaes_encrypt8
1290 movdqu @XMM[0], 0x00($out) # write output
1291 movdqu @XMM[1], 0x10($out)
1292 movdqu @XMM[4], 0x20($out)
1293 movdqu @XMM[6], 0x30($out)
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1304 call _bsaes_encrypt8
1305 movdqu @XMM[0], 0x00($out) # write output
1306 movdqu @XMM[1], 0x10($out)
1310 call _bsaes_encrypt8
1311 movdqu @XMM[0], 0x00($out) # write output
1318 call asm_AES_encrypt
1327 .Lecb_enc_bzero: # wipe key schedule [if any]
1328 movdqa %xmm0, 0x00(%rax)
1329 movdqa %xmm0, 0x10(%rax)
1330 lea 0x20(%rax), %rax
1334 lea (%rbp),%rsp # restore %rsp
1336 $code.=<<___ if ($win64);
1337 movaps 0x40(%rbp), %xmm6
1338 movaps 0x50(%rbp), %xmm7
1339 movaps 0x60(%rbp), %xmm8
1340 movaps 0x70(%rbp), %xmm9
1341 movaps 0x80(%rbp), %xmm10
1342 movaps 0x90(%rbp), %xmm11
1343 movaps 0xa0(%rbp), %xmm12
1344 movaps 0xb0(%rbp), %xmm13
1345 movaps 0xc0(%rbp), %xmm14
1346 movaps 0xd0(%rbp), %xmm15
1347 lea 0xa0(%rbp), %rsp
1350 mov 0x48(%rsp), %r15
1351 mov 0x50(%rsp), %r14
1352 mov 0x58(%rsp), %r13
1353 mov 0x60(%rsp), %r12
1354 mov 0x68(%rsp), %rbx
1355 mov 0x70(%rsp), %rax
1356 lea 0x78(%rsp), %rsp
1360 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1362 .globl bsaes_ecb_decrypt_blocks
1363 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1365 bsaes_ecb_decrypt_blocks:
1374 lea -0x48(%rsp),%rsp
1376 $code.=<<___ if ($win64);
1377 lea -0xa0(%rsp), %rsp
1378 movaps %xmm6, 0x40(%rsp)
1379 movaps %xmm7, 0x50(%rsp)
1380 movaps %xmm8, 0x60(%rsp)
1381 movaps %xmm9, 0x70(%rsp)
1382 movaps %xmm10, 0x80(%rsp)
1383 movaps %xmm11, 0x90(%rsp)
1384 movaps %xmm12, 0xa0(%rsp)
1385 movaps %xmm13, 0xb0(%rsp)
1386 movaps %xmm14, 0xc0(%rsp)
1387 movaps %xmm15, 0xd0(%rsp)
1391 mov %rsp,%rbp # backup %rsp
1392 mov 240($arg4),%eax # rounds
1393 mov $arg1,$inp # backup arguments
1400 mov %eax,%ebx # backup rounds
1401 shl \$7,%rax # 128 bytes per inner round key
1402 sub \$`128-32`,%rax # size of bit-sliced key schedule
1404 mov %rsp,%rax # pass key schedule
1405 mov $key,%rcx # pass key
1406 mov %ebx,%r10d # pass rounds
1407 call _bsaes_key_convert
1408 pxor (%rsp),%xmm7 # fix up 0 round key
1409 movdqa %xmm6,(%rax) # save last round key
1414 movdqu 0x00($inp), @XMM[0] # load input
1415 movdqu 0x10($inp), @XMM[1]
1416 movdqu 0x20($inp), @XMM[2]
1417 movdqu 0x30($inp), @XMM[3]
1418 movdqu 0x40($inp), @XMM[4]
1419 movdqu 0x50($inp), @XMM[5]
1420 mov %rsp, %rax # pass key schedule
1421 movdqu 0x60($inp), @XMM[6]
1422 mov %ebx,%r10d # pass rounds
1423 movdqu 0x70($inp), @XMM[7]
1424 lea 0x80($inp), $inp
1426 call _bsaes_decrypt8
1428 movdqu @XMM[0], 0x00($out) # write output
1429 movdqu @XMM[1], 0x10($out)
1430 movdqu @XMM[6], 0x20($out)
1431 movdqu @XMM[4], 0x30($out)
1432 movdqu @XMM[2], 0x40($out)
1433 movdqu @XMM[7], 0x50($out)
1434 movdqu @XMM[3], 0x60($out)
1435 movdqu @XMM[5], 0x70($out)
1436 lea 0x80($out), $out
1443 movdqu 0x00($inp), @XMM[0] # load input
1444 mov %rsp, %rax # pass key schedule
1445 mov %ebx,%r10d # pass rounds
1448 movdqu 0x10($inp), @XMM[1]
1450 movdqu 0x20($inp), @XMM[2]
1453 movdqu 0x30($inp), @XMM[3]
1455 movdqu 0x40($inp), @XMM[4]
1458 movdqu 0x50($inp), @XMM[5]
1460 movdqu 0x60($inp), @XMM[6]
1461 call _bsaes_decrypt8
1462 movdqu @XMM[0], 0x00($out) # write output
1463 movdqu @XMM[1], 0x10($out)
1464 movdqu @XMM[6], 0x20($out)
1465 movdqu @XMM[4], 0x30($out)
1466 movdqu @XMM[2], 0x40($out)
1467 movdqu @XMM[7], 0x50($out)
1468 movdqu @XMM[3], 0x60($out)
1472 call _bsaes_decrypt8
1473 movdqu @XMM[0], 0x00($out) # write output
1474 movdqu @XMM[1], 0x10($out)
1475 movdqu @XMM[6], 0x20($out)
1476 movdqu @XMM[4], 0x30($out)
1477 movdqu @XMM[2], 0x40($out)
1478 movdqu @XMM[7], 0x50($out)
1482 call _bsaes_decrypt8
1483 movdqu @XMM[0], 0x00($out) # write output
1484 movdqu @XMM[1], 0x10($out)
1485 movdqu @XMM[6], 0x20($out)
1486 movdqu @XMM[4], 0x30($out)
1487 movdqu @XMM[2], 0x40($out)
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1506 call _bsaes_decrypt8
1507 movdqu @XMM[0], 0x00($out) # write output
1508 movdqu @XMM[1], 0x10($out)
1512 call _bsaes_decrypt8
1513 movdqu @XMM[0], 0x00($out) # write output
1520 call asm_AES_decrypt
1529 .Lecb_dec_bzero: # wipe key schedule [if any]
1530 movdqa %xmm0, 0x00(%rax)
1531 movdqa %xmm0, 0x10(%rax)
1532 lea 0x20(%rax), %rax
1536 lea (%rbp),%rsp # restore %rsp
1538 $code.=<<___ if ($win64);
1539 movaps 0x40(%rbp), %xmm6
1540 movaps 0x50(%rbp), %xmm7
1541 movaps 0x60(%rbp), %xmm8
1542 movaps 0x70(%rbp), %xmm9
1543 movaps 0x80(%rbp), %xmm10
1544 movaps 0x90(%rbp), %xmm11
1545 movaps 0xa0(%rbp), %xmm12
1546 movaps 0xb0(%rbp), %xmm13
1547 movaps 0xc0(%rbp), %xmm14
1548 movaps 0xd0(%rbp), %xmm15
1549 lea 0xa0(%rbp), %rsp
1552 mov 0x48(%rsp), %r15
1553 mov 0x50(%rsp), %r14
1554 mov 0x58(%rsp), %r13
1555 mov 0x60(%rsp), %r12
1556 mov 0x68(%rsp), %rbx
1557 mov 0x70(%rsp), %rax
1558 lea 0x78(%rsp), %rsp
1562 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1566 .extern asm_AES_cbc_encrypt
1567 .globl bsaes_cbc_encrypt
1568 .type bsaes_cbc_encrypt,\@abi-omnipotent
1572 $code.=<<___ if ($win64);
1573 mov 48(%rsp),$arg6 # pull direction flag
1577 jne asm_AES_cbc_encrypt
1579 jb asm_AES_cbc_encrypt
1589 lea -0x48(%rsp), %rsp
1591 $code.=<<___ if ($win64);
1592 mov 0xa0(%rsp),$arg5 # pull ivp
1593 lea -0xa0(%rsp), %rsp
1594 movaps %xmm6, 0x40(%rsp)
1595 movaps %xmm7, 0x50(%rsp)
1596 movaps %xmm8, 0x60(%rsp)
1597 movaps %xmm9, 0x70(%rsp)
1598 movaps %xmm10, 0x80(%rsp)
1599 movaps %xmm11, 0x90(%rsp)
1600 movaps %xmm12, 0xa0(%rsp)
1601 movaps %xmm13, 0xb0(%rsp)
1602 movaps %xmm14, 0xc0(%rsp)
1603 movaps %xmm15, 0xd0(%rsp)
1607 mov %rsp, %rbp # backup %rsp
1608 mov 240($arg4), %eax # rounds
1609 mov $arg1, $inp # backup arguments
1614 shr \$4, $len # bytes to blocks
1616 mov %eax, %edx # rounds
1617 shl \$7, %rax # 128 bytes per inner round key
1618 sub \$`128-32`, %rax # size of bit-sliced key schedule
1621 mov %rsp, %rax # pass key schedule
1622 mov $key, %rcx # pass key
1623 mov %edx, %r10d # pass rounds
1624 call _bsaes_key_convert
1625 pxor (%rsp),%xmm7 # fix up 0 round key
1626 movdqa %xmm6,(%rax) # save last round key
1629 movdqu (%rbx), @XMM[15] # load IV
1632 movdqu 0x00($inp), @XMM[0] # load input
1633 movdqu 0x10($inp), @XMM[1]
1634 movdqu 0x20($inp), @XMM[2]
1635 movdqu 0x30($inp), @XMM[3]
1636 movdqu 0x40($inp), @XMM[4]
1637 movdqu 0x50($inp), @XMM[5]
1638 mov %rsp, %rax # pass key schedule
1639 movdqu 0x60($inp), @XMM[6]
1640 mov %edx,%r10d # pass rounds
1641 movdqu 0x70($inp), @XMM[7]
1642 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1644 call _bsaes_decrypt8
1646 pxor 0x20(%rbp), @XMM[0] # ^= IV
1647 movdqu 0x00($inp), @XMM[8] # re-load input
1648 movdqu 0x10($inp), @XMM[9]
1649 pxor @XMM[8], @XMM[1]
1650 movdqu 0x20($inp), @XMM[10]
1651 pxor @XMM[9], @XMM[6]
1652 movdqu 0x30($inp), @XMM[11]
1653 pxor @XMM[10], @XMM[4]
1654 movdqu 0x40($inp), @XMM[12]
1655 pxor @XMM[11], @XMM[2]
1656 movdqu 0x50($inp), @XMM[13]
1657 pxor @XMM[12], @XMM[7]
1658 movdqu 0x60($inp), @XMM[14]
1659 pxor @XMM[13], @XMM[3]
1660 movdqu 0x70($inp), @XMM[15] # IV
1661 pxor @XMM[14], @XMM[5]
1662 movdqu @XMM[0], 0x00($out) # write output
1663 lea 0x80($inp), $inp
1664 movdqu @XMM[1], 0x10($out)
1665 movdqu @XMM[6], 0x20($out)
1666 movdqu @XMM[4], 0x30($out)
1667 movdqu @XMM[2], 0x40($out)
1668 movdqu @XMM[7], 0x50($out)
1669 movdqu @XMM[3], 0x60($out)
1670 movdqu @XMM[5], 0x70($out)
1671 lea 0x80($out), $out
1678 movdqu 0x00($inp), @XMM[0] # load input
1679 mov %rsp, %rax # pass key schedule
1680 mov %edx, %r10d # pass rounds
1683 movdqu 0x10($inp), @XMM[1]
1685 movdqu 0x20($inp), @XMM[2]
1688 movdqu 0x30($inp), @XMM[3]
1690 movdqu 0x40($inp), @XMM[4]
1693 movdqu 0x50($inp), @XMM[5]
1695 movdqu 0x60($inp), @XMM[6]
1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1697 call _bsaes_decrypt8
1698 pxor 0x20(%rbp), @XMM[0] # ^= IV
1699 movdqu 0x00($inp), @XMM[8] # re-load input
1700 movdqu 0x10($inp), @XMM[9]
1701 pxor @XMM[8], @XMM[1]
1702 movdqu 0x20($inp), @XMM[10]
1703 pxor @XMM[9], @XMM[6]
1704 movdqu 0x30($inp), @XMM[11]
1705 pxor @XMM[10], @XMM[4]
1706 movdqu 0x40($inp), @XMM[12]
1707 pxor @XMM[11], @XMM[2]
1708 movdqu 0x50($inp), @XMM[13]
1709 pxor @XMM[12], @XMM[7]
1710 movdqu 0x60($inp), @XMM[15] # IV
1711 pxor @XMM[13], @XMM[3]
1712 movdqu @XMM[0], 0x00($out) # write output
1713 movdqu @XMM[1], 0x10($out)
1714 movdqu @XMM[6], 0x20($out)
1715 movdqu @XMM[4], 0x30($out)
1716 movdqu @XMM[2], 0x40($out)
1717 movdqu @XMM[7], 0x50($out)
1718 movdqu @XMM[3], 0x60($out)
# ---------------------------------------------------------------------------
# Fragment of bsaes_cbc_encrypt's CBC-*decrypt* tail (numbered listing; the
# jumps in the left-hand numbers mean original lines -- including the
# .Lcbc_dec_* labels that head each chunk -- are elided from this view).
# Each chunk decrypts the N remaining blocks via _bsaes_decrypt8 and then
# CBC-chains them: out[0] ^= IV (parked at 0x20(%rbp)), out[i] ^= ct[i-1];
# the last ciphertext block is kept in @XMM[15] as the next IV.  The store
# order [0,1,6,4,2,7,...] follows the output-register permutation produced
# by the bit-sliced decryptor.
# --- six remaining blocks ---
1722 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1723 call _bsaes_decrypt8
1724 pxor 0x20(%rbp), @XMM[0] # ^= IV
1725 movdqu 0x00($inp), @XMM[8] # re-load input
1726 movdqu 0x10($inp), @XMM[9]
1727 pxor @XMM[8], @XMM[1]
1728 movdqu 0x20($inp), @XMM[10]
1729 pxor @XMM[9], @XMM[6]
1730 movdqu 0x30($inp), @XMM[11]
1731 pxor @XMM[10], @XMM[4]
1732 movdqu 0x40($inp), @XMM[12]
1733 pxor @XMM[11], @XMM[2]
1734 movdqu 0x50($inp), @XMM[15] # last ciphertext block = next IV
1735 pxor @XMM[12], @XMM[7]
1736 movdqu @XMM[0], 0x00($out) # write output
1737 movdqu @XMM[1], 0x10($out)
1738 movdqu @XMM[6], 0x20($out)
1739 movdqu @XMM[4], 0x30($out)
1740 movdqu @XMM[2], 0x40($out)
1741 movdqu @XMM[7], 0x50($out)
# --- five remaining blocks ---
1745 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1746 call _bsaes_decrypt8
1747 pxor 0x20(%rbp), @XMM[0] # ^= IV
1748 movdqu 0x00($inp), @XMM[8] # re-load input
1749 movdqu 0x10($inp), @XMM[9]
1750 pxor @XMM[8], @XMM[1]
1751 movdqu 0x20($inp), @XMM[10]
1752 pxor @XMM[9], @XMM[6]
1753 movdqu 0x30($inp), @XMM[11]
1754 pxor @XMM[10], @XMM[4]
1755 movdqu 0x40($inp), @XMM[15] # last ciphertext block = next IV
1756 pxor @XMM[11], @XMM[2]
1757 movdqu @XMM[0], 0x00($out) # write output
1758 movdqu @XMM[1], 0x10($out)
1759 movdqu @XMM[6], 0x20($out)
1760 movdqu @XMM[4], 0x30($out)
1761 movdqu @XMM[2], 0x40($out)
# --- four remaining blocks ---
1765 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1766 call _bsaes_decrypt8
1767 pxor 0x20(%rbp), @XMM[0] # ^= IV
1768 movdqu 0x00($inp), @XMM[8] # re-load input
1769 movdqu 0x10($inp), @XMM[9]
1770 pxor @XMM[8], @XMM[1]
1771 movdqu 0x20($inp), @XMM[10]
1772 pxor @XMM[9], @XMM[6]
1773 movdqu 0x30($inp), @XMM[15] # last ciphertext block = next IV
1774 pxor @XMM[10], @XMM[4]
1775 movdqu @XMM[0], 0x00($out) # write output
1776 movdqu @XMM[1], 0x10($out)
1777 movdqu @XMM[6], 0x20($out)
1778 movdqu @XMM[4], 0x30($out)
# --- three remaining blocks ---
1782 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1783 call _bsaes_decrypt8
1784 pxor 0x20(%rbp), @XMM[0] # ^= IV
1785 movdqu 0x00($inp), @XMM[8] # re-load input
1786 movdqu 0x10($inp), @XMM[9]
1787 pxor @XMM[8], @XMM[1]
1788 movdqu 0x20($inp), @XMM[15] # last ciphertext block = next IV
1789 pxor @XMM[9], @XMM[6]
1790 movdqu @XMM[0], 0x00($out) # write output
1791 movdqu @XMM[1], 0x10($out)
1792 movdqu @XMM[6], 0x20($out)
# --- two remaining blocks ---
1796 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1797 call _bsaes_decrypt8
1798 pxor 0x20(%rbp), @XMM[0] # ^= IV
1799 movdqu 0x00($inp), @XMM[8] # re-load input
1800 movdqu 0x10($inp), @XMM[15] # last ciphertext block = next IV
1801 pxor @XMM[8], @XMM[1]
1802 movdqu @XMM[0], 0x00($out) # write output
1803 movdqu @XMM[1], 0x10($out)
# --- single block: fall back to table-based asm_AES_decrypt ---
1808 lea 0x20(%rbp), $arg2 # buffer output
1810 call asm_AES_decrypt # doesn't touch %xmm
1811 pxor 0x20(%rbp), @XMM[15] # ^= IV
1812 movdqu @XMM[15], ($out) # write output
1813 movdqa @XMM[0], @XMM[15] # ciphertext block becomes next IV
# (elided lines in between; %rbx presumably still holds ivp -- see the
#  original prologue, not visible here)
1816 movdqu @XMM[15], (%rbx) # return IV
# Wipe the stack-resident bit-sliced key schedule, 32 bytes per iteration
# (%xmm0 is assumed zero here; the loop-condition/branch lines are elided).
1819 .Lcbc_dec_bzero: # wipe key schedule [if any]
1820 movdqa %xmm0, 0x00(%rax)
1821 movdqa %xmm0, 0x10(%rax)
1822 lea 0x20(%rax), %rax
1826 lea (%rbp),%rsp # restore %rsp
# Win64 epilogue: restore the ten non-volatile %xmm registers saved in the
# prologue, then undo the extra 0xa0-byte frame.
1828 $code.=<<___ if ($win64);
1829 movaps 0x40(%rbp), %xmm6
1830 movaps 0x50(%rbp), %xmm7
1831 movaps 0x60(%rbp), %xmm8
1832 movaps 0x70(%rbp), %xmm9
1833 movaps 0x80(%rbp), %xmm10
1834 movaps 0x90(%rbp), %xmm11
1835 movaps 0xa0(%rbp), %xmm12
1836 movaps 0xb0(%rbp), %xmm13
1837 movaps 0xc0(%rbp), %xmm14
1838 movaps 0xd0(%rbp), %xmm15
1839 lea 0xa0(%rbp), %rsp
# Common epilogue: pop the callee-saved GPRs pushed by the prologue
# (0x48..0x70 off %rsp) and release the 0x78-byte frame.
1842 mov 0x48(%rsp), %r15
1843 mov 0x50(%rsp), %r14
1844 mov 0x58(%rsp), %r13
1845 mov 0x60(%rsp), %r12
1846 mov 0x68(%rsp), %rbx
1847 mov 0x70(%rsp), %rax
1848 lea 0x78(%rsp), %rsp
1852 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# ---------------------------------------------------------------------------
# bsaes_ctr32_encrypt_blocks(inp, out, blocks, key, ivp) -- CTR mode with a
# 32-bit big-endian counter in the last dword of the counter block.
# (Numbered listing: intervening original lines are elided where the
# left-hand numbers jump.)
1854 .globl bsaes_ctr32_encrypt_blocks
1855 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1857 bsaes_ctr32_encrypt_blocks:
1866 lea -0x48(%rsp), %rsp
# Win64: the 5th argument comes off the stack; save non-volatile %xmm regs.
1868 $code.=<<___ if ($win64);
1869 mov 0xa0(%rsp),$arg5 # pull ivp
1870 lea -0xa0(%rsp), %rsp
1871 movaps %xmm6, 0x40(%rsp)
1872 movaps %xmm7, 0x50(%rsp)
1873 movaps %xmm8, 0x60(%rsp)
1874 movaps %xmm9, 0x70(%rsp)
1875 movaps %xmm10, 0x80(%rsp)
1876 movaps %xmm11, 0x90(%rsp)
1877 movaps %xmm12, 0xa0(%rsp)
1878 movaps %xmm13, 0xb0(%rsp)
1879 movaps %xmm14, 0xc0(%rsp)
1880 movaps %xmm15, 0xd0(%rsp)
1884 mov %rsp, %rbp # backup %rsp
1885 movdqu ($arg5), %xmm0 # load counter
1886 mov 240($arg4), %eax # rounds
1887 mov $arg1, $inp # backup arguments
1891 movdqa %xmm0, 0x20(%rbp) # copy counter
# Allocate the stack-resident bit-sliced key schedule:
# rounds*128 - (128-32) bytes (the `...` backticks are evaluated by the
# final s///ee pass at the end of the script).
1895 mov %eax, %ebx # rounds
1896 shl \$7, %rax # 128 bytes per inner round key
1897 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert the conventional AES key schedule to bit-sliced form; per
# _bsaes_key_convert's contract here, the last round key needs a fix-up
# with %xmm6 before being stored.
1900 mov %rsp, %rax # pass key schedule
1901 mov $key, %rcx # pass key
1902 mov %ebx, %r10d # pass rounds
1903 call _bsaes_key_convert
1904 pxor %xmm6,%xmm7 # fix up last round key
1905 movdqa %xmm7,(%rax) # save last round key
# Pre-swap byte order in the upper qword of the round-0 key and of the
# counter, so counter increments can be done with plain paddd below.
# .LSWPUP sits 0x20 bytes before .LADD1 in the constants table.
1907 movdqa (%rsp), @XMM[9] # load round0 key
1908 lea .LADD1(%rip), %r11
1909 movdqa 0x20(%rbp), @XMM[0] # counter copy
1910 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1911 pshufb @XMM[8], @XMM[9] # byte swap upper part
1912 pshufb @XMM[8], @XMM[0]
1913 movdqa @XMM[9], (%rsp) # save adjusted round0 key
# Fan the counter out into 8 consecutive counter blocks (+1 .. +7) using
# the .LADD1..7 increment constants at 0x00..0x60(%r11).
1917 movdqa @XMM[0], 0x20(%rbp) # save counter
1918 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1919 movdqa @XMM[0], @XMM[2]
1920 paddd 0x00(%r11), @XMM[1] # .LADD1
1921 movdqa @XMM[0], @XMM[3]
1922 paddd 0x10(%r11), @XMM[2] # .LADD2
1923 movdqa @XMM[0], @XMM[4]
1924 paddd 0x20(%r11), @XMM[3] # .LADD3
1925 movdqa @XMM[0], @XMM[5]
1926 paddd 0x30(%r11), @XMM[4] # .LADD4
1927 movdqa @XMM[0], @XMM[6]
1928 paddd 0x40(%r11), @XMM[5] # .LADD5
1929 movdqa @XMM[0], @XMM[7]
1930 paddd 0x50(%r11), @XMM[6] # .LADD6
1931 paddd 0x60(%r11), @XMM[7] # .LADD7
1933 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1934 # to flip byte order in 32-bit counter
# XOR all 8 counter blocks with the (byte-swapped) round-0 key and apply
# the combined swap-back + ShiftRows permutation .LSWPUPM0SR, then enter
# the encryptor past its normal prologue.
1935 movdqa (%rsp), @XMM[9] # round 0 key
1936 lea 0x10(%rsp), %rax # pass key schedule
1937 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1938 pxor @XMM[9], @XMM[0] # xor with round0 key
1939 pxor @XMM[9], @XMM[1]
1940 pshufb @XMM[8], @XMM[0]
1941 pxor @XMM[9], @XMM[2]
1942 pshufb @XMM[8], @XMM[1]
1943 pxor @XMM[9], @XMM[3]
1944 pshufb @XMM[8], @XMM[2]
1945 pxor @XMM[9], @XMM[4]
1946 pshufb @XMM[8], @XMM[3]
1947 pxor @XMM[9], @XMM[5]
1948 pshufb @XMM[8], @XMM[4]
1949 pxor @XMM[9], @XMM[6]
1950 pshufb @XMM[8], @XMM[5]
1951 pxor @XMM[9], @XMM[7]
1952 pshufb @XMM[8], @XMM[6]
1953 lea .LBS0(%rip), %r11 # constants table
1954 pshufb @XMM[8], @XMM[7]
1955 mov %ebx,%r10d # pass rounds
1957 call _bsaes_encrypt8_bitslice
# Main loop: fewer than 8 blocks left? handle them one-by-one below.
1960 jc .Lctr_enc_loop_done
# XOR the 8 keystream blocks (in the encryptor's output-register order
# [0,1,4,6,3,7,2,5]) with the input and advance the counter by 8 (.LADD8).
1962 movdqu 0x00($inp), @XMM[8] # load input
1963 movdqu 0x10($inp), @XMM[9]
1964 movdqu 0x20($inp), @XMM[10]
1965 movdqu 0x30($inp), @XMM[11]
1966 movdqu 0x40($inp), @XMM[12]
1967 movdqu 0x50($inp), @XMM[13]
1968 movdqu 0x60($inp), @XMM[14]
1969 movdqu 0x70($inp), @XMM[15]
1971 pxor @XMM[0], @XMM[8]
1972 movdqa 0x20(%rbp), @XMM[0] # load counter
1973 pxor @XMM[9], @XMM[1]
1974 movdqu @XMM[8], 0x00($out) # write output
1975 pxor @XMM[10], @XMM[4]
1976 movdqu @XMM[1], 0x10($out)
1977 pxor @XMM[11], @XMM[6]
1978 movdqu @XMM[4], 0x20($out)
1979 pxor @XMM[12], @XMM[3]
1980 movdqu @XMM[6], 0x30($out)
1981 pxor @XMM[13], @XMM[7]
1982 movdqu @XMM[3], 0x40($out)
1983 pxor @XMM[14], @XMM[2]
1984 movdqu @XMM[7], 0x50($out)
1985 pxor @XMM[15], @XMM[5]
1986 movdqu @XMM[2], 0x60($out)
1987 lea .LADD1(%rip), %r11
1988 movdqu @XMM[5], 0x70($out)
1989 lea 0x80($out), $out
1990 paddd 0x70(%r11), @XMM[0] # .LADD8
# Tail: store only as many of the 8 computed blocks as remain; the
# compare/branch lines between chunks are elided from this listing.
1995 .Lctr_enc_loop_done:
1997 movdqu 0x00($inp), @XMM[8] # load input
1998 pxor @XMM[8], @XMM[0]
1999 movdqu @XMM[0], 0x00($out) # write output
2002 movdqu 0x10($inp), @XMM[9]
2003 pxor @XMM[9], @XMM[1]
2004 movdqu @XMM[1], 0x10($out)
2006 movdqu 0x20($inp), @XMM[10]
2007 pxor @XMM[10], @XMM[4]
2008 movdqu @XMM[4], 0x20($out)
2011 movdqu 0x30($inp), @XMM[11]
2012 pxor @XMM[11], @XMM[6]
2013 movdqu @XMM[6], 0x30($out)
2015 movdqu 0x40($inp), @XMM[12]
2016 pxor @XMM[12], @XMM[3]
2017 movdqu @XMM[3], 0x40($out)
2020 movdqu 0x50($inp), @XMM[13]
2021 pxor @XMM[13], @XMM[7]
2022 movdqu @XMM[7], 0x50($out)
2024 movdqu 0x60($inp), @XMM[14]
2025 pxor @XMM[14], @XMM[2]
2026 movdqu @XMM[2], 0x60($out)
# .Lctr_enc_short: per-block CTR path (taken when fewer than 8 blocks are
# requested).  Encrypts the counter block at 0x20(%rbp) into 0x30(%rbp)
# with the table-based asm_AES_encrypt, XORs it onto one input block, and
# increments the 32-bit counter kept at 0x2c(%rbp) for the next iteration.
# (Listing fragment: the byte-swap and loop-branch lines are elided.)
2031 lea 0x20(%rbp), $arg1
2032 lea 0x30(%rbp), $arg2
2034 call asm_AES_encrypt
2035 movdqu ($inp), @XMM[1]
2037 mov 0x2c(%rbp), %eax # load 32-bit counter
2039 pxor 0x30(%rbp), @XMM[1]
2040 inc %eax # increment
2041 movdqu @XMM[1], ($out)
# BUGFIX: was "0x2c(%rsp)".  The counter is loaded from 0x2c(%rbp) above,
# and %rsp != %rbp here (the bit-sliced key schedule was allocated below
# %rbp), so storing to 0x2c(%rsp) left the counter at 0x2c(%rbp) stale --
# every iteration of this loop would reuse the same counter block
# (keystream reuse) while scribbling into the key-schedule area.
2044 mov %eax, 0x2c(%rbp) # save 32-bit counter
# Wipe the stack-resident bit-sliced key schedule, 32 bytes per iteration
# (%xmm0 is assumed zero; the loop-condition/branch lines are elided).
2051 .Lctr_enc_bzero: # wipe key schedule [if any]
2052 movdqa %xmm0, 0x00(%rax)
2053 movdqa %xmm0, 0x10(%rax)
2054 lea 0x20(%rax), %rax
2058 lea (%rbp),%rsp # restore %rsp
# Win64 epilogue: restore the ten non-volatile %xmm registers saved in the
# prologue, then undo the extra 0xa0-byte frame.
2060 $code.=<<___ if ($win64);
2061 movaps 0x40(%rbp), %xmm6
2062 movaps 0x50(%rbp), %xmm7
2063 movaps 0x60(%rbp), %xmm8
2064 movaps 0x70(%rbp), %xmm9
2065 movaps 0x80(%rbp), %xmm10
2066 movaps 0x90(%rbp), %xmm11
2067 movaps 0xa0(%rbp), %xmm12
2068 movaps 0xb0(%rbp), %xmm13
2069 movaps 0xc0(%rbp), %xmm14
2070 movaps 0xd0(%rbp), %xmm15
2071 lea 0xa0(%rbp), %rsp
# Common epilogue: pop callee-saved GPRs and release the frame.
2074 mov 0x48(%rsp), %r15
2075 mov 0x50(%rsp), %r14
2076 mov 0x58(%rsp), %r13
2077 mov 0x60(%rsp), %r12
2078 mov 0x68(%rsp), %rbx
2079 mov 0x70(%rsp), %rax
2080 lea 0x78(%rsp), %rsp
2084 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2086 ######################################################################
2087 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2088 # const AES_KEY *key1, const AES_KEY *key2,
2089 # const unsigned char iv[16]);
2091 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2093 .globl bsaes_xts_encrypt
2094 .type bsaes_xts_encrypt,\@abi-omnipotent
2105 lea -0x48(%rsp), %rsp
# Win64: 5th/6th arguments come off the stack; save non-volatile %xmm regs.
2107 $code.=<<___ if ($win64);
2108 mov 0xa0(%rsp),$arg5 # pull key2
2109 mov 0xa8(%rsp),$arg6 # pull ivp
2110 lea -0xa0(%rsp), %rsp
2111 movaps %xmm6, 0x40(%rsp)
2112 movaps %xmm7, 0x50(%rsp)
2113 movaps %xmm8, 0x60(%rsp)
2114 movaps %xmm9, 0x70(%rsp)
2115 movaps %xmm10, 0x80(%rsp)
2116 movaps %xmm11, 0x90(%rsp)
2117 movaps %xmm12, 0xa0(%rsp)
2118 movaps %xmm13, 0xb0(%rsp)
2119 movaps %xmm14, 0xc0(%rsp)
2120 movaps %xmm15, 0xd0(%rsp)
2124 mov %rsp, %rbp # backup %rsp
2125 mov $arg1, $inp # backup arguments
# The initial tweak is the IV encrypted under key2, computed into
# 0x20(%rbp) by the table-based AES (argument set-up lines are elided).
2131 lea 0x20(%rbp), $arg2
2133 call asm_AES_encrypt # generate initial tweak
2135 mov 240($key), %eax # rounds
2136 mov $len, %rbx # backup $len
2138 mov %eax, %edx # rounds
2139 shl \$7, %rax # 128 bytes per inner round key
2140 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert key1 to bit-sliced form (same fix-up contract as in CTR above).
2143 mov %rsp, %rax # pass key schedule
2144 mov $key, %rcx # pass key
2145 mov %edx, %r10d # pass rounds
2146 call _bsaes_key_convert
2147 pxor %xmm6, %xmm7 # fix up last round key
2148 movdqa %xmm7, (%rax) # save last round key
2151 sub \$0x80, %rsp # place for tweak[8]
2152 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# Tweak doubling in GF(2^128): pcmpgtd/pshufd 0x13 broadcasts the sign of
# the top qword, pand .Lxts_magic isolates the reduction term, paddq
# shifts left by one; pxor folds the carry back in.
2155 movdqa .Lxts_magic(%rip), $twmask
2156 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Perl-generated 8-block tweak schedule: tweak[i] saved to i*16(%rsp),
# input block i loaded and XORed with its tweak as we go.  (The loop's
# closing brace and heredoc markers are elided from this listing.)
2165 for ($i=0;$i<7;$i++) {
2167 pshufd \$0x13, $twtmp, $twres
2169 movdqa @XMM[7], @XMM[$i]
2170 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2171 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2172 pand $twmask, $twres # isolate carry and residue
2173 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2174 pxor $twres, @XMM[7]
2176 $code.=<<___ if ($i>=1);
2177 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2179 $code.=<<___ if ($i>=2);
2180 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Finish loading/XORing blocks 6 and 7, then encrypt all 8.
2184 movdqu 0x60($inp), @XMM[8+6]
2185 pxor @XMM[8+5], @XMM[5]
2186 movdqu 0x70($inp), @XMM[8+7]
2187 lea 0x80($inp), $inp
2188 movdqa @XMM[7], 0x70(%rsp)
2189 pxor @XMM[8+6], @XMM[6]
2190 lea 0x80(%rsp), %rax # pass key schedule
2191 pxor @XMM[8+7], @XMM[7]
2192 mov %edx, %r10d # pass rounds
2194 call _bsaes_encrypt8
# XOR outputs (encryptor register order [0,1,4,6,3,7,2,5]) with the saved
# tweaks and write 8 blocks of ciphertext.
2196 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2197 pxor 0x10(%rsp), @XMM[1]
2198 movdqu @XMM[0], 0x00($out) # write output
2199 pxor 0x20(%rsp), @XMM[4]
2200 movdqu @XMM[1], 0x10($out)
2201 pxor 0x30(%rsp), @XMM[6]
2202 movdqu @XMM[4], 0x20($out)
2203 pxor 0x40(%rsp), @XMM[3]
2204 movdqu @XMM[6], 0x30($out)
2205 pxor 0x50(%rsp), @XMM[7]
2206 movdqu @XMM[3], 0x40($out)
2207 pxor 0x60(%rsp), @XMM[2]
2208 movdqu @XMM[7], 0x50($out)
2209 pxor 0x70(%rsp), @XMM[5]
2210 movdqu @XMM[2], 0x60($out)
2211 movdqu @XMM[5], 0x70($out)
2212 lea 0x80($out), $out
# Double tweak[7] once more to get the next iteration's starting tweak.
2214 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2216 movdqa .Lxts_magic(%rip), $twmask
2217 pcmpgtd @XMM[7], $twtmp
2218 pshufd \$0x13, $twtmp, $twres
2220 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2221 pand $twmask, $twres # isolate carry and residue
2222 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2223 pxor $twres, @XMM[7]
# Short path (< 8 blocks): same Perl-generated tweak schedule as above,
# but with a per-block length compare (branch targets elided from this
# listing) so we stop loading once $len is exhausted.
2232 for ($i=0;$i<7;$i++) {
2234 pshufd \$0x13, $twtmp, $twres
2236 movdqa @XMM[7], @XMM[$i]
2237 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2238 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2239 pand $twmask, $twres # isolate carry and residue
2240 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2241 pxor $twres, @XMM[7]
2243 $code.=<<___ if ($i>=1);
2244 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2245 cmp \$`0x10*$i`,$len
2248 $code.=<<___ if ($i>=2);
2249 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# --- seven remaining blocks ---
2253 movdqu 0x60($inp), @XMM[8+6]
2254 pxor @XMM[8+5], @XMM[5]
2255 movdqa @XMM[7], 0x70(%rsp)
2256 lea 0x70($inp), $inp
2257 pxor @XMM[8+6], @XMM[6]
2258 lea 0x80(%rsp), %rax # pass key schedule
2259 mov %edx, %r10d # pass rounds
2261 call _bsaes_encrypt8
2263 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2264 pxor 0x10(%rsp), @XMM[1]
2265 movdqu @XMM[0], 0x00($out) # write output
2266 pxor 0x20(%rsp), @XMM[4]
2267 movdqu @XMM[1], 0x10($out)
2268 pxor 0x30(%rsp), @XMM[6]
2269 movdqu @XMM[4], 0x20($out)
2270 pxor 0x40(%rsp), @XMM[3]
2271 movdqu @XMM[6], 0x30($out)
2272 pxor 0x50(%rsp), @XMM[7]
2273 movdqu @XMM[3], 0x40($out)
2274 pxor 0x60(%rsp), @XMM[2]
2275 movdqu @XMM[7], 0x50($out)
2276 movdqu @XMM[2], 0x60($out)
2277 lea 0x70($out), $out
2279 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# --- six remaining blocks ---
2283 pxor @XMM[8+4], @XMM[4]
2284 lea 0x60($inp), $inp
2285 pxor @XMM[8+5], @XMM[5]
2286 lea 0x80(%rsp), %rax # pass key schedule
2287 mov %edx, %r10d # pass rounds
2289 call _bsaes_encrypt8
2291 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2292 pxor 0x10(%rsp), @XMM[1]
2293 movdqu @XMM[0], 0x00($out) # write output
2294 pxor 0x20(%rsp), @XMM[4]
2295 movdqu @XMM[1], 0x10($out)
2296 pxor 0x30(%rsp), @XMM[6]
2297 movdqu @XMM[4], 0x20($out)
2298 pxor 0x40(%rsp), @XMM[3]
2299 movdqu @XMM[6], 0x30($out)
2300 pxor 0x50(%rsp), @XMM[7]
2301 movdqu @XMM[3], 0x40($out)
2302 movdqu @XMM[7], 0x50($out)
2303 lea 0x60($out), $out
2305 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# --- five remaining blocks ---
2309 pxor @XMM[8+3], @XMM[3]
2310 lea 0x50($inp), $inp
2311 pxor @XMM[8+4], @XMM[4]
2312 lea 0x80(%rsp), %rax # pass key schedule
2313 mov %edx, %r10d # pass rounds
2315 call _bsaes_encrypt8
2317 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2318 pxor 0x10(%rsp), @XMM[1]
2319 movdqu @XMM[0], 0x00($out) # write output
2320 pxor 0x20(%rsp), @XMM[4]
2321 movdqu @XMM[1], 0x10($out)
2322 pxor 0x30(%rsp), @XMM[6]
2323 movdqu @XMM[4], 0x20($out)
2324 pxor 0x40(%rsp), @XMM[3]
2325 movdqu @XMM[6], 0x30($out)
2326 movdqu @XMM[3], 0x40($out)
2327 lea 0x50($out), $out
2329 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# --- four remaining blocks ---
2333 pxor @XMM[8+2], @XMM[2]
2334 lea 0x40($inp), $inp
2335 pxor @XMM[8+3], @XMM[3]
2336 lea 0x80(%rsp), %rax # pass key schedule
2337 mov %edx, %r10d # pass rounds
2339 call _bsaes_encrypt8
2341 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2342 pxor 0x10(%rsp), @XMM[1]
2343 movdqu @XMM[0], 0x00($out) # write output
2344 pxor 0x20(%rsp), @XMM[4]
2345 movdqu @XMM[1], 0x10($out)
2346 pxor 0x30(%rsp), @XMM[6]
2347 movdqu @XMM[4], 0x20($out)
2348 movdqu @XMM[6], 0x30($out)
2349 lea 0x40($out), $out
2351 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# --- three remaining blocks ---
2355 pxor @XMM[8+1], @XMM[1]
2356 lea 0x30($inp), $inp
2357 pxor @XMM[8+2], @XMM[2]
2358 lea 0x80(%rsp), %rax # pass key schedule
2359 mov %edx, %r10d # pass rounds
2361 call _bsaes_encrypt8
2363 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2364 pxor 0x10(%rsp), @XMM[1]
2365 movdqu @XMM[0], 0x00($out) # write output
2366 pxor 0x20(%rsp), @XMM[4]
2367 movdqu @XMM[1], 0x10($out)
2368 movdqu @XMM[4], 0x20($out)
2369 lea 0x30($out), $out
2371 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# --- two remaining blocks ---
2375 pxor @XMM[8+0], @XMM[0]
2376 lea 0x20($inp), $inp
2377 pxor @XMM[8+1], @XMM[1]
2378 lea 0x80(%rsp), %rax # pass key schedule
2379 mov %edx, %r10d # pass rounds
2381 call _bsaes_encrypt8
2383 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2384 pxor 0x10(%rsp), @XMM[1]
2385 movdqu @XMM[0], 0x00($out) # write output
2386 movdqu @XMM[1], 0x10($out)
2387 lea 0x20($out), $out
2389 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# --- single block: XTS the slow way via table-based AES ---
2393 pxor @XMM[0], @XMM[8]
2394 lea 0x10($inp), $inp
2395 movdqa @XMM[8], 0x20(%rbp)
2396 lea 0x20(%rbp), $arg1
2397 lea 0x20(%rbp), $arg2
2399 call asm_AES_encrypt # doesn't touch %xmm
2400 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2401 #pxor @XMM[8], @XMM[0]
2402 #lea 0x80(%rsp), %rax # pass key schedule
2403 #mov %edx, %r10d # pass rounds
2404 #call _bsaes_encrypt8
2405 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2406 movdqu @XMM[0], 0x00($out) # write output
2407 lea 0x10($out), $out
2409 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing: swap the tail bytes with the last full ciphertext
# block byte-by-byte (the .Lxts_enc_steal loop head/branch are elided),
# then re-encrypt the stolen block in place at 0x20(%rbp).
2418 movzb -16(%rdx), %ecx
2426 movdqu -16($out), @XMM[0]
2427 lea 0x20(%rbp), $arg1
2428 pxor @XMM[7], @XMM[0]
2429 lea 0x20(%rbp), $arg2
2430 movdqa @XMM[0], 0x20(%rbp)
2432 call asm_AES_encrypt # doesn't touch %xmm
2433 pxor 0x20(%rbp), @XMM[7]
2434 movdqu @XMM[7], -16($out)
# Wipe the stack-resident key schedule and tweak area.
2439 .Lxts_enc_bzero: # wipe key schedule [if any]
2440 movdqa %xmm0, 0x00(%rax)
2441 movdqa %xmm0, 0x10(%rax)
2442 lea 0x20(%rax), %rax
2446 lea (%rbp),%rsp # restore %rsp
# Win64 epilogue: restore saved %xmm registers and the extra frame.
2448 $code.=<<___ if ($win64);
2449 movaps 0x40(%rbp), %xmm6
2450 movaps 0x50(%rbp), %xmm7
2451 movaps 0x60(%rbp), %xmm8
2452 movaps 0x70(%rbp), %xmm9
2453 movaps 0x80(%rbp), %xmm10
2454 movaps 0x90(%rbp), %xmm11
2455 movaps 0xa0(%rbp), %xmm12
2456 movaps 0xb0(%rbp), %xmm13
2457 movaps 0xc0(%rbp), %xmm14
2458 movaps 0xd0(%rbp), %xmm15
2459 lea 0xa0(%rbp), %rsp
# Common epilogue: pop callee-saved GPRs and release the frame.
2462 mov 0x48(%rsp), %r15
2463 mov 0x50(%rsp), %r14
2464 mov 0x58(%rsp), %r13
2465 mov 0x60(%rsp), %r12
2466 mov 0x68(%rsp), %rbx
2467 mov 0x70(%rsp), %rax
2468 lea 0x78(%rsp), %rsp
2472 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2474 .globl bsaes_xts_decrypt
2475 .type bsaes_xts_decrypt,\@abi-omnipotent
2486 lea -0x48(%rsp), %rsp
# Win64: 5th/6th arguments come off the stack; save non-volatile %xmm regs.
2488 $code.=<<___ if ($win64);
2489 mov 0xa0(%rsp),$arg5 # pull key2
2490 mov 0xa8(%rsp),$arg6 # pull ivp
2491 lea -0xa0(%rsp), %rsp
2492 movaps %xmm6, 0x40(%rsp)
2493 movaps %xmm7, 0x50(%rsp)
2494 movaps %xmm8, 0x60(%rsp)
2495 movaps %xmm9, 0x70(%rsp)
2496 movaps %xmm10, 0x80(%rsp)
2497 movaps %xmm11, 0x90(%rsp)
2498 movaps %xmm12, 0xa0(%rsp)
2499 movaps %xmm13, 0xb0(%rsp)
2500 movaps %xmm14, 0xc0(%rsp)
2501 movaps %xmm15, 0xd0(%rsp)
2505 mov %rsp, %rbp # backup %rsp
2506 mov $arg1, $inp # backup arguments
# Initial tweak = AES-encrypt(IV) under key2 -- encryption even for the
# decrypt direction, per XTS.
2512 lea 0x20(%rbp), $arg2
2514 call asm_AES_encrypt # generate initial tweak
2516 mov 240($key), %eax # rounds
2517 mov $len, %rbx # backup $len
2519 mov %eax, %edx # rounds
2520 shl \$7, %rax # 128 bytes per inner round key
2521 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert key1; for the *decrypt* schedule the round-0 and last round
# keys trade places: the fixed-up key goes to (%rsp) (round 0) and %xmm6
# becomes the last round key.
2524 mov %rsp, %rax # pass key schedule
2525 mov $key, %rcx # pass key
2526 mov %edx, %r10d # pass rounds
2527 call _bsaes_key_convert
2528 pxor (%rsp), %xmm7 # fix up round 0 key
2529 movdqa %xmm6, (%rax) # save last round key
2530 movdqa %xmm7, (%rsp)
2532 xor %eax, %eax # if ($len%16) len-=16;
2539 sub \$0x80, %rsp # place for tweak[8]
2540 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# GF(2^128) tweak doubling, same scheme as in bsaes_xts_encrypt above.
2543 movdqa .Lxts_magic(%rip), $twmask
2544 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Perl-generated 8-block tweak schedule (loop brace/heredocs elided).
2553 for ($i=0;$i<7;$i++) {
2555 pshufd \$0x13, $twtmp, $twres
2557 movdqa @XMM[7], @XMM[$i]
2558 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2559 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2560 pand $twmask, $twres # isolate carry and residue
2561 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2562 pxor $twres, @XMM[7]
2564 $code.=<<___ if ($i>=1);
2565 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2567 $code.=<<___ if ($i>=2);
2568 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Finish loading/XORing blocks 6 and 7, then decrypt all 8.
2572 movdqu 0x60($inp), @XMM[8+6]
2573 pxor @XMM[8+5], @XMM[5]
2574 movdqu 0x70($inp), @XMM[8+7]
2575 lea 0x80($inp), $inp
2576 movdqa @XMM[7], 0x70(%rsp)
2577 pxor @XMM[8+6], @XMM[6]
2578 lea 0x80(%rsp), %rax # pass key schedule
2579 pxor @XMM[8+7], @XMM[7]
2580 mov %edx, %r10d # pass rounds
2582 call _bsaes_decrypt8
# XOR outputs (decryptor register order [0,1,6,4,2,7,3,5]) with the saved
# tweaks and write 8 blocks of plaintext.
2584 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2585 pxor 0x10(%rsp), @XMM[1]
2586 movdqu @XMM[0], 0x00($out) # write output
2587 pxor 0x20(%rsp), @XMM[6]
2588 movdqu @XMM[1], 0x10($out)
2589 pxor 0x30(%rsp), @XMM[4]
2590 movdqu @XMM[6], 0x20($out)
2591 pxor 0x40(%rsp), @XMM[2]
2592 movdqu @XMM[4], 0x30($out)
2593 pxor 0x50(%rsp), @XMM[7]
2594 movdqu @XMM[2], 0x40($out)
2595 pxor 0x60(%rsp), @XMM[3]
2596 movdqu @XMM[7], 0x50($out)
2597 pxor 0x70(%rsp), @XMM[5]
2598 movdqu @XMM[3], 0x60($out)
2599 movdqu @XMM[5], 0x70($out)
2600 lea 0x80($out), $out
# Double tweak[7] once more for the next iteration's starting tweak.
2602 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2604 movdqa .Lxts_magic(%rip), $twmask
2605 pcmpgtd @XMM[7], $twtmp
2606 pshufd \$0x13, $twtmp, $twres
2608 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2609 pand $twmask, $twres # isolate carry and residue
2610 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2611 pxor $twres, @XMM[7]
# Short path (< 8 blocks): tweak schedule with per-block length compare
# (branch targets elided from this listing).
2620 for ($i=0;$i<7;$i++) {
2622 pshufd \$0x13, $twtmp, $twres
2624 movdqa @XMM[7], @XMM[$i]
2625 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2626 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2627 pand $twmask, $twres # isolate carry and residue
2628 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2629 pxor $twres, @XMM[7]
2631 $code.=<<___ if ($i>=1);
2632 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2633 cmp \$`0x10*$i`,$len
2636 $code.=<<___ if ($i>=2);
2637 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# --- seven remaining blocks ---
2641 movdqu 0x60($inp), @XMM[8+6]
2642 pxor @XMM[8+5], @XMM[5]
2643 movdqa @XMM[7], 0x70(%rsp)
2644 lea 0x70($inp), $inp
2645 pxor @XMM[8+6], @XMM[6]
2646 lea 0x80(%rsp), %rax # pass key schedule
2647 mov %edx, %r10d # pass rounds
2649 call _bsaes_decrypt8
2651 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2652 pxor 0x10(%rsp), @XMM[1]
2653 movdqu @XMM[0], 0x00($out) # write output
2654 pxor 0x20(%rsp), @XMM[6]
2655 movdqu @XMM[1], 0x10($out)
2656 pxor 0x30(%rsp), @XMM[4]
2657 movdqu @XMM[6], 0x20($out)
2658 pxor 0x40(%rsp), @XMM[2]
2659 movdqu @XMM[4], 0x30($out)
2660 pxor 0x50(%rsp), @XMM[7]
2661 movdqu @XMM[2], 0x40($out)
2662 pxor 0x60(%rsp), @XMM[3]
2663 movdqu @XMM[7], 0x50($out)
2664 movdqu @XMM[3], 0x60($out)
2665 lea 0x70($out), $out
2667 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# --- six remaining blocks ---
2671 pxor @XMM[8+4], @XMM[4]
2672 lea 0x60($inp), $inp
2673 pxor @XMM[8+5], @XMM[5]
2674 lea 0x80(%rsp), %rax # pass key schedule
2675 mov %edx, %r10d # pass rounds
2677 call _bsaes_decrypt8
2679 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2680 pxor 0x10(%rsp), @XMM[1]
2681 movdqu @XMM[0], 0x00($out) # write output
2682 pxor 0x20(%rsp), @XMM[6]
2683 movdqu @XMM[1], 0x10($out)
2684 pxor 0x30(%rsp), @XMM[4]
2685 movdqu @XMM[6], 0x20($out)
2686 pxor 0x40(%rsp), @XMM[2]
2687 movdqu @XMM[4], 0x30($out)
2688 pxor 0x50(%rsp), @XMM[7]
2689 movdqu @XMM[2], 0x40($out)
2690 movdqu @XMM[7], 0x50($out)
2691 lea 0x60($out), $out
2693 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# --- five remaining blocks ---
2697 pxor @XMM[8+3], @XMM[3]
2698 lea 0x50($inp), $inp
2699 pxor @XMM[8+4], @XMM[4]
2700 lea 0x80(%rsp), %rax # pass key schedule
2701 mov %edx, %r10d # pass rounds
2703 call _bsaes_decrypt8
2705 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2706 pxor 0x10(%rsp), @XMM[1]
2707 movdqu @XMM[0], 0x00($out) # write output
2708 pxor 0x20(%rsp), @XMM[6]
2709 movdqu @XMM[1], 0x10($out)
2710 pxor 0x30(%rsp), @XMM[4]
2711 movdqu @XMM[6], 0x20($out)
2712 pxor 0x40(%rsp), @XMM[2]
2713 movdqu @XMM[4], 0x30($out)
2714 movdqu @XMM[2], 0x40($out)
2715 lea 0x50($out), $out
2717 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# --- four remaining blocks ---
2721 pxor @XMM[8+2], @XMM[2]
2722 lea 0x40($inp), $inp
2723 pxor @XMM[8+3], @XMM[3]
2724 lea 0x80(%rsp), %rax # pass key schedule
2725 mov %edx, %r10d # pass rounds
2727 call _bsaes_decrypt8
2729 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2730 pxor 0x10(%rsp), @XMM[1]
2731 movdqu @XMM[0], 0x00($out) # write output
2732 pxor 0x20(%rsp), @XMM[6]
2733 movdqu @XMM[1], 0x10($out)
2734 pxor 0x30(%rsp), @XMM[4]
2735 movdqu @XMM[6], 0x20($out)
2736 movdqu @XMM[4], 0x30($out)
2737 lea 0x40($out), $out
2739 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# --- three remaining blocks ---
2743 pxor @XMM[8+1], @XMM[1]
2744 lea 0x30($inp), $inp
2745 pxor @XMM[8+2], @XMM[2]
2746 lea 0x80(%rsp), %rax # pass key schedule
2747 mov %edx, %r10d # pass rounds
2749 call _bsaes_decrypt8
2751 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2752 pxor 0x10(%rsp), @XMM[1]
2753 movdqu @XMM[0], 0x00($out) # write output
2754 pxor 0x20(%rsp), @XMM[6]
2755 movdqu @XMM[1], 0x10($out)
2756 movdqu @XMM[6], 0x20($out)
2757 lea 0x30($out), $out
2759 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# --- two remaining blocks ---
2763 pxor @XMM[8+0], @XMM[0]
2764 lea 0x20($inp), $inp
2765 pxor @XMM[8+1], @XMM[1]
2766 lea 0x80(%rsp), %rax # pass key schedule
2767 mov %edx, %r10d # pass rounds
2769 call _bsaes_decrypt8
2771 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2772 pxor 0x10(%rsp), @XMM[1]
2773 movdqu @XMM[0], 0x00($out) # write output
2774 movdqu @XMM[1], 0x10($out)
2775 lea 0x20($out), $out
2777 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# --- single block: XTS the slow way via table-based AES ---
2781 pxor @XMM[0], @XMM[8]
2782 lea 0x10($inp), $inp
2783 movdqa @XMM[8], 0x20(%rbp)
2784 lea 0x20(%rbp), $arg1
2785 lea 0x20(%rbp), $arg2
2787 call asm_AES_decrypt # doesn't touch %xmm
2788 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2789 #pxor @XMM[8], @XMM[0]
2790 #lea 0x80(%rsp), %rax # pass key schedule
2791 #mov %edx, %r10d # pass rounds
2792 #call _bsaes_decrypt8
2793 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2794 movdqu @XMM[0], 0x00($out) # write output
2795 lea 0x10($out), $out
2797 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext-stealing tail for decryption: the *second-to-last* tweak is
# needed for the final partial block, so compute one more doubling
# (current tweak kept in @XMM[6]) and decrypt the last full block with
# the doubled tweak in @XMM[7].
2804 movdqa .Lxts_magic(%rip), $twmask
2805 pcmpgtd @XMM[7], $twtmp
2806 pshufd \$0x13, $twtmp, $twres
2807 movdqa @XMM[7], @XMM[6]
2808 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2809 pand $twmask, $twres # isolate carry and residue
2810 movdqu ($inp), @XMM[0]
2811 pxor $twres, @XMM[7]
2813 lea 0x20(%rbp), $arg1
2814 pxor @XMM[7], @XMM[0]
2815 lea 0x20(%rbp), $arg2
2816 movdqa @XMM[0], 0x20(%rbp)
2818 call asm_AES_decrypt # doesn't touch %xmm
2819 pxor 0x20(%rbp), @XMM[7]
2821 movdqu @XMM[7], ($out)
# Byte-swap loop between tail and last block (loop head/branch elided),
# then decrypt the reassembled block with the saved tweak in @XMM[6].
2824 movzb 16($inp), %eax
2833 movdqu ($out), @XMM[0]
2834 lea 0x20(%rbp), $arg1
2835 pxor @XMM[6], @XMM[0]
2836 lea 0x20(%rbp), $arg2
2837 movdqa @XMM[0], 0x20(%rbp)
2839 call asm_AES_decrypt # doesn't touch %xmm
2840 pxor 0x20(%rbp), @XMM[6]
2841 movdqu @XMM[6], ($out)
# Wipe the stack-resident key schedule and tweak area.
2846 .Lxts_dec_bzero: # wipe key schedule [if any]
2847 movdqa %xmm0, 0x00(%rax)
2848 movdqa %xmm0, 0x10(%rax)
2849 lea 0x20(%rax), %rax
2853 lea (%rbp),%rsp # restore %rsp
# Win64 epilogue: restore saved %xmm registers and the extra frame.
2855 $code.=<<___ if ($win64);
2856 movaps 0x40(%rbp), %xmm6
2857 movaps 0x50(%rbp), %xmm7
2858 movaps 0x60(%rbp), %xmm8
2859 movaps 0x70(%rbp), %xmm9
2860 movaps 0x80(%rbp), %xmm10
2861 movaps 0x90(%rbp), %xmm11
2862 movaps 0xa0(%rbp), %xmm12
2863 movaps 0xb0(%rbp), %xmm13
2864 movaps 0xc0(%rbp), %xmm14
2865 movaps 0xd0(%rbp), %xmm15
2866 lea 0xa0(%rbp), %rsp
# Common epilogue: pop callee-saved GPRs and release the frame.
2869 mov 0x48(%rsp), %r15
2870 mov 0x50(%rsp), %r14
2871 mov 0x58(%rsp), %r13
2872 mov 0x60(%rsp), %r12
2873 mov 0x68(%rsp), %rbx
2874 mov 0x70(%rsp), %rax
2875 lea 0x78(%rsp), %rsp
2879 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Constant tables shared by the bit-sliced routines.  NOTE: this listing
# elides lines, so several of the labels between the .quad pairs below
# (e.g. the ones following .LM0ISR, .LBS0, .LSR, .LSWPUP, .LADD1) are not
# visible here -- the relative offsets used in the code (.LSWPUP at
# .LADD1-0x20, .LSWPUPM0SR at .LADD1-0x10, .LADD2..8 at .LADD1+0x10..0x70)
# imply one label per 16-byte pair.
2883 .type _bsaes_const,\@object
2886 .LM0ISR: # InvShiftRows constants
2887 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2889 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2891 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2892 .LBS0: # bit-slice constants
2893 .quad 0x5555555555555555, 0x5555555555555555
2895 .quad 0x3333333333333333, 0x3333333333333333
2897 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2898 .LSR: # shiftrows constants
2899 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2901 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2903 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2904 .LSWPUP: # byte-swap upper dword
2905 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2907 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2908 .LADD1: # counter increment constants
2909 .quad 0x0000000000000000, 0x0000000100000000
2911 .quad 0x0000000000000000, 0x0000000200000000
2913 .quad 0x0000000000000000, 0x0000000300000000
2915 .quad 0x0000000000000000, 0x0000000400000000
2917 .quad 0x0000000000000000, 0x0000000500000000
2919 .quad 0x0000000000000000, 0x0000000600000000
2921 .quad 0x0000000000000000, 0x0000000700000000
2923 .quad 0x0000000000000000, 0x0000000800000000
# Bit masks 0x01..0x08 replicated across the register (labels elided).
2927 .quad 0x0101010101010101, 0x0101010101010101
2928 .quad 0x0202020202020202, 0x0202020202020202
2929 .quad 0x0404040404040404, 0x0404040404040404
2930 .quad 0x0808080808080808, 0x0808080808080808
2932 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 = AES S-box affine constant, replicated (label elided).
2934 .quad 0x6363636363636363, 0x6363636363636363
2935 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2937 .size _bsaes_const,.-_bsaes_const
# Win64 structured-exception handler shared by all entry points: if the
# fault hit between the recorded prologue and epilogue labels, rebuild the
# saved %xmm and GPR state from the function's frame into the CONTEXT
# record, then chain to RtlVirtualUnwind.  (Entry/branch lines elided.)
2940 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2941 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2949 .extern __imp_RtlVirtualUnwind
2950 .type se_handler,\@abi-omnipotent
2964 mov 120($context),%rax # pull context->Rax
2965 mov 248($context),%rbx # pull context->Rip
2967 mov 8($disp),%rsi # disp->ImageBase
2968 mov 56($disp),%r11 # disp->HandlerData
# HandlerData[0]/[1] hold image-relative prologue/epilogue labels; only
# faults strictly inside the body need frame reconstruction.
2970 mov 0(%r11),%r10d # HandlerData[0]
2971 lea (%rsi,%r10),%r10 # prologue label
2972 cmp %r10,%rbx # context->Rip<prologue label
2975 mov 152($context),%rax # pull context->Rsp
2977 mov 4(%r11),%r10d # HandlerData[1]
2978 lea (%rsi,%r10),%r10 # epilogue label
2979 cmp %r10,%rbx # context->Rip>=epilogue label
2982 mov 160($context),%rax # pull context->Rbp
# Copy the ten saved %xmm registers (0x40(%rbp)..) into context.Xmm6..15.
2984 lea 0x40(%rax),%rsi # %xmm save area
2985 lea 512($context),%rdi # &context.Xmm6
2986 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2987 .long 0xa548f3fc # cld; rep movsq
2988 lea 0xa0(%rax),%rax # adjust stack pointer
# Reload the callee-saved GPRs from the frame (the load instructions are
# elided from this listing) and patch them into the CONTEXT record.
2996 lea 0x78(%rax),%rax # adjust stack pointer
2997 mov %rbx,144($context) # restore context->Rbx
2998 mov %rbp,160($context) # restore context->Rbp
2999 mov %r12,216($context) # restore context->R12
3000 mov %r13,224($context) # restore context->R13
3001 mov %r14,232($context) # restore context->R14
3002 mov %r15,240($context) # restore context->R15
3005 mov %rax,152($context) # restore context->Rsp
# Copy the whole CONTEXT into disp->ContextRecord, then let
# RtlVirtualUnwind continue the unwind from the adjusted state.
3007 mov 40($disp),%rdi # disp->ContextRecord
3008 mov $context,%rsi # context
3009 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3010 .long 0xa548f3fc # cld; rep movsq
3013 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3014 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3015 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3016 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3017 mov 40(%rsi),%r10 # disp->ContextRecord
3018 lea 56(%rsi),%r11 # &disp->HandlerData
3019 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3020 mov %r10,32(%rsp) # arg5
3021 mov %r11,40(%rsp) # arg6
3022 mov %r12,48(%rsp) # arg7
3023 mov %rcx,56(%rsp) # arg8, (NULL)
3024 call *__imp_RtlVirtualUnwind(%rip)
3026 mov \$1,%eax # ExceptionContinueSearch
3038 .size se_handler,.-se_handler
# Win64 SEH tables: .pdata entries (function begin/end RVAs) and .xdata
# HandlerData (body-begin / epilogue RVAs consumed by se_handler above).
# ECB entries are emitted only when $ecb is enabled.
3043 $code.=<<___ if ($ecb);
3044 .rva .Lecb_enc_prologue
3045 .rva .Lecb_enc_epilogue
3048 .rva .Lecb_dec_prologue
3049 .rva .Lecb_dec_epilogue
3053 .rva .Lcbc_dec_prologue
3054 .rva .Lcbc_dec_epilogue
3057 .rva .Lctr_enc_prologue
3058 .rva .Lctr_enc_epilogue
3061 .rva .Lxts_enc_prologue
3062 .rva .Lxts_enc_epilogue
3065 .rva .Lxts_dec_prologue
3066 .rva .Lxts_dec_epilogue
3072 $code.=<<___ if ($ecb);
3076 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3080 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3086 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3090 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3094 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3098 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# Final pass: evaluate every `...` backtick expression embedded in the
# accumulated assembly text (e.g. \$`128-32`) before it is printed.
3102 $code =~ s/\`([^\`]*)\`/eval($1)/gem;