2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Author: Emilia Käsper and Peter Schwabe ###
15 ### Date: 2009-03-19 ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
24 # Started as a transliteration to "perlasm", the original code has
25 # since undergone the following changes:
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - the above was possible thanks to a mixcolumns() modification that
31 # allows feeding its output back to aesenc[last]; this was
32 # achieved at the cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement key setup subroutine, instead it
35 # relies on conversion of "conventional" key schedule as returned
36 # by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which made it
38 # possible to skip one shiftrows(), reduce the bit-sliced key schedule
39 # and speed up conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
42 # Resulting performance in CPU cycles spent to encrypt one byte out
43 # of 4096-byte buffer with 128-bit key is:
45 # Emilia's this(*) difference
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
53 # (*) Comparison is not completely fair, because "this" is ECB,
54 # i.e. no extra processing such as counter value calculation
55 # and xor-ing of the input, as in Emilia's CTR implementation, is
56 # performed. However, the CTR calculations account for no more
57 # than 1% of total time, so the comparison is *rather* fair.
59 # (**) Results were collected on Westmere, which is considered to
60 # be equivalent to Nehalem for this code.
62 # As for the key schedule conversion subroutine: the interface to
63 # OpenSSL relies on per-invocation on-the-fly conversion, which
64 # naturally impacts performance, especially for short inputs. Conversion
65 # time in CPU cycles, and its ratio to CPU cycles spent in the 8x block:
68 # conversion conversion/8x block
73 # The ratio values mean that 128-byte blocks will be processed
74 # 16-18% slower, 256-byte blocks 9-10% slower, 384-byte blocks 6-7%,
75 # etc. Keep in mind also that input sizes not divisible by 128 are
76 # *effectively* slower, especially the shortest ones, e.g. consecutive
77 # 144-byte blocks are processed 44% slower than one would expect,
78 # 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all these "shortcomings"
79 # it's still faster than ["hyper-threading-safe" code path in]
80 # aes-x86_64.pl on all lengths above 64 bytes...
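#
# To see where these figures come from, note that each call pays for the
# key schedule conversion once, while the 8x block function runs once
# per 128 bytes. A back-of-envelope sketch in plain Perl (illustrative
# only, with an assumed conversion/8x-block ratio in the range of the
# table above):
#
#	my $ratio = 0.17;			# assumed conversion/(8x block)
#	for my $blocks (1, 2, 3) {		# 128-, 256-, 384-byte inputs
#		printf "%4d bytes: ~%.0f%% slower\n",
#		       128*$blocks, 100*$ratio/$blocks;
#	}
#
# The overhead amortizes linearly with the number of 128-byte units,
# while a trailing partial unit still costs close to a full 8x pass,
# hence the disproportional penalty for sizes like 144 or 272 bytes.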
84 # Add decryption procedure. Performance in CPU cycles spent to decrypt
85 # one byte out of a 4096-byte buffer with a 128-bit key is:
95 # Add bsaes_xts_[en|de]crypt. Performance on inputs shorter than 80 bytes
96 # is suboptimal, but XTS is meant to be used with larger blocks anyway...
102 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
107 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
108 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
109 die "can't locate x86_64-xlate.pl";
111 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
114 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
115 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
116 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
119 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
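#
# The permuted output order is not undone here; the callers compensate
# by indexing registers accordingly, e.g. the encrypt path below invokes
# MixColumns()/bitslice() as @XMM[0,1,4,6,3,7,2,5,...]. Illustrative
# mapping (plain Perl, reference only):
#
#	my @sbox_order = (0,1,4,6,3,7,2,5);	# slot $i carries input bit $sbox_order[$i]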
128 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
129 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
133 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
134 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
156 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
157 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
177 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
178 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
182 &InvInBasisChange (@b);
183 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
184 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
187 sub InvInBasisChange { # OutBasisChange in reverse
188 my @b=@_[5,1,2,6,3,7,0,4];
206 sub InvOutBasisChange { # InBasisChange in reverse
207 my @b=@_[2,5,7,3,6,1,0,4];
228 #;*************************************************************
229 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
230 #;*************************************************************
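#
# For orientation: in GF(4) = GF(2)[W]/(W^2+W+1), with elements written
# as bit pairs, the product of (a1,a0) and (b1,b0) is
# (a1b1 ^ a1b0 ^ a0b1, a0b0 ^ a1b1). Plain-Perl reference (illustrative
# only; the SSE sequence below computes the analogous bit-sliced product
# on 128 independent bit pairs per register, in the basis set up by
# InBasisChange):
#
#	sub mul_gf4_ref {
#		my ($a,$b) = @_;		# 2-bit values, bit1<<1|bit0
#		my ($a0,$a1) = ($a & 1, $a >> 1);
#		my ($b0,$b1) = ($b & 1, $b >> 1);
#		my $c1 = ($a1 & $b1) ^ ($a1 & $b0) ^ ($a0 & $b1);
#		my $c0 = ($a0 & $b0) ^ ($a1 & $b1);
#		return $c1 << 1 | $c0;
#	}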
231 my ($x0,$x1,$y0,$y1,$t0)=@_;
244 sub Mul_GF4_N { # not used, see next subroutine
245 # multiply and scale by N
246 my ($x0,$x1,$y0,$y1,$t0)=@_;
260 # interleaved Mul_GF4_N and Mul_GF4
261 my ($x0,$x1,$y0,$y1,$t0,
262 $x2,$x3,$y2,$y3,$t1)=@_;
290 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
297 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
298 @x[2], @x[3], @y[2], @y[3], @t[2]);
310 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
311 @x[6], @x[7], @y[2], @y[3], @t[2]);
316 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
325 #;********************************************************************
326 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
327 #;********************************************************************
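#
# What this computes, per bit position, is the multiplicative inverse in
# GF(2^8) (with 0 mapped to 0), up to the representation changes done by
# In/OutBasisChange; the GF(((2^2)^2)^2) tower below is merely a cheaper
# route to the values a brute-force oracle over the AES polynomial 0x11b
# would produce. Plain-Perl oracle (illustrative only):
#
#	sub gmul_ref {			# GF(2^8) multiply modulo 0x11b
#		my ($a,$b) = @_; my $p = 0;
#		for (0..7) {
#			$p ^= $a if $b & 1;
#			$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
#			$b >>= 1;
#		}
#		return $p;
#	}
#	sub ginv_ref {			# inverse by exhaustive search
#		my $a = shift; return 0 unless $a;
#		for my $b (1..255) { return $b if gmul_ref($a,$b) == 1; }
#	}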
331 # direct optimizations from hardware
386 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
388 # new smaller inversion
422 # output in s3, s2, s1, t1
424 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
426 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
427 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
429 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
432 # AES linear components
438 pxor 0x00($key),@x[0]
439 pxor 0x10($key),@x[1]
440 pxor 0x20($key),@x[2]
441 pxor 0x30($key),@x[3]
444 pxor 0x40($key),@x[4]
445 pxor 0x50($key),@x[5]
448 pxor 0x60($key),@x[6]
449 pxor 0x70($key),@x[7]
459 # modified to emit output in order suitable for feeding back to aesenc[last]
462 my $inv=@_[16]; # optional
464 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
465 pshufd \$0x93, @x[1], @t[1]
466 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
467 pshufd \$0x93, @x[2], @t[2]
469 pshufd \$0x93, @x[3], @t[3]
471 pshufd \$0x93, @x[4], @t[4]
473 pshufd \$0x93, @x[5], @t[5]
475 pshufd \$0x93, @x[6], @t[6]
477 pshufd \$0x93, @x[7], @t[7]
484 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
486 pshufd \$0x4E, @x[1], @x[1]
492 pshufd \$0x4E, @x[4], @t[0]
494 pshufd \$0x4E, @x[5], @t[1]
496 pshufd \$0x4E, @x[3], @x[4]
498 pshufd \$0x4E, @x[7], @x[5]
500 pshufd \$0x4E, @x[6], @x[3]
502 pshufd \$0x4E, @x[2], @x[6]
505 $code.=<<___ if (!$inv);
513 $code.=<<___ if ($inv);
526 sub InvMixColumns_orig {
531 # multiplication by 0x0e
532 pshufd \$0x93, @x[7], @t[7]
534 pxor @x[5], @x[7] # 7 5
535 pxor @x[5], @x[2] # 2 5
536 pshufd \$0x93, @x[0], @t[0]
538 pxor @x[0], @x[5] # 5 0 [1]
539 pxor @x[1], @x[0] # 0 1
540 pshufd \$0x93, @x[1], @t[1]
541 pxor @x[2], @x[1] # 1 25
542 pxor @x[6], @x[0] # 01 6 [2]
543 pxor @x[3], @x[1] # 125 3 [4]
544 pshufd \$0x93, @x[3], @t[3]
545 pxor @x[0], @x[2] # 25 016 [3]
546 pxor @x[7], @x[3] # 3 75
547 pxor @x[6], @x[7] # 75 6 [0]
548 pshufd \$0x93, @x[6], @t[6]
550 pxor @x[4], @x[6] # 6 4
551 pxor @x[3], @x[4] # 4 375 [6]
552 pxor @x[7], @x[3] # 375 756=36
553 pxor @t[5], @x[6] # 64 5 [7]
554 pxor @t[2], @x[3] # 36 2
555 pxor @t[4], @x[3] # 362 4 [5]
556 pshufd \$0x93, @t[5], @t[5]
558 my @y = @x[7,5,0,2,1,3,4,6];
560 # multiplication by 0x0b
564 pshufd \$0x93, @t[2], @t[2]
568 pshufd \$0x93, @t[4], @t[4]
569 pxor @t[6], @t[7] # clobber t[7]
573 pshufd \$0x93, @t[0], @t[0]
577 pshufd \$0x93, @t[1], @t[1]
581 pshufd \$0x93, @t[2], @t[2]
585 pshufd \$0x93, @t[3], @t[3]
591 pxor @t[5], @t[7] # clobber t[7] even more
594 pshufd \$0x93, @t[4], @t[4]
599 pshufd \$0x93, @t[5], @t[5]
600 pxor @t[6], @t[7] # restore t[7]
602 # multiplication by 0x0d
605 pshufd \$0x93, @t[6], @t[6]
609 pshufd \$0x93, @t[7], @t[7]
618 pshufd \$0x93, @t[0], @t[0]
622 pshufd \$0x93, @t[1], @t[1]
627 pshufd \$0x93, @t[2], @t[2]
629 pxor @t[3], @t[6] # clobber t[6]
636 pshufd \$0x93, @t[4], @t[4]
639 pxor @t[3], @t[6] # restore t[6]
641 pshufd \$0x93, @t[5], @t[5]
642 pshufd \$0x93, @t[6], @t[6]
643 pshufd \$0x93, @t[7], @t[7]
644 pshufd \$0x93, @t[3], @t[3]
646 # multiplication by 0x09
648 pxor @y[1], @t[1] # t[1]=y[1]
649 pxor @t[5], @t[0] # clobber t[0]
652 pxor @y[0], @t[0] # t[0]=y[0]
654 pxor @t[7], @t[6] # clobber t[6]
657 pxor @y[4], @t[4] # t[4]=y[4]
659 pxor @y[3], @t[3] # t[3]=y[3]
661 pxor @y[2], @t[2] # t[2]=y[2]
663 pxor @y[5], @t[5] # t[5]=y[5]
666 pxor @y[6], @t[6] # t[6]=y[6]
667 pxor @y[7], @t[7] # t[7]=y[7]
684 # Thanks to Jussi Kivilinna for providing a pointer to the identity
686 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
687 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
688 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
689 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
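#
# The identity is easy to check numerically with a GF(2^8) multiply such
# as gmul_ref() sketched near Inv_GF256 above (plain Perl, illustrative
# only):
#
#	my @mc  = ([2,3,1,1],[1,2,3,1],[1,1,2,3],[3,1,1,2]);
#	my @m5  = ([5,0,4,0],[0,5,0,4],[4,0,5,0],[0,4,0,5]);
#	my @imc = ([0xe,0xb,0xd,0x9],[0x9,0xe,0xb,0xd],
#		   [0xd,0x9,0xe,0xb],[0xb,0xd,0x9,0xe]);
#	for my $i (0..3) { for my $j (0..3) {
#		my $e = 0;
#		$e ^= gmul_ref($mc[$i][$_], $m5[$_][$j]) for 0..3;
#		die "mismatch" unless $e == $imc[$i][$j];
#	}}
#
# which is why InvMixColumns can be done as the cheap 05-00-04-00
# multiply below followed by the already-implemented MixColumns.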
692 # multiplication by 0x05-0x00-0x04-0x00
693 pshufd \$0x4E, @x[0], @t[0]
694 pshufd \$0x4E, @x[6], @t[6]
696 pshufd \$0x4E, @x[7], @t[7]
698 pshufd \$0x4E, @x[1], @t[1]
700 pshufd \$0x4E, @x[2], @t[2]
702 pshufd \$0x4E, @x[3], @t[3]
706 pshufd \$0x4E, @x[4], @t[4]
710 pshufd \$0x4E, @x[5], @t[5]
725 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
728 sub aesenc { # not used
732 movdqa 0x30($const),@t[0] # .LSR
734 &ShiftRows (@b,@t[0]);
736 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
739 sub aesenclast { # not used
743 movdqa 0x40($const),@t[0] # .LSRM0
745 &ShiftRows (@b,@t[0]);
748 pxor 0x00($key),@b[0]
749 pxor 0x10($key),@b[1]
750 pxor 0x20($key),@b[4]
751 pxor 0x30($key),@b[6]
752 pxor 0x40($key),@b[3]
753 pxor 0x50($key),@b[7]
754 pxor 0x60($key),@b[2]
755 pxor 0x70($key),@b[5]
760 my ($a,$b,$n,$mask,$t)=@_;
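#
# This is the classic "delta swap": exchange the bit groups of $a and $b
# selected by $mask, with $b's group taken $n bits up. Scalar equivalent
# in plain Perl (reference only):
#
#	sub swapmove_ref {
#		my ($a,$b,$n,$mask) = @_;
#		my $t = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $t, $b ^ ($t << $n));
#	}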
772 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
792 my @x=reverse(@_[0..7]);
793 my ($t0,$t1,$t2,$t3)=@_[8..11];
795 movdqa 0x00($const),$t0 # .LBS0
796 movdqa 0x10($const),$t1 # .LBS1
798 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
799 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
801 movdqa 0x20($const),$t0 # .LBS2
803 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
804 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
806 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
807 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
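#
# The three passes with shifts 1, 2 and 4 are a textbook 8x8 bit-matrix
# transposition: afterwards (modulo the reverse() register ordering
# above) register $i holds bit $i of every input byte. Plain-Perl
# equivalent on eight bytes (reference only):
#
#	sub bitslice_ref {
#		my @rows = @_;			# eight 8-bit values
#		my @planes = (0) x 8;
#		for my $i (0..7) {
#			for my $j (0..7) {
#				$planes[$j] |= (($rows[$i] >> $j) & 1) << $i;
#			}
#		}
#		return @planes;
#	}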
813 .extern asm_AES_encrypt
814 .extern asm_AES_decrypt
816 .type _bsaes_encrypt8,\@abi-omnipotent
820 lea .LBS0(%rip), $const # constants table
822 movdqa ($key), @XMM[9] # round 0 key
824 movdqa 0x50($const), @XMM[8] # .LM0SR
825 pxor @XMM[9], @XMM[0] # xor with round0 key
826 pxor @XMM[9], @XMM[1]
827 pxor @XMM[9], @XMM[2]
828 pxor @XMM[9], @XMM[3]
829 pshufb @XMM[8], @XMM[0]
830 pshufb @XMM[8], @XMM[1]
831 pxor @XMM[9], @XMM[4]
832 pxor @XMM[9], @XMM[5]
833 pshufb @XMM[8], @XMM[2]
834 pshufb @XMM[8], @XMM[3]
835 pxor @XMM[9], @XMM[6]
836 pxor @XMM[9], @XMM[7]
837 pshufb @XMM[8], @XMM[4]
838 pshufb @XMM[8], @XMM[5]
839 pshufb @XMM[8], @XMM[6]
840 pshufb @XMM[8], @XMM[7]
841 _bsaes_encrypt8_bitslice:
843 &bitslice (@XMM[0..7, 8..11]);
850 &ShiftRows (@XMM[0..7, 8]);
851 $code.=".Lenc_sbox:\n";
852 &Sbox (@XMM[0..7, 8..15]);
857 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
859 movdqa 0x30($const), @XMM[8] # .LSR
861 movdqa 0x40($const), @XMM[8] # .LSRM0
866 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
867 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
869 movdqa ($key), @XMM[8] # last round key
870 pxor @XMM[8], @XMM[4]
871 pxor @XMM[8], @XMM[6]
872 pxor @XMM[8], @XMM[3]
873 pxor @XMM[8], @XMM[7]
874 pxor @XMM[8], @XMM[2]
875 pxor @XMM[8], @XMM[5]
876 pxor @XMM[8], @XMM[0]
877 pxor @XMM[8], @XMM[1]
880 .size _bsaes_encrypt8,.-_bsaes_encrypt8
882 .type _bsaes_decrypt8,\@abi-omnipotent
886 lea .LBS0(%rip), $const # constants table
888 movdqa ($key), @XMM[9] # round 0 key
890 movdqa -0x30($const), @XMM[8] # .LM0ISR
891 pxor @XMM[9], @XMM[0] # xor with round0 key
892 pxor @XMM[9], @XMM[1]
893 pxor @XMM[9], @XMM[2]
894 pxor @XMM[9], @XMM[3]
895 pshufb @XMM[8], @XMM[0]
896 pshufb @XMM[8], @XMM[1]
897 pxor @XMM[9], @XMM[4]
898 pxor @XMM[9], @XMM[5]
899 pshufb @XMM[8], @XMM[2]
900 pshufb @XMM[8], @XMM[3]
901 pxor @XMM[9], @XMM[6]
902 pxor @XMM[9], @XMM[7]
903 pshufb @XMM[8], @XMM[4]
904 pshufb @XMM[8], @XMM[5]
905 pshufb @XMM[8], @XMM[6]
906 pshufb @XMM[8], @XMM[7]
908 &bitslice (@XMM[0..7, 8..11]);
915 &ShiftRows (@XMM[0..7, 8]);
916 $code.=".Ldec_sbox:\n";
917 &InvSbox (@XMM[0..7, 8..15]);
922 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
924 movdqa -0x10($const), @XMM[8] # .LISR
926 movdqa -0x20($const), @XMM[8] # .LISRM0
931 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
933 movdqa ($key), @XMM[8] # last round key
934 pxor @XMM[8], @XMM[6]
935 pxor @XMM[8], @XMM[4]
936 pxor @XMM[8], @XMM[2]
937 pxor @XMM[8], @XMM[7]
938 pxor @XMM[8], @XMM[3]
939 pxor @XMM[8], @XMM[5]
940 pxor @XMM[8], @XMM[0]
941 pxor @XMM[8], @XMM[1]
944 .size _bsaes_decrypt8,.-_bsaes_decrypt8
948 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
951 my @x=reverse(@_[0..7]);
952 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
954 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
956 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
960 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
962 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
964 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
970 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
971 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
975 .type _bsaes_key_convert,\@abi-omnipotent
979 lea .Lmasks(%rip), $const
980 movdqu ($inp), %xmm7 # load round 0 key
982 movdqa 0x00($const), %xmm0 # 0x01...
983 movdqa 0x10($const), %xmm1 # 0x02...
984 movdqa 0x20($const), %xmm2 # 0x04...
985 movdqa 0x30($const), %xmm3 # 0x08...
986 movdqa 0x40($const), %xmm4 # .LM0
987 pcmpeqd %xmm5, %xmm5 # .LNOT
989 movdqu ($inp), %xmm6 # load round 1 key
990 movdqa %xmm7, ($out) # save round 0 key
996 pshufb %xmm4, %xmm6 # .LM0
1003 movdqa %xmm2, %xmm10
1004 pcmpeqb %xmm0, %xmm8
1005 psllq \$4, %xmm0 # 0x10...
1006 movdqa %xmm3, %xmm11
1007 pcmpeqb %xmm1, %xmm9
1008 psllq \$4, %xmm1 # 0x20...
1012 movdqa %xmm0, %xmm12
1013 pcmpeqb %xmm2, %xmm10
1014 psllq \$4, %xmm2 # 0x40...
1015 movdqa %xmm1, %xmm13
1016 pcmpeqb %xmm3, %xmm11
1017 psllq \$4, %xmm3 # 0x80...
1019 movdqa %xmm2, %xmm14
1020 movdqa %xmm3, %xmm15
1021 pxor %xmm5, %xmm8 # "pnot"
1026 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1027 pcmpeqb %xmm0, %xmm12
1028 psrlq \$4, %xmm0 # 0x01...
1029 movdqa %xmm9, 0x10($out)
1030 pcmpeqb %xmm1, %xmm13
1031 psrlq \$4, %xmm1 # 0x02...
1032 lea 0x10($inp), $inp
1036 movdqa %xmm10, 0x20($out)
1037 pcmpeqb %xmm2, %xmm14
1038 psrlq \$4, %xmm2 # 0x04...
1039 movdqa %xmm11, 0x30($out)
1040 pcmpeqb %xmm3, %xmm15
1041 psrlq \$4, %xmm3 # 0x08...
1042 movdqu ($inp), %xmm6 # load next round key
1044 pxor %xmm5, %xmm13 # "pnot"
1046 movdqa %xmm12, 0x40($out)
1047 movdqa %xmm13, 0x50($out)
1048 movdqa %xmm14, 0x60($out)
1049 movdqa %xmm15, 0x70($out)
1054 movdqa 0x50($const), %xmm7 # .L63
1055 #movdqa %xmm6, ($out) # don't save last round key
1058 .size _bsaes_key_convert,.-_bsaes_key_convert
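#
# In other words: each byte of a round key becomes eight 0x00/0xff byte
# masks, one per bit (the pcmpeqb against the 0x01...0x80 masks above),
# and the "pnot" fixups on bit planes 0, 1, 5 and 6 fold the S-box
# affine constant 0x63 (bits 0,1,5,6) into the schedule, so it never has
# to be XORed in the round function. Per-byte sketch in plain Perl
# (illustrative only):
#
#	my $k = 0x2b;					# some key byte
#	my @plane = map { ($k >> $_) & 1 ? 0xff : 0x00 } 0..7;
#	$plane[$_] ^= 0xff for (0, 1, 5, 6);		# fold in 0x63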
1062 if (0 && !$win64) { # the following four functions are an unsupported
1063 # interface, used only for benchmarking...
1065 .globl bsaes_enc_key_convert
1066 .type bsaes_enc_key_convert,\@function,2
1068 bsaes_enc_key_convert:
1069 mov 240($inp),%r10d # pass rounds
1070 mov $inp,%rcx # pass key
1071 mov $out,%rax # pass key schedule
1072 call _bsaes_key_convert
1073 pxor %xmm6,%xmm7 # fix up last round key
1074 movdqa %xmm7,(%rax) # save last round key
1076 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1078 .globl bsaes_encrypt_128
1079 .type bsaes_encrypt_128,\@function,4
1083 movdqu 0x00($inp), @XMM[0] # load input
1084 movdqu 0x10($inp), @XMM[1]
1085 movdqu 0x20($inp), @XMM[2]
1086 movdqu 0x30($inp), @XMM[3]
1087 movdqu 0x40($inp), @XMM[4]
1088 movdqu 0x50($inp), @XMM[5]
1089 movdqu 0x60($inp), @XMM[6]
1090 movdqu 0x70($inp), @XMM[7]
1091 mov $key, %rax # pass the $key
1092 lea 0x80($inp), $inp
1095 call _bsaes_encrypt8
1097 movdqu @XMM[0], 0x00($out) # write output
1098 movdqu @XMM[1], 0x10($out)
1099 movdqu @XMM[4], 0x20($out)
1100 movdqu @XMM[6], 0x30($out)
1101 movdqu @XMM[3], 0x40($out)
1102 movdqu @XMM[7], 0x50($out)
1103 movdqu @XMM[2], 0x60($out)
1104 movdqu @XMM[5], 0x70($out)
1105 lea 0x80($out), $out
1109 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1111 .globl bsaes_dec_key_convert
1112 .type bsaes_dec_key_convert,\@function,2
1114 bsaes_dec_key_convert:
1115 mov 240($inp),%r10d # pass rounds
1116 mov $inp,%rcx # pass key
1117 mov $out,%rax # pass key schedule
1118 call _bsaes_key_convert
1119 pxor ($out),%xmm7 # fix up round 0 key
1120 movdqa %xmm6,(%rax) # save last round key
1123 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1125 .globl bsaes_decrypt_128
1126 .type bsaes_decrypt_128,\@function,4
1130 movdqu 0x00($inp), @XMM[0] # load input
1131 movdqu 0x10($inp), @XMM[1]
1132 movdqu 0x20($inp), @XMM[2]
1133 movdqu 0x30($inp), @XMM[3]
1134 movdqu 0x40($inp), @XMM[4]
1135 movdqu 0x50($inp), @XMM[5]
1136 movdqu 0x60($inp), @XMM[6]
1137 movdqu 0x70($inp), @XMM[7]
1138 mov $key, %rax # pass the $key
1139 lea 0x80($inp), $inp
1142 call _bsaes_decrypt8
1144 movdqu @XMM[0], 0x00($out) # write output
1145 movdqu @XMM[1], 0x10($out)
1146 movdqu @XMM[6], 0x20($out)
1147 movdqu @XMM[4], 0x30($out)
1148 movdqu @XMM[2], 0x40($out)
1149 movdqu @XMM[7], 0x50($out)
1150 movdqu @XMM[3], 0x60($out)
1151 movdqu @XMM[5], 0x70($out)
1152 lea 0x80($out), $out
1156 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1160 ######################################################################
1164 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1165 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1166 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1170 .globl bsaes_ecb_encrypt_blocks
1171 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1173 bsaes_ecb_encrypt_blocks:
1189 lea -0x48(%rsp),%rsp
1190 .cfi_adjust_cfa_offset 0x48
1192 $code.=<<___ if ($win64);
1193 lea -0xa0(%rsp), %rsp
1194 movaps %xmm6, 0x40(%rsp)
1195 movaps %xmm7, 0x50(%rsp)
1196 movaps %xmm8, 0x60(%rsp)
1197 movaps %xmm9, 0x70(%rsp)
1198 movaps %xmm10, 0x80(%rsp)
1199 movaps %xmm11, 0x90(%rsp)
1200 movaps %xmm12, 0xa0(%rsp)
1201 movaps %xmm13, 0xb0(%rsp)
1202 movaps %xmm14, 0xc0(%rsp)
1203 movaps %xmm15, 0xd0(%rsp)
1207 mov %rsp,%rbp # backup %rsp
1208 .cfi_def_cfa_register %rbp
1209 mov 240($arg4),%eax # rounds
1210 mov $arg1,$inp # backup arguments
1217 mov %eax,%ebx # backup rounds
1218 shl \$7,%rax # 128 bytes per inner round key
1219 sub \$`128-32`,%rax # size of bit-sliced key schedule
1221 mov %rsp,%rax # pass key schedule
1222 mov $key,%rcx # pass key
1223 mov %ebx,%r10d # pass rounds
1224 call _bsaes_key_convert
1225 pxor %xmm6,%xmm7 # fix up last round key
1226 movdqa %xmm7,(%rax) # save last round key
1230 movdqu 0x00($inp), @XMM[0] # load input
1231 movdqu 0x10($inp), @XMM[1]
1232 movdqu 0x20($inp), @XMM[2]
1233 movdqu 0x30($inp), @XMM[3]
1234 movdqu 0x40($inp), @XMM[4]
1235 movdqu 0x50($inp), @XMM[5]
1236 mov %rsp, %rax # pass key schedule
1237 movdqu 0x60($inp), @XMM[6]
1238 mov %ebx,%r10d # pass rounds
1239 movdqu 0x70($inp), @XMM[7]
1240 lea 0x80($inp), $inp
1242 call _bsaes_encrypt8
1244 movdqu @XMM[0], 0x00($out) # write output
1245 movdqu @XMM[1], 0x10($out)
1246 movdqu @XMM[4], 0x20($out)
1247 movdqu @XMM[6], 0x30($out)
1248 movdqu @XMM[3], 0x40($out)
1249 movdqu @XMM[7], 0x50($out)
1250 movdqu @XMM[2], 0x60($out)
1251 movdqu @XMM[5], 0x70($out)
1252 lea 0x80($out), $out
1259 movdqu 0x00($inp), @XMM[0] # load input
1260 mov %rsp, %rax # pass key schedule
1261 mov %ebx,%r10d # pass rounds
1264 movdqu 0x10($inp), @XMM[1]
1266 movdqu 0x20($inp), @XMM[2]
1269 movdqu 0x30($inp), @XMM[3]
1271 movdqu 0x40($inp), @XMM[4]
1274 movdqu 0x50($inp), @XMM[5]
1276 movdqu 0x60($inp), @XMM[6]
1277 call _bsaes_encrypt8
1278 movdqu @XMM[0], 0x00($out) # write output
1279 movdqu @XMM[1], 0x10($out)
1280 movdqu @XMM[4], 0x20($out)
1281 movdqu @XMM[6], 0x30($out)
1282 movdqu @XMM[3], 0x40($out)
1283 movdqu @XMM[7], 0x50($out)
1284 movdqu @XMM[2], 0x60($out)
1288 call _bsaes_encrypt8
1289 movdqu @XMM[0], 0x00($out) # write output
1290 movdqu @XMM[1], 0x10($out)
1291 movdqu @XMM[4], 0x20($out)
1292 movdqu @XMM[6], 0x30($out)
1293 movdqu @XMM[3], 0x40($out)
1294 movdqu @XMM[7], 0x50($out)
1298 call _bsaes_encrypt8
1299 movdqu @XMM[0], 0x00($out) # write output
1300 movdqu @XMM[1], 0x10($out)
1301 movdqu @XMM[4], 0x20($out)
1302 movdqu @XMM[6], 0x30($out)
1303 movdqu @XMM[3], 0x40($out)
1307 call _bsaes_encrypt8
1308 movdqu @XMM[0], 0x00($out) # write output
1309 movdqu @XMM[1], 0x10($out)
1310 movdqu @XMM[4], 0x20($out)
1311 movdqu @XMM[6], 0x30($out)
1315 call _bsaes_encrypt8
1316 movdqu @XMM[0], 0x00($out) # write output
1317 movdqu @XMM[1], 0x10($out)
1318 movdqu @XMM[4], 0x20($out)
1322 call _bsaes_encrypt8
1323 movdqu @XMM[0], 0x00($out) # write output
1324 movdqu @XMM[1], 0x10($out)
1328 call _bsaes_encrypt8
1329 movdqu @XMM[0], 0x00($out) # write output
1336 call asm_AES_encrypt
1345 .Lecb_enc_bzero: # wipe key schedule [if any]
1346 movdqa %xmm0, 0x00(%rax)
1347 movdqa %xmm0, 0x10(%rax)
1348 lea 0x20(%rax), %rax
1355 $code.=<<___ if ($win64);
1356 movaps 0x40(%rbp), %xmm6
1357 movaps 0x50(%rbp), %xmm7
1358 movaps 0x60(%rbp), %xmm8
1359 movaps 0x70(%rbp), %xmm9
1360 movaps 0x80(%rbp), %xmm10
1361 movaps 0x90(%rbp), %xmm11
1362 movaps 0xa0(%rbp), %xmm12
1363 movaps 0xb0(%rbp), %xmm13
1364 movaps 0xc0(%rbp), %xmm14
1365 movaps 0xd0(%rbp), %xmm15
1366 lea 0xa0(%rax), %rax
1382 lea (%rax), %rsp # restore %rsp
1383 .cfi_def_cfa_register %rsp
1387 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1389 .globl bsaes_ecb_decrypt_blocks
1390 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1392 bsaes_ecb_decrypt_blocks:
1408 lea -0x48(%rsp),%rsp
1409 .cfi_adjust_cfa_offset 0x48
1411 $code.=<<___ if ($win64);
1412 lea -0xa0(%rsp), %rsp
1413 movaps %xmm6, 0x40(%rsp)
1414 movaps %xmm7, 0x50(%rsp)
1415 movaps %xmm8, 0x60(%rsp)
1416 movaps %xmm9, 0x70(%rsp)
1417 movaps %xmm10, 0x80(%rsp)
1418 movaps %xmm11, 0x90(%rsp)
1419 movaps %xmm12, 0xa0(%rsp)
1420 movaps %xmm13, 0xb0(%rsp)
1421 movaps %xmm14, 0xc0(%rsp)
1422 movaps %xmm15, 0xd0(%rsp)
1426 mov %rsp,%rbp # backup %rsp
1427 .cfi_def_cfa_register %rbp
1428 mov 240($arg4),%eax # rounds
1429 mov $arg1,$inp # backup arguments
1436 mov %eax,%ebx # backup rounds
1437 shl \$7,%rax # 128 bytes per inner round key
1438 sub \$`128-32`,%rax # size of bit-sliced key schedule
1440 mov %rsp,%rax # pass key schedule
1441 mov $key,%rcx # pass key
1442 mov %ebx,%r10d # pass rounds
1443 call _bsaes_key_convert
1444 pxor (%rsp),%xmm7 # fix up round 0 key
1445 movdqa %xmm6,(%rax) # save last round key
1450 movdqu 0x00($inp), @XMM[0] # load input
1451 movdqu 0x10($inp), @XMM[1]
1452 movdqu 0x20($inp), @XMM[2]
1453 movdqu 0x30($inp), @XMM[3]
1454 movdqu 0x40($inp), @XMM[4]
1455 movdqu 0x50($inp), @XMM[5]
1456 mov %rsp, %rax # pass key schedule
1457 movdqu 0x60($inp), @XMM[6]
1458 mov %ebx,%r10d # pass rounds
1459 movdqu 0x70($inp), @XMM[7]
1460 lea 0x80($inp), $inp
1462 call _bsaes_decrypt8
1464 movdqu @XMM[0], 0x00($out) # write output
1465 movdqu @XMM[1], 0x10($out)
1466 movdqu @XMM[6], 0x20($out)
1467 movdqu @XMM[4], 0x30($out)
1468 movdqu @XMM[2], 0x40($out)
1469 movdqu @XMM[7], 0x50($out)
1470 movdqu @XMM[3], 0x60($out)
1471 movdqu @XMM[5], 0x70($out)
1472 lea 0x80($out), $out
1479 movdqu 0x00($inp), @XMM[0] # load input
1480 mov %rsp, %rax # pass key schedule
1481 mov %ebx,%r10d # pass rounds
1484 movdqu 0x10($inp), @XMM[1]
1486 movdqu 0x20($inp), @XMM[2]
1489 movdqu 0x30($inp), @XMM[3]
1491 movdqu 0x40($inp), @XMM[4]
1494 movdqu 0x50($inp), @XMM[5]
1496 movdqu 0x60($inp), @XMM[6]
1497 call _bsaes_decrypt8
1498 movdqu @XMM[0], 0x00($out) # write output
1499 movdqu @XMM[1], 0x10($out)
1500 movdqu @XMM[6], 0x20($out)
1501 movdqu @XMM[4], 0x30($out)
1502 movdqu @XMM[2], 0x40($out)
1503 movdqu @XMM[7], 0x50($out)
1504 movdqu @XMM[3], 0x60($out)
1508 call _bsaes_decrypt8
1509 movdqu @XMM[0], 0x00($out) # write output
1510 movdqu @XMM[1], 0x10($out)
1511 movdqu @XMM[6], 0x20($out)
1512 movdqu @XMM[4], 0x30($out)
1513 movdqu @XMM[2], 0x40($out)
1514 movdqu @XMM[7], 0x50($out)
1518 call _bsaes_decrypt8
1519 movdqu @XMM[0], 0x00($out) # write output
1520 movdqu @XMM[1], 0x10($out)
1521 movdqu @XMM[6], 0x20($out)
1522 movdqu @XMM[4], 0x30($out)
1523 movdqu @XMM[2], 0x40($out)
1527 call _bsaes_decrypt8
1528 movdqu @XMM[0], 0x00($out) # write output
1529 movdqu @XMM[1], 0x10($out)
1530 movdqu @XMM[6], 0x20($out)
1531 movdqu @XMM[4], 0x30($out)
1535 call _bsaes_decrypt8
1536 movdqu @XMM[0], 0x00($out) # write output
1537 movdqu @XMM[1], 0x10($out)
1538 movdqu @XMM[6], 0x20($out)
1542 call _bsaes_decrypt8
1543 movdqu @XMM[0], 0x00($out) # write output
1544 movdqu @XMM[1], 0x10($out)
1548 call _bsaes_decrypt8
1549 movdqu @XMM[0], 0x00($out) # write output
1556 call asm_AES_decrypt
1565 .Lecb_dec_bzero: # wipe key schedule [if any]
1566 movdqa %xmm0, 0x00(%rax)
1567 movdqa %xmm0, 0x10(%rax)
1568 lea 0x20(%rax), %rax
1575 $code.=<<___ if ($win64);
1576 movaps 0x40(%rbp), %xmm6
1577 movaps 0x50(%rbp), %xmm7
1578 movaps 0x60(%rbp), %xmm8
1579 movaps 0x70(%rbp), %xmm9
1580 movaps 0x80(%rbp), %xmm10
1581 movaps 0x90(%rbp), %xmm11
1582 movaps 0xa0(%rbp), %xmm12
1583 movaps 0xb0(%rbp), %xmm13
1584 movaps 0xc0(%rbp), %xmm14
1585 movaps 0xd0(%rbp), %xmm15
1586 lea 0xa0(%rax), %rax
1602 lea (%rax), %rsp # restore %rsp
1603 .cfi_def_cfa_register %rsp
1607 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1611 .extern asm_AES_cbc_encrypt
1612 .globl bsaes_cbc_encrypt
1613 .type bsaes_cbc_encrypt,\@abi-omnipotent
1618 $code.=<<___ if ($win64);
1619 mov 48(%rsp),$arg6 # pull direction flag
1623 jne asm_AES_cbc_encrypt
1625 jb asm_AES_cbc_encrypt
1641 lea -0x48(%rsp), %rsp
1642 .cfi_adjust_cfa_offset 0x48
1644 $code.=<<___ if ($win64);
1645 mov 0xa0(%rsp),$arg5 # pull ivp
1646 lea -0xa0(%rsp), %rsp
1647 movaps %xmm6, 0x40(%rsp)
1648 movaps %xmm7, 0x50(%rsp)
1649 movaps %xmm8, 0x60(%rsp)
1650 movaps %xmm9, 0x70(%rsp)
1651 movaps %xmm10, 0x80(%rsp)
1652 movaps %xmm11, 0x90(%rsp)
1653 movaps %xmm12, 0xa0(%rsp)
1654 movaps %xmm13, 0xb0(%rsp)
1655 movaps %xmm14, 0xc0(%rsp)
1656 movaps %xmm15, 0xd0(%rsp)
1660 mov %rsp, %rbp # backup %rsp
1661 .cfi_def_cfa_register %rbp
1662 mov 240($arg4), %eax # rounds
1663 mov $arg1, $inp # backup arguments
1668 shr \$4, $len # bytes to blocks
1670 mov %eax, %edx # rounds
1671 shl \$7, %rax # 128 bytes per inner round key
1672 sub \$`128-32`, %rax # size of bit-sliced key schedule
1675 mov %rsp, %rax # pass key schedule
1676 mov $key, %rcx # pass key
1677 mov %edx, %r10d # pass rounds
1678 call _bsaes_key_convert
1679 pxor (%rsp),%xmm7 # fix up round 0 key
1680 movdqa %xmm6,(%rax) # save last round key
1683 movdqu (%rbx), @XMM[15] # load IV
1686 movdqu 0x00($inp), @XMM[0] # load input
1687 movdqu 0x10($inp), @XMM[1]
1688 movdqu 0x20($inp), @XMM[2]
1689 movdqu 0x30($inp), @XMM[3]
1690 movdqu 0x40($inp), @XMM[4]
1691 movdqu 0x50($inp), @XMM[5]
1692 mov %rsp, %rax # pass key schedule
1693 movdqu 0x60($inp), @XMM[6]
1694 mov %edx,%r10d # pass rounds
1695 movdqu 0x70($inp), @XMM[7]
1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1700 pxor 0x20(%rbp), @XMM[0] # ^= IV
1701 movdqu 0x00($inp), @XMM[8] # re-load input
1702 movdqu 0x10($inp), @XMM[9]
1703 pxor @XMM[8], @XMM[1]
1704 movdqu 0x20($inp), @XMM[10]
1705 pxor @XMM[9], @XMM[6]
1706 movdqu 0x30($inp), @XMM[11]
1707 pxor @XMM[10], @XMM[4]
1708 movdqu 0x40($inp), @XMM[12]
1709 pxor @XMM[11], @XMM[2]
1710 movdqu 0x50($inp), @XMM[13]
1711 pxor @XMM[12], @XMM[7]
1712 movdqu 0x60($inp), @XMM[14]
1713 pxor @XMM[13], @XMM[3]
1714 movdqu 0x70($inp), @XMM[15] # IV
1715 pxor @XMM[14], @XMM[5]
1716 movdqu @XMM[0], 0x00($out) # write output
1717 lea 0x80($inp), $inp
1718 movdqu @XMM[1], 0x10($out)
1719 movdqu @XMM[6], 0x20($out)
1720 movdqu @XMM[4], 0x30($out)
1721 movdqu @XMM[2], 0x40($out)
1722 movdqu @XMM[7], 0x50($out)
1723 movdqu @XMM[3], 0x60($out)
1724 movdqu @XMM[5], 0x70($out)
1725 lea 0x80($out), $out
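#
# Note the xor pattern above: _bsaes_decrypt8 returns blocks in the
# register order [0,1,6,4,2,7,3,5], so each result is xored with the
# ciphertext of the *preceding* input block, while the last input block
# (@XMM[15]) is carried over as the IV for the next iteration.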
1732 movdqu 0x00($inp), @XMM[0] # load input
1733 mov %rsp, %rax # pass key schedule
1734 mov %edx, %r10d # pass rounds
1737 movdqu 0x10($inp), @XMM[1]
1739 movdqu 0x20($inp), @XMM[2]
1742 movdqu 0x30($inp), @XMM[3]
1744 movdqu 0x40($inp), @XMM[4]
1747 movdqu 0x50($inp), @XMM[5]
1749 movdqu 0x60($inp), @XMM[6]
1750 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1751 call _bsaes_decrypt8
1752 pxor 0x20(%rbp), @XMM[0] # ^= IV
1753 movdqu 0x00($inp), @XMM[8] # re-load input
1754 movdqu 0x10($inp), @XMM[9]
1755 pxor @XMM[8], @XMM[1]
1756 movdqu 0x20($inp), @XMM[10]
1757 pxor @XMM[9], @XMM[6]
1758 movdqu 0x30($inp), @XMM[11]
1759 pxor @XMM[10], @XMM[4]
1760 movdqu 0x40($inp), @XMM[12]
1761 pxor @XMM[11], @XMM[2]
1762 movdqu 0x50($inp), @XMM[13]
1763 pxor @XMM[12], @XMM[7]
1764 movdqu 0x60($inp), @XMM[15] # IV
1765 pxor @XMM[13], @XMM[3]
1766 movdqu @XMM[0], 0x00($out) # write output
1767 movdqu @XMM[1], 0x10($out)
1768 movdqu @XMM[6], 0x20($out)
1769 movdqu @XMM[4], 0x30($out)
1770 movdqu @XMM[2], 0x40($out)
1771 movdqu @XMM[7], 0x50($out)
1772 movdqu @XMM[3], 0x60($out)
1776 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1777 call _bsaes_decrypt8
1778 pxor 0x20(%rbp), @XMM[0] # ^= IV
1779 movdqu 0x00($inp), @XMM[8] # re-load input
1780 movdqu 0x10($inp), @XMM[9]
1781 pxor @XMM[8], @XMM[1]
1782 movdqu 0x20($inp), @XMM[10]
1783 pxor @XMM[9], @XMM[6]
1784 movdqu 0x30($inp), @XMM[11]
1785 pxor @XMM[10], @XMM[4]
1786 movdqu 0x40($inp), @XMM[12]
1787 pxor @XMM[11], @XMM[2]
1788 movdqu 0x50($inp), @XMM[15] # IV
1789 pxor @XMM[12], @XMM[7]
1790 movdqu @XMM[0], 0x00($out) # write output
1791 movdqu @XMM[1], 0x10($out)
1792 movdqu @XMM[6], 0x20($out)
1793 movdqu @XMM[4], 0x30($out)
1794 movdqu @XMM[2], 0x40($out)
1795 movdqu @XMM[7], 0x50($out)
1799 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1800 call _bsaes_decrypt8
1801 pxor 0x20(%rbp), @XMM[0] # ^= IV
1802 movdqu 0x00($inp), @XMM[8] # re-load input
1803 movdqu 0x10($inp), @XMM[9]
1804 pxor @XMM[8], @XMM[1]
1805 movdqu 0x20($inp), @XMM[10]
1806 pxor @XMM[9], @XMM[6]
1807 movdqu 0x30($inp), @XMM[11]
1808 pxor @XMM[10], @XMM[4]
1809 movdqu 0x40($inp), @XMM[15] # IV
1810 pxor @XMM[11], @XMM[2]
1811 movdqu @XMM[0], 0x00($out) # write output
1812 movdqu @XMM[1], 0x10($out)
1813 movdqu @XMM[6], 0x20($out)
1814 movdqu @XMM[4], 0x30($out)
1815 movdqu @XMM[2], 0x40($out)
1819 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1820 call _bsaes_decrypt8
1821 pxor 0x20(%rbp), @XMM[0] # ^= IV
1822 movdqu 0x00($inp), @XMM[8] # re-load input
1823 movdqu 0x10($inp), @XMM[9]
1824 pxor @XMM[8], @XMM[1]
1825 movdqu 0x20($inp), @XMM[10]
1826 pxor @XMM[9], @XMM[6]
1827 movdqu 0x30($inp), @XMM[15] # IV
1828 pxor @XMM[10], @XMM[4]
1829 movdqu @XMM[0], 0x00($out) # write output
1830 movdqu @XMM[1], 0x10($out)
1831 movdqu @XMM[6], 0x20($out)
1832 movdqu @XMM[4], 0x30($out)
1836 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1837 call _bsaes_decrypt8
1838 pxor 0x20(%rbp), @XMM[0] # ^= IV
1839 movdqu 0x00($inp), @XMM[8] # re-load input
1840 movdqu 0x10($inp), @XMM[9]
1841 pxor @XMM[8], @XMM[1]
1842 movdqu 0x20($inp), @XMM[15] # IV
1843 pxor @XMM[9], @XMM[6]
1844 movdqu @XMM[0], 0x00($out) # write output
1845 movdqu @XMM[1], 0x10($out)
1846 movdqu @XMM[6], 0x20($out)
1850 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1851 call _bsaes_decrypt8
1852 pxor 0x20(%rbp), @XMM[0] # ^= IV
1853 movdqu 0x00($inp), @XMM[8] # re-load input
1854 movdqu 0x10($inp), @XMM[15] # IV
1855 pxor @XMM[8], @XMM[1]
1856 movdqu @XMM[0], 0x00($out) # write output
1857 movdqu @XMM[1], 0x10($out)
1862 lea 0x20(%rbp), $arg2 # buffer output
1864 call asm_AES_decrypt # doesn't touch %xmm
1865 pxor 0x20(%rbp), @XMM[15] # ^= IV
1866 movdqu @XMM[15], ($out) # write output
1867 movdqa @XMM[0], @XMM[15] # IV
1870 movdqu @XMM[15], (%rbx) # return IV
1873 .Lcbc_dec_bzero: # wipe key schedule [if any]
1874 movdqa %xmm0, 0x00(%rax)
1875 movdqa %xmm0, 0x10(%rax)
1876 lea 0x20(%rax), %rax
1883 $code.=<<___ if ($win64);
1884 movaps 0x40(%rbp), %xmm6
1885 movaps 0x50(%rbp), %xmm7
1886 movaps 0x60(%rbp), %xmm8
1887 movaps 0x70(%rbp), %xmm9
1888 movaps 0x80(%rbp), %xmm10
1889 movaps 0x90(%rbp), %xmm11
1890 movaps 0xa0(%rbp), %xmm12
1891 movaps 0xb0(%rbp), %xmm13
1892 movaps 0xc0(%rbp), %xmm14
1893 movaps 0xd0(%rbp), %xmm15
1894 lea 0xa0(%rax), %rax
1910 lea (%rax), %rsp # restore %rsp
1911 .cfi_def_cfa_register %rsp
1915 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1917 .globl bsaes_ctr32_encrypt_blocks
1918 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1920 bsaes_ctr32_encrypt_blocks:
1936 lea -0x48(%rsp), %rsp
1937 .cfi_adjust_cfa_offset 0x48
1939 $code.=<<___ if ($win64);
1940 mov 0xa0(%rsp),$arg5 # pull ivp
1941 lea -0xa0(%rsp), %rsp
1942 movaps %xmm6, 0x40(%rsp)
1943 movaps %xmm7, 0x50(%rsp)
1944 movaps %xmm8, 0x60(%rsp)
1945 movaps %xmm9, 0x70(%rsp)
1946 movaps %xmm10, 0x80(%rsp)
1947 movaps %xmm11, 0x90(%rsp)
1948 movaps %xmm12, 0xa0(%rsp)
1949 movaps %xmm13, 0xb0(%rsp)
1950 movaps %xmm14, 0xc0(%rsp)
1951 movaps %xmm15, 0xd0(%rsp)
1955 mov %rsp, %rbp # backup %rsp
1956 .cfi_def_cfa_register %rbp
1957 movdqu ($arg5), %xmm0 # load counter
1958 mov 240($arg4), %eax # rounds
1959 mov $arg1, $inp # backup arguments
1963 movdqa %xmm0, 0x20(%rbp) # copy counter
1967 mov %eax, %ebx # rounds
1968 shl \$7, %rax # 128 bytes per inner round key
1969 sub \$`128-32`, %rax # size of bit-sliced key schedule
1972 mov %rsp, %rax # pass key schedule
1973 mov $key, %rcx # pass key
1974 mov %ebx, %r10d # pass rounds
1975 call _bsaes_key_convert
1976 pxor %xmm6,%xmm7 # fix up last round key
1977 movdqa %xmm7,(%rax) # save last round key
1979 movdqa (%rsp), @XMM[9] # load round0 key
1980 lea .LADD1(%rip), %r11
1981 movdqa 0x20(%rbp), @XMM[0] # counter copy
1982 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1983 pshufb @XMM[8], @XMM[9] # byte swap upper part
1984 pshufb @XMM[8], @XMM[0]
1985 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1989 movdqa @XMM[0], 0x20(%rbp) # save counter
1990 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1991 movdqa @XMM[0], @XMM[2]
1992 paddd 0x00(%r11), @XMM[1] # .LADD1
1993 movdqa @XMM[0], @XMM[3]
1994 paddd 0x10(%r11), @XMM[2] # .LADD2
1995 movdqa @XMM[0], @XMM[4]
1996 paddd 0x20(%r11), @XMM[3] # .LADD3
1997 movdqa @XMM[0], @XMM[5]
1998 paddd 0x30(%r11), @XMM[4] # .LADD4
1999 movdqa @XMM[0], @XMM[6]
2000 paddd 0x40(%r11), @XMM[5] # .LADD5
2001 movdqa @XMM[0], @XMM[7]
2002 paddd 0x50(%r11), @XMM[6] # .LADD6
2003 paddd 0x60(%r11), @XMM[7] # .LADD7
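#
# The 32-bit counter occupies the last dword of the IV in big-endian
# form, so it is byte-swapped once (.LSWPUP above, applied to the
# round-0 key as well so the whitening xor still lines up) to make plain
# paddd usable; the .LADD1..LADD7 additions above then yield the next
# seven counter values, and .LSWPUPM0SR (below) swaps the bytes back
# while folding in the M0SR input permutation of _bsaes_encrypt8.
# Scalar sketch of the eight values (plain Perl, illustrative only):
#
#	my $ctr  = 0x01020304;				# hypothetical counter
#	my @ctrs = map { ($ctr + $_) & 0xffffffff } 0..7;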
2005 # Borrow the prologue from _bsaes_encrypt8 to take the opportunity
2006 # to flip the byte order of the 32-bit counter
2007 movdqa (%rsp), @XMM[9] # round 0 key
2008 lea 0x10(%rsp), %rax # pass key schedule
2009 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
2010 pxor @XMM[9], @XMM[0] # xor with round0 key
2011 pxor @XMM[9], @XMM[1]
2012 pxor @XMM[9], @XMM[2]
2013 pxor @XMM[9], @XMM[3]
2014 pshufb @XMM[8], @XMM[0]
2015 pshufb @XMM[8], @XMM[1]
2016 pxor @XMM[9], @XMM[4]
2017 pxor @XMM[9], @XMM[5]
2018 pshufb @XMM[8], @XMM[2]
2019 pshufb @XMM[8], @XMM[3]
2020 pxor @XMM[9], @XMM[6]
2021 pxor @XMM[9], @XMM[7]
2022 pshufb @XMM[8], @XMM[4]
2023 pshufb @XMM[8], @XMM[5]
2024 pshufb @XMM[8], @XMM[6]
2025 pshufb @XMM[8], @XMM[7]
2026 lea .LBS0(%rip), %r11 # constants table
2027 mov %ebx,%r10d # pass rounds
2029 call _bsaes_encrypt8_bitslice
2032 jc .Lctr_enc_loop_done
2034 movdqu 0x00($inp), @XMM[8] # load input
2035 movdqu 0x10($inp), @XMM[9]
2036 movdqu 0x20($inp), @XMM[10]
2037 movdqu 0x30($inp), @XMM[11]
2038 movdqu 0x40($inp), @XMM[12]
2039 movdqu 0x50($inp), @XMM[13]
2040 movdqu 0x60($inp), @XMM[14]
2041 movdqu 0x70($inp), @XMM[15]
2043 pxor @XMM[0], @XMM[8]
2044 movdqa 0x20(%rbp), @XMM[0] # load counter
2045 pxor @XMM[9], @XMM[1]
2046 movdqu @XMM[8], 0x00($out) # write output
2047 pxor @XMM[10], @XMM[4]
2048 movdqu @XMM[1], 0x10($out)
2049 pxor @XMM[11], @XMM[6]
2050 movdqu @XMM[4], 0x20($out)
2051 pxor @XMM[12], @XMM[3]
2052 movdqu @XMM[6], 0x30($out)
2053 pxor @XMM[13], @XMM[7]
2054 movdqu @XMM[3], 0x40($out)
2055 pxor @XMM[14], @XMM[2]
2056 movdqu @XMM[7], 0x50($out)
2057 pxor @XMM[15], @XMM[5]
2058 movdqu @XMM[2], 0x60($out)
2059 lea .LADD1(%rip), %r11
2060 movdqu @XMM[5], 0x70($out)
2061 lea 0x80($out), $out
2062 paddd 0x70(%r11), @XMM[0] # .LADD8
2067 .Lctr_enc_loop_done:
2069 movdqu 0x00($inp), @XMM[8] # load input
2070 pxor @XMM[8], @XMM[0]
2071 movdqu @XMM[0], 0x00($out) # write output
2074 movdqu 0x10($inp), @XMM[9]
2075 pxor @XMM[9], @XMM[1]
2076 movdqu @XMM[1], 0x10($out)
2078 movdqu 0x20($inp), @XMM[10]
2079 pxor @XMM[10], @XMM[4]
2080 movdqu @XMM[4], 0x20($out)
2083 movdqu 0x30($inp), @XMM[11]
2084 pxor @XMM[11], @XMM[6]
2085 movdqu @XMM[6], 0x30($out)
2087 movdqu 0x40($inp), @XMM[12]
2088 pxor @XMM[12], @XMM[3]
2089 movdqu @XMM[3], 0x40($out)
2092 movdqu 0x50($inp), @XMM[13]
2093 pxor @XMM[13], @XMM[7]
2094 movdqu @XMM[7], 0x50($out)
2096 movdqu 0x60($inp), @XMM[14]
2097 pxor @XMM[14], @XMM[2]
2098 movdqu @XMM[2], 0x60($out)
2103 lea 0x20(%rbp), $arg1
2104 lea 0x30(%rbp), $arg2
2106 call asm_AES_encrypt
2107 movdqu ($inp), @XMM[1]
2109 mov 0x2c(%rbp), %eax # load 32-bit counter
2111 pxor 0x30(%rbp), @XMM[1]
2112 inc %eax # increment
2113 movdqu @XMM[1], ($out)
2116 mov %eax, 0x2c(%rbp) # save 32-bit counter
2123 .Lctr_enc_bzero: # wipe key schedule [if any]
2124 movdqa %xmm0, 0x00(%rax)
2125 movdqa %xmm0, 0x10(%rax)
2126 lea 0x20(%rax), %rax
2133 $code.=<<___ if ($win64);
2134 movaps 0x40(%rbp), %xmm6
2135 movaps 0x50(%rbp), %xmm7
2136 movaps 0x60(%rbp), %xmm8
2137 movaps 0x70(%rbp), %xmm9
2138 movaps 0x80(%rbp), %xmm10
2139 movaps 0x90(%rbp), %xmm11
2140 movaps 0xa0(%rbp), %xmm12
2141 movaps 0xb0(%rbp), %xmm13
2142 movaps 0xc0(%rbp), %xmm14
2143 movaps 0xd0(%rbp), %xmm15
2144 lea 0xa0(%rax), %rax
2160 lea (%rax), %rsp # restore %rsp
2161 .cfi_def_cfa_register %rsp
2165 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2167 ######################################################################
2168 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2169 # const AES_KEY *key1, const AES_KEY *key2,
2170 # const unsigned char iv[16]);
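#
# XTS consumes one tweak per 16-byte block; consecutive tweaks are
# obtained by multiplying by x in GF(2^128) modulo x^128+x^7+x^2+x+1.
# The pcmpgtd/pshufd/pand sequences below are a branch-free SSE version
# of the usual doubling; scalar sketch over two 64-bit halves (plain
# Perl, reference only, assuming a 64-bit perl):
#
#	sub xts_tweak_double {
#		my ($lo,$hi) = @_;		# 128-bit tweak, two quads
#		my $carry = $hi >> 63;
#		$hi = (($hi & 0x7fffffffffffffff) << 1) | ($lo >> 63);
#		$lo = (($lo & 0x7fffffffffffffff) << 1) ^ ($carry ? 0x87 : 0);
#		return ($lo,$hi);
#	}
#
# 0x87 is x^7+x^2+x+1, the residue folded back in when the top bit
# carries out.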
2172 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2176 .globl bsaes_xts_encrypt
2177 .type bsaes_xts_encrypt,\@abi-omnipotent
2195 lea -0x48(%rsp), %rsp
2196 .cfi_adjust_cfa_offset 0x48
2198 $code.=<<___ if ($win64);
2199 mov 0xa0(%rsp),$arg5 # pull key2
2200 mov 0xa8(%rsp),$arg6 # pull ivp
2201 lea -0xa0(%rsp), %rsp
2202 movaps %xmm6, 0x40(%rsp)
2203 movaps %xmm7, 0x50(%rsp)
2204 movaps %xmm8, 0x60(%rsp)
2205 movaps %xmm9, 0x70(%rsp)
2206 movaps %xmm10, 0x80(%rsp)
2207 movaps %xmm11, 0x90(%rsp)
2208 movaps %xmm12, 0xa0(%rsp)
2209 movaps %xmm13, 0xb0(%rsp)
2210 movaps %xmm14, 0xc0(%rsp)
2211 movaps %xmm15, 0xd0(%rsp)
2215 mov %rsp, %rbp # backup %rsp
2216 .cfi_def_cfa_register %rbp
2217 mov $arg1, $inp # backup arguments
2223 lea 0x20(%rbp), $arg2
2225 call asm_AES_encrypt # generate initial tweak
2227 mov 240($key), %eax # rounds
2228 mov $len, %rbx # backup $len
2230 mov %eax, %edx # rounds
2231 shl \$7, %rax # 128 bytes per inner round key
2232 sub \$`128-32`, %rax # size of bit-sliced key schedule
2235 mov %rsp, %rax # pass key schedule
2236 mov $key, %rcx # pass key
2237 mov %edx, %r10d # pass rounds
2238 call _bsaes_key_convert
2239 pxor %xmm6, %xmm7 # fix up last round key
2240 movdqa %xmm7, (%rax) # save last round key
2243 sub \$0x80, %rsp # place for tweak[8]
2244 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2247 movdqa .Lxts_magic(%rip), $twmask
2248 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2257 for ($i=0;$i<7;$i++) {
2259 pshufd \$0x13, $twtmp, $twres
2261 movdqa @XMM[7], @XMM[$i]
2262 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2263 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2264 pand $twmask, $twres # isolate carry and residue
2265 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2266 pxor $twres, @XMM[7]
2268 $code.=<<___ if ($i>=1);
2269 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2271 $code.=<<___ if ($i>=2);
2272 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2276 movdqu 0x60($inp), @XMM[8+6]
2277 pxor @XMM[8+5], @XMM[5]
2278 movdqu 0x70($inp), @XMM[8+7]
2279 lea 0x80($inp), $inp
2280 movdqa @XMM[7], 0x70(%rsp)
2281 pxor @XMM[8+6], @XMM[6]
2282 lea 0x80(%rsp), %rax # pass key schedule
2283 pxor @XMM[8+7], @XMM[7]
2284 mov %edx, %r10d # pass rounds
2286 call _bsaes_encrypt8
2288 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2289 pxor 0x10(%rsp), @XMM[1]
2290 movdqu @XMM[0], 0x00($out) # write output
2291 pxor 0x20(%rsp), @XMM[4]
2292 movdqu @XMM[1], 0x10($out)
2293 pxor 0x30(%rsp), @XMM[6]
2294 movdqu @XMM[4], 0x20($out)
2295 pxor 0x40(%rsp), @XMM[3]
2296 movdqu @XMM[6], 0x30($out)
2297 pxor 0x50(%rsp), @XMM[7]
2298 movdqu @XMM[3], 0x40($out)
2299 pxor 0x60(%rsp), @XMM[2]
2300 movdqu @XMM[7], 0x50($out)
2301 pxor 0x70(%rsp), @XMM[5]
2302 movdqu @XMM[2], 0x60($out)
2303 movdqu @XMM[5], 0x70($out)
2304 lea 0x80($out), $out
2306 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2308 movdqa .Lxts_magic(%rip), $twmask
2309 pcmpgtd @XMM[7], $twtmp
2310 pshufd \$0x13, $twtmp, $twres
2312 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2313 pand $twmask, $twres # isolate carry and residue
2314 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2315 pxor $twres, @XMM[7]
2324 for ($i=0;$i<7;$i++) {
2326 pshufd \$0x13, $twtmp, $twres
2328 movdqa @XMM[7], @XMM[$i]
2329 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2330 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2331 pand $twmask, $twres # isolate carry and residue
2332 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2333 pxor $twres, @XMM[7]
2335 $code.=<<___ if ($i>=1);
2336 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2337 cmp \$`0x10*$i`,$len
2340 $code.=<<___ if ($i>=2);
2341 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2345 movdqu 0x60($inp), @XMM[8+6]
2346 pxor @XMM[8+5], @XMM[5]
2347 movdqa @XMM[7], 0x70(%rsp)
2348 lea 0x70($inp), $inp
2349 pxor @XMM[8+6], @XMM[6]
2350 lea 0x80(%rsp), %rax # pass key schedule
2351 mov %edx, %r10d # pass rounds
2353 call _bsaes_encrypt8
2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2356 pxor 0x10(%rsp), @XMM[1]
2357 movdqu @XMM[0], 0x00($out) # write output
2358 pxor 0x20(%rsp), @XMM[4]
2359 movdqu @XMM[1], 0x10($out)
2360 pxor 0x30(%rsp), @XMM[6]
2361 movdqu @XMM[4], 0x20($out)
2362 pxor 0x40(%rsp), @XMM[3]
2363 movdqu @XMM[6], 0x30($out)
2364 pxor 0x50(%rsp), @XMM[7]
2365 movdqu @XMM[3], 0x40($out)
2366 pxor 0x60(%rsp), @XMM[2]
2367 movdqu @XMM[7], 0x50($out)
2368 movdqu @XMM[2], 0x60($out)
2369 lea 0x70($out), $out
2371 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2375 pxor @XMM[8+4], @XMM[4]
2376 lea 0x60($inp), $inp
2377 pxor @XMM[8+5], @XMM[5]
2378 lea 0x80(%rsp), %rax # pass key schedule
2379 mov %edx, %r10d # pass rounds
2381 call _bsaes_encrypt8
2383 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2384 pxor 0x10(%rsp), @XMM[1]
2385 movdqu @XMM[0], 0x00($out) # write output
2386 pxor 0x20(%rsp), @XMM[4]
2387 movdqu @XMM[1], 0x10($out)
2388 pxor 0x30(%rsp), @XMM[6]
2389 movdqu @XMM[4], 0x20($out)
2390 pxor 0x40(%rsp), @XMM[3]
2391 movdqu @XMM[6], 0x30($out)
2392 pxor 0x50(%rsp), @XMM[7]
2393 movdqu @XMM[3], 0x40($out)
2394 movdqu @XMM[7], 0x50($out)
2395 lea 0x60($out), $out
2397 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2401 pxor @XMM[8+3], @XMM[3]
2402 lea 0x50($inp), $inp
2403 pxor @XMM[8+4], @XMM[4]
2404 lea 0x80(%rsp), %rax # pass key schedule
2405 mov %edx, %r10d # pass rounds
2407 call _bsaes_encrypt8
2409 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2410 pxor 0x10(%rsp), @XMM[1]
2411 movdqu @XMM[0], 0x00($out) # write output
2412 pxor 0x20(%rsp), @XMM[4]
2413 movdqu @XMM[1], 0x10($out)
2414 pxor 0x30(%rsp), @XMM[6]
2415 movdqu @XMM[4], 0x20($out)
2416 pxor 0x40(%rsp), @XMM[3]
2417 movdqu @XMM[6], 0x30($out)
2418 movdqu @XMM[3], 0x40($out)
2419 lea 0x50($out), $out
2421 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2425 pxor @XMM[8+2], @XMM[2]
2426 lea 0x40($inp), $inp
2427 pxor @XMM[8+3], @XMM[3]
2428 lea 0x80(%rsp), %rax # pass key schedule
2429 mov %edx, %r10d # pass rounds
2431 call _bsaes_encrypt8
2433 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2434 pxor 0x10(%rsp), @XMM[1]
2435 movdqu @XMM[0], 0x00($out) # write output
2436 pxor 0x20(%rsp), @XMM[4]
2437 movdqu @XMM[1], 0x10($out)
2438 pxor 0x30(%rsp), @XMM[6]
2439 movdqu @XMM[4], 0x20($out)
2440 movdqu @XMM[6], 0x30($out)
2441 lea 0x40($out), $out
2443 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2447 pxor @XMM[8+1], @XMM[1]
2448 lea 0x30($inp), $inp
2449 pxor @XMM[8+2], @XMM[2]
2450 lea 0x80(%rsp), %rax # pass key schedule
2451 mov %edx, %r10d # pass rounds
2453 call _bsaes_encrypt8
2455 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2456 pxor 0x10(%rsp), @XMM[1]
2457 movdqu @XMM[0], 0x00($out) # write output
2458 pxor 0x20(%rsp), @XMM[4]
2459 movdqu @XMM[1], 0x10($out)
2460 movdqu @XMM[4], 0x20($out)
2461 lea 0x30($out), $out
2463 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2467 pxor @XMM[8+0], @XMM[0]
2468 lea 0x20($inp), $inp
2469 pxor @XMM[8+1], @XMM[1]
2470 lea 0x80(%rsp), %rax # pass key schedule
2471 mov %edx, %r10d # pass rounds
2473 call _bsaes_encrypt8
2475 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2476 pxor 0x10(%rsp), @XMM[1]
2477 movdqu @XMM[0], 0x00($out) # write output
2478 movdqu @XMM[1], 0x10($out)
2479 lea 0x20($out), $out
2481 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2485 pxor @XMM[0], @XMM[8]
2486 lea 0x10($inp), $inp
2487 movdqa @XMM[8], 0x20(%rbp)
2488 lea 0x20(%rbp), $arg1
2489 lea 0x20(%rbp), $arg2
2491 call asm_AES_encrypt # doesn't touch %xmm
2492 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2493 #pxor @XMM[8], @XMM[0]
2494 #lea 0x80(%rsp), %rax # pass key schedule
2495 #mov %edx, %r10d # pass rounds
2496 #call _bsaes_encrypt8
2497 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2498 movdqu @XMM[0], 0x00($out) # write output
2499 lea 0x10($out), $out
2501 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2510 movzb -16(%rdx), %ecx
2518 movdqu -16($out), @XMM[0]
2519 lea 0x20(%rbp), $arg1
2520 pxor @XMM[7], @XMM[0]
2521 lea 0x20(%rbp), $arg2
2522 movdqa @XMM[0], 0x20(%rbp)
2524 call asm_AES_encrypt # doesn't touch %xmm
2525 pxor 0x20(%rbp), @XMM[7]
2526 movdqu @XMM[7], -16($out)
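#
# This tail is standard XTS ciphertext stealing: the byte loop above
# swaps the <16-byte plaintext tail with the trailing bytes of the last
# full ciphertext block, and that block is then re-encrypted with the
# final tweak, so the ciphertext is exactly as long as the plaintext.
# Data movement sketch (plain Perl, hypothetical helper, reference only):
#
#	sub cts_steal_ref {
#		my ($last_ct, $tail_pt) = @_;	# 16-byte block, short tail
#		my $n = length $tail_pt;
#		my $partial = substr($last_ct, 0, $n);	     # emitted last
#		my $block = $tail_pt . substr($last_ct, $n); # re-encrypted
#		return ($block, $partial);
#	}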
2531 .Lxts_enc_bzero: # wipe key schedule [if any]
2532 movdqa %xmm0, 0x00(%rax)
2533 movdqa %xmm0, 0x10(%rax)
2534 lea 0x20(%rax), %rax
2541 $code.=<<___ if ($win64);
2542 movaps 0x40(%rbp), %xmm6
2543 movaps 0x50(%rbp), %xmm7
2544 movaps 0x60(%rbp), %xmm8
2545 movaps 0x70(%rbp), %xmm9
2546 movaps 0x80(%rbp), %xmm10
2547 movaps 0x90(%rbp), %xmm11
2548 movaps 0xa0(%rbp), %xmm12
2549 movaps 0xb0(%rbp), %xmm13
2550 movaps 0xc0(%rbp), %xmm14
2551 movaps 0xd0(%rbp), %xmm15
2552 lea 0xa0(%rax), %rax
2568 lea (%rax), %rsp # restore %rsp
2569 .cfi_def_cfa_register %rsp
2573 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2575 .globl bsaes_xts_decrypt
2576 .type bsaes_xts_decrypt,\@abi-omnipotent
2594 lea -0x48(%rsp), %rsp
2595 .cfi_adjust_cfa_offset 0x48
2597 $code.=<<___ if ($win64);
2598 mov 0xa0(%rsp),$arg5 # pull key2
2599 mov 0xa8(%rsp),$arg6 # pull ivp
2600 lea -0xa0(%rsp), %rsp
2601 movaps %xmm6, 0x40(%rsp)
2602 movaps %xmm7, 0x50(%rsp)
2603 movaps %xmm8, 0x60(%rsp)
2604 movaps %xmm9, 0x70(%rsp)
2605 movaps %xmm10, 0x80(%rsp)
2606 movaps %xmm11, 0x90(%rsp)
2607 movaps %xmm12, 0xa0(%rsp)
2608 movaps %xmm13, 0xb0(%rsp)
2609 movaps %xmm14, 0xc0(%rsp)
2610 movaps %xmm15, 0xd0(%rsp)
2614 mov %rsp, %rbp # backup %rsp
2615 mov $arg1, $inp # backup arguments
2621 lea 0x20(%rbp), $arg2
2623 call asm_AES_encrypt # generate initial tweak
2625 mov 240($key), %eax # rounds
2626 mov $len, %rbx # backup $len
2628 mov %eax, %edx # rounds
2629 shl \$7, %rax # 128 bytes per inner round key
2630 sub \$`128-32`, %rax # size of bit-sliced key schedule
2633 mov %rsp, %rax # pass key schedule
2634 mov $key, %rcx # pass key
2635 mov %edx, %r10d # pass rounds
2636 call _bsaes_key_convert
2637 pxor (%rsp), %xmm7 # fix up round 0 key
2638 movdqa %xmm6, (%rax) # save last round key
2639 movdqa %xmm7, (%rsp)
2641 xor %eax, %eax # if ($len%16) len-=16;
2648 sub \$0x80, %rsp # place for tweak[8]
2649 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2652 movdqa .Lxts_magic(%rip), $twmask
2653 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2662 for ($i=0;$i<7;$i++) {
2664 pshufd \$0x13, $twtmp, $twres
2666 movdqa @XMM[7], @XMM[$i]
2667 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2668 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2669 pand $twmask, $twres # isolate carry and residue
2670 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2671 pxor $twres, @XMM[7]
2673 $code.=<<___ if ($i>=1);
2674 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2676 $code.=<<___ if ($i>=2);
2677 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2681 movdqu 0x60($inp), @XMM[8+6]
2682 pxor @XMM[8+5], @XMM[5]
2683 movdqu 0x70($inp), @XMM[8+7]
2684 lea 0x80($inp), $inp
2685 movdqa @XMM[7], 0x70(%rsp)
2686 pxor @XMM[8+6], @XMM[6]
2687 lea 0x80(%rsp), %rax # pass key schedule
2688 pxor @XMM[8+7], @XMM[7]
2689 mov %edx, %r10d # pass rounds
2691 call _bsaes_decrypt8
2693 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2694 pxor 0x10(%rsp), @XMM[1]
2695 movdqu @XMM[0], 0x00($out) # write output
2696 pxor 0x20(%rsp), @XMM[6]
2697 movdqu @XMM[1], 0x10($out)
2698 pxor 0x30(%rsp), @XMM[4]
2699 movdqu @XMM[6], 0x20($out)
2700 pxor 0x40(%rsp), @XMM[2]
2701 movdqu @XMM[4], 0x30($out)
2702 pxor 0x50(%rsp), @XMM[7]
2703 movdqu @XMM[2], 0x40($out)
2704 pxor 0x60(%rsp), @XMM[3]
2705 movdqu @XMM[7], 0x50($out)
2706 pxor 0x70(%rsp), @XMM[5]
2707 movdqu @XMM[3], 0x60($out)
2708 movdqu @XMM[5], 0x70($out)
2709 lea 0x80($out), $out
2711 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2713 movdqa .Lxts_magic(%rip), $twmask
2714 pcmpgtd @XMM[7], $twtmp
2715 pshufd \$0x13, $twtmp, $twres
2717 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2718 pand $twmask, $twres # isolate carry and residue
2719 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2720 pxor $twres, @XMM[7]
2729 for ($i=0;$i<7;$i++) {
2731 pshufd \$0x13, $twtmp, $twres
2733 movdqa @XMM[7], @XMM[$i]
2734 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2735 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2736 pand $twmask, $twres # isolate carry and residue
2737 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2738 pxor $twres, @XMM[7]
2740 $code.=<<___ if ($i>=1);
2741 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2742 cmp \$`0x10*$i`,$len
2745 $code.=<<___ if ($i>=2);
2746 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2750 movdqu 0x60($inp), @XMM[8+6]
2751 pxor @XMM[8+5], @XMM[5]
2752 movdqa @XMM[7], 0x70(%rsp)
2753 lea 0x70($inp), $inp
2754 pxor @XMM[8+6], @XMM[6]
2755 lea 0x80(%rsp), %rax # pass key schedule
2756 mov %edx, %r10d # pass rounds
2758 call _bsaes_decrypt8
2760 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2761 pxor 0x10(%rsp), @XMM[1]
2762 movdqu @XMM[0], 0x00($out) # write output
2763 pxor 0x20(%rsp), @XMM[6]
2764 movdqu @XMM[1], 0x10($out)
2765 pxor 0x30(%rsp), @XMM[4]
2766 movdqu @XMM[6], 0x20($out)
2767 pxor 0x40(%rsp), @XMM[2]
2768 movdqu @XMM[4], 0x30($out)
2769 pxor 0x50(%rsp), @XMM[7]
2770 movdqu @XMM[2], 0x40($out)
2771 pxor 0x60(%rsp), @XMM[3]
2772 movdqu @XMM[7], 0x50($out)
2773 movdqu @XMM[3], 0x60($out)
2774 lea 0x70($out), $out
2776 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2780 pxor @XMM[8+4], @XMM[4]
2781 lea 0x60($inp), $inp
2782 pxor @XMM[8+5], @XMM[5]
2783 lea 0x80(%rsp), %rax # pass key schedule
2784 mov %edx, %r10d # pass rounds
2786 call _bsaes_decrypt8
2788 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2789 pxor 0x10(%rsp), @XMM[1]
2790 movdqu @XMM[0], 0x00($out) # write output
2791 pxor 0x20(%rsp), @XMM[6]
2792 movdqu @XMM[1], 0x10($out)
2793 pxor 0x30(%rsp), @XMM[4]
2794 movdqu @XMM[6], 0x20($out)
2795 pxor 0x40(%rsp), @XMM[2]
2796 movdqu @XMM[4], 0x30($out)
2797 pxor 0x50(%rsp), @XMM[7]
2798 movdqu @XMM[2], 0x40($out)
2799 movdqu @XMM[7], 0x50($out)
2800 lea 0x60($out), $out
2802 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2806 pxor @XMM[8+3], @XMM[3]
2807 lea 0x50($inp), $inp
2808 pxor @XMM[8+4], @XMM[4]
2809 lea 0x80(%rsp), %rax # pass key schedule
2810 mov %edx, %r10d # pass rounds
2812 call _bsaes_decrypt8
2814 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2815 pxor 0x10(%rsp), @XMM[1]
2816 movdqu @XMM[0], 0x00($out) # write output
2817 pxor 0x20(%rsp), @XMM[6]
2818 movdqu @XMM[1], 0x10($out)
2819 pxor 0x30(%rsp), @XMM[4]
2820 movdqu @XMM[6], 0x20($out)
2821 pxor 0x40(%rsp), @XMM[2]
2822 movdqu @XMM[4], 0x30($out)
2823 movdqu @XMM[2], 0x40($out)
2824 lea 0x50($out), $out
2826 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2830 pxor @XMM[8+2], @XMM[2]
2831 lea 0x40($inp), $inp
2832 pxor @XMM[8+3], @XMM[3]
2833 lea 0x80(%rsp), %rax # pass key schedule
2834 mov %edx, %r10d # pass rounds
2836 call _bsaes_decrypt8
2838 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2839 pxor 0x10(%rsp), @XMM[1]
2840 movdqu @XMM[0], 0x00($out) # write output
2841 pxor 0x20(%rsp), @XMM[6]
2842 movdqu @XMM[1], 0x10($out)
2843 pxor 0x30(%rsp), @XMM[4]
2844 movdqu @XMM[6], 0x20($out)
2845 movdqu @XMM[4], 0x30($out)
2846 lea 0x40($out), $out
2848 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2852 pxor @XMM[8+1], @XMM[1]
2853 lea 0x30($inp), $inp
2854 pxor @XMM[8+2], @XMM[2]
2855 lea 0x80(%rsp), %rax # pass key schedule
2856 mov %edx, %r10d # pass rounds
2858 call _bsaes_decrypt8
2860 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2861 pxor 0x10(%rsp), @XMM[1]
2862 movdqu @XMM[0], 0x00($out) # write output
2863 pxor 0x20(%rsp), @XMM[6]
2864 movdqu @XMM[1], 0x10($out)
2865 movdqu @XMM[6], 0x20($out)
2866 lea 0x30($out), $out
2868 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2872 pxor @XMM[8+0], @XMM[0]
2873 lea 0x20($inp), $inp
2874 pxor @XMM[8+1], @XMM[1]
2875 lea 0x80(%rsp), %rax # pass key schedule
2876 mov %edx, %r10d # pass rounds
2878 call _bsaes_decrypt8
2880 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2881 pxor 0x10(%rsp), @XMM[1]
2882 movdqu @XMM[0], 0x00($out) # write output
2883 movdqu @XMM[1], 0x10($out)
2884 lea 0x20($out), $out
2886 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2890 pxor @XMM[0], @XMM[8]
2891 lea 0x10($inp), $inp
2892 movdqa @XMM[8], 0x20(%rbp)
2893 lea 0x20(%rbp), $arg1
2894 lea 0x20(%rbp), $arg2
2896 call asm_AES_decrypt # doesn't touch %xmm
2897 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2898 #pxor @XMM[8], @XMM[0]
2899 #lea 0x80(%rsp), %rax # pass key schedule
2900 #mov %edx, %r10d # pass rounds
2901 #call _bsaes_decrypt8
2902 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2903 movdqu @XMM[0], 0x00($out) # write output
2904 lea 0x10($out), $out
2906 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2913 movdqa .Lxts_magic(%rip), $twmask
2914 pcmpgtd @XMM[7], $twtmp
2915 pshufd \$0x13, $twtmp, $twres
2916 movdqa @XMM[7], @XMM[6]
2917 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2918 pand $twmask, $twres # isolate carry and residue
2919 movdqu ($inp), @XMM[0]
2920 pxor $twres, @XMM[7]
2922 lea 0x20(%rbp), $arg1
2923 pxor @XMM[7], @XMM[0]
2924 lea 0x20(%rbp), $arg2
2925 movdqa @XMM[0], 0x20(%rbp)
2927 call asm_AES_decrypt # doesn't touch %xmm
2928 pxor 0x20(%rbp), @XMM[7]
2930 movdqu @XMM[7], ($out)
2933 movzb 16($inp), %eax
2942 movdqu ($out), @XMM[0]
2943 lea 0x20(%rbp), $arg1
2944 pxor @XMM[6], @XMM[0]
2945 lea 0x20(%rbp), $arg2
2946 movdqa @XMM[0], 0x20(%rbp)
2948 call asm_AES_decrypt # doesn't touch %xmm
2949 pxor 0x20(%rbp), @XMM[6]
2950 movdqu @XMM[6], ($out)
2955 .Lxts_dec_bzero: # wipe key schedule [if any]
2956 movdqa %xmm0, 0x00(%rax)
2957 movdqa %xmm0, 0x10(%rax)
2958 lea 0x20(%rax), %rax
2965 $code.=<<___ if ($win64);
2966 movaps 0x40(%rbp), %xmm6
2967 movaps 0x50(%rbp), %xmm7
2968 movaps 0x60(%rbp), %xmm8
2969 movaps 0x70(%rbp), %xmm9
2970 movaps 0x80(%rbp), %xmm10
2971 movaps 0x90(%rbp), %xmm11
2972 movaps 0xa0(%rbp), %xmm12
2973 movaps 0xb0(%rbp), %xmm13
2974 movaps 0xc0(%rbp), %xmm14
2975 movaps 0xd0(%rbp), %xmm15
2976 lea 0xa0(%rax), %rax
2992 lea (%rax), %rsp # restore %rsp
2993 .cfi_def_cfa_register %rsp
2997 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
3001 .type _bsaes_const,\@object
3004 .LM0ISR: # InvShiftRows constants
3005 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
3007 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
3009 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
3010 .LBS0: # bit-slice constants
3011 .quad 0x5555555555555555, 0x5555555555555555
3013 .quad 0x3333333333333333, 0x3333333333333333
3015 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
3016 .LSR: # shiftrows constants
3017 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
3019 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
3021 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
3022 .LSWPUP: # byte-swap upper dword
3023 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
3025 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
3026 .LADD1: # counter increment constants
3027 .quad 0x0000000000000000, 0x0000000100000000
3029 .quad 0x0000000000000000, 0x0000000200000000
3031 .quad 0x0000000000000000, 0x0000000300000000
3033 .quad 0x0000000000000000, 0x0000000400000000
3035 .quad 0x0000000000000000, 0x0000000500000000
3037 .quad 0x0000000000000000, 0x0000000600000000
3039 .quad 0x0000000000000000, 0x0000000700000000
3041 .quad 0x0000000000000000, 0x0000000800000000
3045 .quad 0x0101010101010101, 0x0101010101010101
3046 .quad 0x0202020202020202, 0x0202020202020202
3047 .quad 0x0404040404040404, 0x0404040404040404
3048 .quad 0x0808080808080808, 0x0808080808080808
3050 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
3052 .quad 0x6363636363636363, 0x6363636363636363
3053 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3055 .size _bsaes_const,.-_bsaes_const
3058 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3059 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3067 .extern __imp_RtlVirtualUnwind
3068 .type se_handler,\@abi-omnipotent
3082 mov 120($context),%rax # pull context->Rax
3083 mov 248($context),%rbx # pull context->Rip
3085 mov 8($disp),%rsi # disp->ImageBase
3086 mov 56($disp),%r11 # disp->HandlerData
3088 mov 0(%r11),%r10d # HandlerData[0]
3089 lea (%rsi,%r10),%r10 # prologue label
3090 cmp %r10,%rbx # context->Rip<=prologue label
3093 mov 4(%r11),%r10d # HandlerData[1]
3094 lea (%rsi,%r10),%r10 # epilogue label
3095 cmp %r10,%rbx # context->Rip>=epilogue label
3098 mov 8(%r11),%r10d # HandlerData[2]
3099 lea (%rsi,%r10),%r10 # tail label
3100 cmp %r10,%rbx # context->Rip>=tail label
3103 mov 160($context),%rax # pull context->Rbp
3105 lea 0x40(%rax),%rsi # %xmm save area
3106 lea 512($context),%rdi # &context.Xmm6
3107 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3108 .long 0xa548f3fc # cld; rep movsq
3109 lea 0xa0+0x78(%rax),%rax # adjust stack pointer: skip 0xa0+0x48 frame and six pushed regs
3118 mov %rbx,144($context) # restore context->Rbx
3119 mov %rbp,160($context) # restore context->Rbp
3120 mov %r12,216($context) # restore context->R12
3121 mov %r13,224($context) # restore context->R13
3122 mov %r14,232($context) # restore context->R14
3123 mov %r15,240($context) # restore context->R15
3126 mov %rax,152($context) # restore context->Rsp
3128 mov 40($disp),%rdi # disp->ContextRecord
3129 mov $context,%rsi # context
3130 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3131 .long 0xa548f3fc # cld; rep movsq
3134 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3135 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3136 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3137 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3138 mov 40(%rsi),%r10 # disp->ContextRecord
3139 lea 56(%rsi),%r11 # &disp->HandlerData
3140 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3141 mov %r10,32(%rsp) # arg5
3142 mov %r11,40(%rsp) # arg6
3143 mov %r12,48(%rsp) # arg7
3144 mov %rcx,56(%rsp) # arg8, (NULL)
3145 call *__imp_RtlVirtualUnwind(%rip)
3147 mov \$1,%eax # ExceptionContinueSearch
3159 .size se_handler,.-se_handler
3164 $code.=<<___ if ($ecb);
3165 .rva .Lecb_enc_prologue
3166 .rva .Lecb_enc_epilogue
3169 .rva .Lecb_dec_prologue
3170 .rva .Lecb_dec_epilogue
3174 .rva .Lcbc_dec_prologue
3175 .rva .Lcbc_dec_epilogue
3178 .rva .Lctr_enc_prologue
3179 .rva .Lctr_enc_epilogue
3182 .rva .Lxts_enc_prologue
3183 .rva .Lxts_enc_epilogue
3186 .rva .Lxts_dec_prologue
3187 .rva .Lxts_dec_epilogue
3193 $code.=<<___ if ($ecb);
3197 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3203 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3211 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3217 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3223 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3229 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3235 $code =~ s/\`([^\`]*)\`/eval($1)/gem;