2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Author: Emilia Käsper and Peter Schwabe ###
15 ### Date: 2009-03-19 ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
24 # Started as transliteration to "perlasm" the original code has
25 # undergone following changes:
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - above was possible thanks to mixcolumns() modification that
31 # allowed to feed its output back to aesenc[last], this was
32 # achieved at the cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement a key setup subroutine; instead it
35 # relies on conversion of "conventional" key schedule as returned
36 # by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which allowed
38 # to skip one shiftrows(), reduce bit-sliced key schedule and
39 # speed-up conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
42 # Resulting performance in CPU cycles spent to encrypt one byte out
43 # of 4096-byte buffer with 128-bit key is:
45 # Emilia's this(*) difference
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
52 # (*) Comparison is not completely fair, because "this" is ECB,
53 # i.e. no extra processing such as counter values calculation
54 # and xor-ing input as in Emilia's CTR implementation is
55 # performed. However, the CTR calculations account for no more
56 # than 1% of total time, so comparison is *rather* fair.
58 # (**) Results were collected on Westmere, which is considered to
59 # be equivalent to Nehalem for this code.
61 # As for the key schedule conversion subroutine: the interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has an impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
93 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
94 # suboptimal, but XTS is meant to be used with larger blocks...
100 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
102 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
104 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
105 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
106 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
107 die "can't locate x86_64-xlate.pl";
109 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
112 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
113 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
114 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
117 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
120 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
121 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
126 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
127 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
131 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
132 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
154 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
155 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
175 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
176 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
180 &InvInBasisChange (@b);
181 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
182 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
185 sub InvInBasisChange { # OutBasisChange in reverse
186 my @b=@_[5,1,2,6,3,7,0,4];
204 sub InvOutBasisChange { # InBasisChange in reverse
205 my @b=@_[2,5,7,3,6,1,0,4];
226 #;*************************************************************
227 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
228 #;*************************************************************
229 my ($x0,$x1,$y0,$y1,$t0)=@_;
242 sub Mul_GF4_N { # not used, see next subroutine
243 # multiply and scale by N
244 my ($x0,$x1,$y0,$y1,$t0)=@_;
258 # interleaved Mul_GF4_N and Mul_GF4
259 my ($x0,$x1,$y0,$y1,$t0,
260 $x2,$x3,$y2,$y3,$t1)=@_;
288 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
295 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
296 @x[2], @x[3], @y[2], @y[3], @t[2]);
308 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
309 @x[6], @x[7], @y[2], @y[3], @t[2]);
314 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
323 #;********************************************************************
324 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
325 #;********************************************************************
329 # direct optimizations from hardware
384 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
386 # new smaller inversion
420 # output in s3, s2, s1, t1
422 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
424 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
425 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
427 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
430 # AES linear components
436 pxor 0x00($key),@x[0]
437 pxor 0x10($key),@x[1]
438 pxor 0x20($key),@x[2]
439 pxor 0x30($key),@x[3]
442 pxor 0x40($key),@x[4]
443 pxor 0x50($key),@x[5]
446 pxor 0x60($key),@x[6]
447 pxor 0x70($key),@x[7]
457 # modified to emit output in order suitable for feeding back to aesenc[last]
460 my $inv=@_[16]; # optional
462 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
463 pshufd \$0x93, @x[1], @t[1]
464 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
465 pshufd \$0x93, @x[2], @t[2]
467 pshufd \$0x93, @x[3], @t[3]
469 pshufd \$0x93, @x[4], @t[4]
471 pshufd \$0x93, @x[5], @t[5]
473 pshufd \$0x93, @x[6], @t[6]
475 pshufd \$0x93, @x[7], @t[7]
482 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
484 pshufd \$0x4E, @x[1], @x[1]
490 pshufd \$0x4E, @x[4], @t[0]
492 pshufd \$0x4E, @x[5], @t[1]
494 pshufd \$0x4E, @x[3], @x[4]
496 pshufd \$0x4E, @x[7], @x[5]
498 pshufd \$0x4E, @x[6], @x[3]
500 pshufd \$0x4E, @x[2], @x[6]
503 $code.=<<___ if (!$inv);
511 $code.=<<___ if ($inv);
524 sub InvMixColumns_orig {
529 # multiplication by 0x0e
530 pshufd \$0x93, @x[7], @t[7]
532 pxor @x[5], @x[7] # 7 5
533 pxor @x[5], @x[2] # 2 5
534 pshufd \$0x93, @x[0], @t[0]
536 pxor @x[0], @x[5] # 5 0 [1]
537 pxor @x[1], @x[0] # 0 1
538 pshufd \$0x93, @x[1], @t[1]
539 pxor @x[2], @x[1] # 1 25
540 pxor @x[6], @x[0] # 01 6 [2]
541 pxor @x[3], @x[1] # 125 3 [4]
542 pshufd \$0x93, @x[3], @t[3]
543 pxor @x[0], @x[2] # 25 016 [3]
544 pxor @x[7], @x[3] # 3 75
545 pxor @x[6], @x[7] # 75 6 [0]
546 pshufd \$0x93, @x[6], @t[6]
548 pxor @x[4], @x[6] # 6 4
549 pxor @x[3], @x[4] # 4 375 [6]
550 pxor @x[7], @x[3] # 375 756=36
551 pxor @t[5], @x[6] # 64 5 [7]
552 pxor @t[2], @x[3] # 36 2
553 pxor @t[4], @x[3] # 362 4 [5]
554 pshufd \$0x93, @t[5], @t[5]
556 my @y = @x[7,5,0,2,1,3,4,6];
558 # multiplication by 0x0b
562 pshufd \$0x93, @t[2], @t[2]
566 pshufd \$0x93, @t[4], @t[4]
567 pxor @t[6], @t[7] # clobber t[7]
571 pshufd \$0x93, @t[0], @t[0]
575 pshufd \$0x93, @t[1], @t[1]
579 pshufd \$0x93, @t[2], @t[2]
583 pshufd \$0x93, @t[3], @t[3]
589 pxor @t[5], @t[7] # clobber t[7] even more
592 pshufd \$0x93, @t[4], @t[4]
597 pshufd \$0x93, @t[5], @t[5]
598 pxor @t[6], @t[7] # restore t[7]
600 # multiplication by 0x0d
603 pshufd \$0x93, @t[6], @t[6]
607 pshufd \$0x93, @t[7], @t[7]
616 pshufd \$0x93, @t[0], @t[0]
620 pshufd \$0x93, @t[1], @t[1]
625 pshufd \$0x93, @t[2], @t[2]
627 pxor @t[3], @t[6] # clobber t[6]
634 pshufd \$0x93, @t[4], @t[4]
637 pxor @t[3], @t[6] # restore t[6]
639 pshufd \$0x93, @t[5], @t[5]
640 pshufd \$0x93, @t[6], @t[6]
641 pshufd \$0x93, @t[7], @t[7]
642 pshufd \$0x93, @t[3], @t[3]
644 # multiplication by 0x09
646 pxor @y[1], @t[1] # t[1]=y[1]
647 pxor @t[5], @t[0] # clobber t[0]
650 pxor @y[0], @t[0] # t[0]=y[0]
652 pxor @t[7], @t[6] # clobber t[6]
655 pxor @y[4], @t[4] # t[4]=y[4]
657 pxor @y[3], @t[3] # t[3]=y[3]
659 pxor @y[2], @t[2] # t[2]=y[2]
661 pxor @y[5], @t[5] # t[5]=y[5]
664 pxor @y[6], @t[6] # t[6]=y[6]
665 pxor @y[7], @t[7] # t[7]=y[7]
682 # Thanks to Jussi Kivilinna for providing pointer to
684 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
685 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
686 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
687 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
690 # multiplication by 0x05-0x00-0x04-0x00
691 pshufd \$0x4E, @x[0], @t[0]
692 pshufd \$0x4E, @x[6], @t[6]
694 pshufd \$0x4E, @x[7], @t[7]
696 pshufd \$0x4E, @x[1], @t[1]
698 pshufd \$0x4E, @x[2], @t[2]
700 pshufd \$0x4E, @x[3], @t[3]
704 pshufd \$0x4E, @x[4], @t[4]
708 pshufd \$0x4E, @x[5], @t[5]
723 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
726 sub aesenc { # not used
730 movdqa 0x30($const),@t[0] # .LSR
732 &ShiftRows (@b,@t[0]);
734 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
737 sub aesenclast { # not used
741 movdqa 0x40($const),@t[0] # .LSRM0
743 &ShiftRows (@b,@t[0]);
746 pxor 0x00($key),@b[0]
747 pxor 0x10($key),@b[1]
748 pxor 0x20($key),@b[4]
749 pxor 0x30($key),@b[6]
750 pxor 0x40($key),@b[3]
751 pxor 0x50($key),@b[7]
752 pxor 0x60($key),@b[2]
753 pxor 0x70($key),@b[5]
758 my ($a,$b,$n,$mask,$t)=@_;
770 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
790 my @x=reverse(@_[0..7]);
791 my ($t0,$t1,$t2,$t3)=@_[8..11];
793 movdqa 0x00($const),$t0 # .LBS0
794 movdqa 0x10($const),$t1 # .LBS1
796 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
797 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
799 movdqa 0x20($const),$t0 # .LBS2
801 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
802 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
804 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
805 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
811 .extern asm_AES_encrypt
812 .extern asm_AES_decrypt
814 .type _bsaes_encrypt8,\@abi-omnipotent
817 lea .LBS0(%rip), $const # constants table
819 movdqa ($key), @XMM[9] # round 0 key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pxor @XMM[9], @XMM[2]
825 pxor @XMM[9], @XMM[3]
826 pshufb @XMM[8], @XMM[0]
827 pshufb @XMM[8], @XMM[1]
828 pxor @XMM[9], @XMM[4]
829 pxor @XMM[9], @XMM[5]
830 pshufb @XMM[8], @XMM[2]
831 pshufb @XMM[8], @XMM[3]
832 pxor @XMM[9], @XMM[6]
833 pxor @XMM[9], @XMM[7]
834 pshufb @XMM[8], @XMM[4]
835 pshufb @XMM[8], @XMM[5]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838 _bsaes_encrypt8_bitslice:
840 &bitslice (@XMM[0..7, 8..11]);
847 &ShiftRows (@XMM[0..7, 8]);
848 $code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
856 movdqa 0x30($const), @XMM[8] # .LSR
858 movdqa 0x40($const), @XMM[8] # .LSRM0
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
876 .size _bsaes_encrypt8,.-_bsaes_encrypt8
878 .type _bsaes_decrypt8,\@abi-omnipotent
881 lea .LBS0(%rip), $const # constants table
883 movdqa ($key), @XMM[9] # round 0 key
885 movdqa -0x30($const), @XMM[8] # .LM0ISR
886 pxor @XMM[9], @XMM[0] # xor with round0 key
887 pxor @XMM[9], @XMM[1]
888 pxor @XMM[9], @XMM[2]
889 pxor @XMM[9], @XMM[3]
890 pshufb @XMM[8], @XMM[0]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[4]
893 pxor @XMM[9], @XMM[5]
894 pshufb @XMM[8], @XMM[2]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[6]
897 pxor @XMM[9], @XMM[7]
898 pshufb @XMM[8], @XMM[4]
899 pshufb @XMM[8], @XMM[5]
900 pshufb @XMM[8], @XMM[6]
901 pshufb @XMM[8], @XMM[7]
903 &bitslice (@XMM[0..7, 8..11]);
910 &ShiftRows (@XMM[0..7, 8]);
911 $code.=".Ldec_sbox:\n";
912 &InvSbox (@XMM[0..7, 8..15]);
917 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
919 movdqa -0x10($const), @XMM[8] # .LISR
921 movdqa -0x20($const), @XMM[8] # .LISRM0
926 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
928 movdqa ($key), @XMM[8] # last round key
929 pxor @XMM[8], @XMM[6]
930 pxor @XMM[8], @XMM[4]
931 pxor @XMM[8], @XMM[2]
932 pxor @XMM[8], @XMM[7]
933 pxor @XMM[8], @XMM[3]
934 pxor @XMM[8], @XMM[5]
935 pxor @XMM[8], @XMM[0]
936 pxor @XMM[8], @XMM[1]
938 .size _bsaes_decrypt8,.-_bsaes_decrypt8
942 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
945 my @x=reverse(@_[0..7]);
946 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
948 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
950 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
954 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
956 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
958 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
964 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
965 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
969 .type _bsaes_key_convert,\@abi-omnipotent
972 lea .Lmasks(%rip), $const
973 movdqu ($inp), %xmm7 # load round 0 key
975 movdqa 0x00($const), %xmm0 # 0x01...
976 movdqa 0x10($const), %xmm1 # 0x02...
977 movdqa 0x20($const), %xmm2 # 0x04...
978 movdqa 0x30($const), %xmm3 # 0x08...
979 movdqa 0x40($const), %xmm4 # .LM0
980 pcmpeqd %xmm5, %xmm5 # .LNOT
982 movdqu ($inp), %xmm6 # load round 1 key
983 movdqa %xmm7, ($out) # save round 0 key
989 pshufb %xmm4, %xmm6 # .LM0
998 psllq \$4, %xmm0 # 0x10...
1000 pcmpeqb %xmm1, %xmm9
1001 psllq \$4, %xmm1 # 0x20...
1005 movdqa %xmm0, %xmm12
1006 pcmpeqb %xmm2, %xmm10
1007 psllq \$4, %xmm2 # 0x40...
1008 movdqa %xmm1, %xmm13
1009 pcmpeqb %xmm3, %xmm11
1010 psllq \$4, %xmm3 # 0x80...
1012 movdqa %xmm2, %xmm14
1013 movdqa %xmm3, %xmm15
1014 pxor %xmm5, %xmm8 # "pnot"
1019 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1020 pcmpeqb %xmm0, %xmm12
1021 psrlq \$4, %xmm0 # 0x01...
1022 movdqa %xmm9, 0x10($out)
1023 pcmpeqb %xmm1, %xmm13
1024 psrlq \$4, %xmm1 # 0x02...
1025 lea 0x10($inp), $inp
1029 movdqa %xmm10, 0x20($out)
1030 pcmpeqb %xmm2, %xmm14
1031 psrlq \$4, %xmm2 # 0x04...
1032 movdqa %xmm11, 0x30($out)
1033 pcmpeqb %xmm3, %xmm15
1034 psrlq \$4, %xmm3 # 0x08...
1035 movdqu ($inp), %xmm6 # load next round key
1037 pxor %xmm5, %xmm13 # "pnot"
1039 movdqa %xmm12, 0x40($out)
1040 movdqa %xmm13, 0x50($out)
1041 movdqa %xmm14, 0x60($out)
1042 movdqa %xmm15, 0x70($out)
1047 movdqa 0x50($const), %xmm7 # .L63
1048 #movdqa %xmm6, ($out) # don't save last round key
1050 .size _bsaes_key_convert,.-_bsaes_key_convert
1054 if (0 && !$win64) { # following four functions are unsupported interface
1055 # used for benchmarking...
1057 .globl bsaes_enc_key_convert
1058 .type bsaes_enc_key_convert,\@function,2
1060 bsaes_enc_key_convert:
1061 mov 240($inp),%r10d # pass rounds
1062 mov $inp,%rcx # pass key
1063 mov $out,%rax # pass key schedule
1064 call _bsaes_key_convert
1065 pxor %xmm6,%xmm7 # fix up last round key
1066 movdqa %xmm7,(%rax) # save last round key
1068 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1070 .globl bsaes_encrypt_128
1071 .type bsaes_encrypt_128,\@function,4
1075 movdqu 0x00($inp), @XMM[0] # load input
1076 movdqu 0x10($inp), @XMM[1]
1077 movdqu 0x20($inp), @XMM[2]
1078 movdqu 0x30($inp), @XMM[3]
1079 movdqu 0x40($inp), @XMM[4]
1080 movdqu 0x50($inp), @XMM[5]
1081 movdqu 0x60($inp), @XMM[6]
1082 movdqu 0x70($inp), @XMM[7]
1083 mov $key, %rax # pass the $key
1084 lea 0x80($inp), $inp
1087 call _bsaes_encrypt8
1089 movdqu @XMM[0], 0x00($out) # write output
1090 movdqu @XMM[1], 0x10($out)
1091 movdqu @XMM[4], 0x20($out)
1092 movdqu @XMM[6], 0x30($out)
1093 movdqu @XMM[3], 0x40($out)
1094 movdqu @XMM[7], 0x50($out)
1095 movdqu @XMM[2], 0x60($out)
1096 movdqu @XMM[5], 0x70($out)
1097 lea 0x80($out), $out
1101 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1103 .globl bsaes_dec_key_convert
1104 .type bsaes_dec_key_convert,\@function,2
1106 bsaes_dec_key_convert:
1107 mov 240($inp),%r10d # pass rounds
1108 mov $inp,%rcx # pass key
1109 mov $out,%rax # pass key schedule
1110 call _bsaes_key_convert
1111 pxor ($out),%xmm7 # fix up round 0 key
1112 movdqa %xmm6,(%rax) # save last round key
1115 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1117 .globl bsaes_decrypt_128
1118 .type bsaes_decrypt_128,\@function,4
1122 movdqu 0x00($inp), @XMM[0] # load input
1123 movdqu 0x10($inp), @XMM[1]
1124 movdqu 0x20($inp), @XMM[2]
1125 movdqu 0x30($inp), @XMM[3]
1126 movdqu 0x40($inp), @XMM[4]
1127 movdqu 0x50($inp), @XMM[5]
1128 movdqu 0x60($inp), @XMM[6]
1129 movdqu 0x70($inp), @XMM[7]
1130 mov $key, %rax # pass the $key
1131 lea 0x80($inp), $inp
1134 call _bsaes_decrypt8
1136 movdqu @XMM[0], 0x00($out) # write output
1137 movdqu @XMM[1], 0x10($out)
1138 movdqu @XMM[6], 0x20($out)
1139 movdqu @XMM[4], 0x30($out)
1140 movdqu @XMM[2], 0x40($out)
1141 movdqu @XMM[7], 0x50($out)
1142 movdqu @XMM[3], 0x60($out)
1143 movdqu @XMM[5], 0x70($out)
1144 lea 0x80($out), $out
1148 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1152 ######################################################################
1156 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1157 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1158 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1162 .globl bsaes_ecb_encrypt_blocks
1163 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1165 bsaes_ecb_encrypt_blocks:
1174 lea -0x48(%rsp),%rsp
1176 $code.=<<___ if ($win64);
1177 lea -0xa0(%rsp), %rsp
1178 movaps %xmm6, 0x40(%rsp)
1179 movaps %xmm7, 0x50(%rsp)
1180 movaps %xmm8, 0x60(%rsp)
1181 movaps %xmm9, 0x70(%rsp)
1182 movaps %xmm10, 0x80(%rsp)
1183 movaps %xmm11, 0x90(%rsp)
1184 movaps %xmm12, 0xa0(%rsp)
1185 movaps %xmm13, 0xb0(%rsp)
1186 movaps %xmm14, 0xc0(%rsp)
1187 movaps %xmm15, 0xd0(%rsp)
1191 mov %rsp,%rbp # backup %rsp
1192 mov 240($arg4),%eax # rounds
1193 mov $arg1,$inp # backup arguments
1200 mov %eax,%ebx # backup rounds
1201 shl \$7,%rax # 128 bytes per inner round key
1202 sub \$`128-32`,%rax # size of bit-sliced key schedule
1204 mov %rsp,%rax # pass key schedule
1205 mov $key,%rcx # pass key
1206 mov %ebx,%r10d # pass rounds
1207 call _bsaes_key_convert
1208 pxor %xmm6,%xmm7 # fix up last round key
1209 movdqa %xmm7,(%rax) # save last round key
1213 movdqu 0x00($inp), @XMM[0] # load input
1214 movdqu 0x10($inp), @XMM[1]
1215 movdqu 0x20($inp), @XMM[2]
1216 movdqu 0x30($inp), @XMM[3]
1217 movdqu 0x40($inp), @XMM[4]
1218 movdqu 0x50($inp), @XMM[5]
1219 mov %rsp, %rax # pass key schedule
1220 movdqu 0x60($inp), @XMM[6]
1221 mov %ebx,%r10d # pass rounds
1222 movdqu 0x70($inp), @XMM[7]
1223 lea 0x80($inp), $inp
1225 call _bsaes_encrypt8
1227 movdqu @XMM[0], 0x00($out) # write output
1228 movdqu @XMM[1], 0x10($out)
1229 movdqu @XMM[4], 0x20($out)
1230 movdqu @XMM[6], 0x30($out)
1231 movdqu @XMM[3], 0x40($out)
1232 movdqu @XMM[7], 0x50($out)
1233 movdqu @XMM[2], 0x60($out)
1234 movdqu @XMM[5], 0x70($out)
1235 lea 0x80($out), $out
1242 movdqu 0x00($inp), @XMM[0] # load input
1243 mov %rsp, %rax # pass key schedule
1244 mov %ebx,%r10d # pass rounds
1247 movdqu 0x10($inp), @XMM[1]
1249 movdqu 0x20($inp), @XMM[2]
1252 movdqu 0x30($inp), @XMM[3]
1254 movdqu 0x40($inp), @XMM[4]
1257 movdqu 0x50($inp), @XMM[5]
1259 movdqu 0x60($inp), @XMM[6]
1260 call _bsaes_encrypt8
1261 movdqu @XMM[0], 0x00($out) # write output
1262 movdqu @XMM[1], 0x10($out)
1263 movdqu @XMM[4], 0x20($out)
1264 movdqu @XMM[6], 0x30($out)
1265 movdqu @XMM[3], 0x40($out)
1266 movdqu @XMM[7], 0x50($out)
1267 movdqu @XMM[2], 0x60($out)
1271 call _bsaes_encrypt8
1272 movdqu @XMM[0], 0x00($out) # write output
1273 movdqu @XMM[1], 0x10($out)
1274 movdqu @XMM[4], 0x20($out)
1275 movdqu @XMM[6], 0x30($out)
1276 movdqu @XMM[3], 0x40($out)
1277 movdqu @XMM[7], 0x50($out)
1281 call _bsaes_encrypt8
1282 movdqu @XMM[0], 0x00($out) # write output
1283 movdqu @XMM[1], 0x10($out)
1284 movdqu @XMM[4], 0x20($out)
1285 movdqu @XMM[6], 0x30($out)
1286 movdqu @XMM[3], 0x40($out)
1290 call _bsaes_encrypt8
1291 movdqu @XMM[0], 0x00($out) # write output
1292 movdqu @XMM[1], 0x10($out)
1293 movdqu @XMM[4], 0x20($out)
1294 movdqu @XMM[6], 0x30($out)
1298 call _bsaes_encrypt8
1299 movdqu @XMM[0], 0x00($out) # write output
1300 movdqu @XMM[1], 0x10($out)
1301 movdqu @XMM[4], 0x20($out)
1305 call _bsaes_encrypt8
1306 movdqu @XMM[0], 0x00($out) # write output
1307 movdqu @XMM[1], 0x10($out)
1311 call _bsaes_encrypt8
1312 movdqu @XMM[0], 0x00($out) # write output
1319 call asm_AES_encrypt
1328 .Lecb_enc_bzero: # wipe key schedule [if any]
1329 movdqa %xmm0, 0x00(%rax)
1330 movdqa %xmm0, 0x10(%rax)
1331 lea 0x20(%rax), %rax
1335 lea (%rbp),%rsp # restore %rsp
1337 $code.=<<___ if ($win64);
1338 movaps 0x40(%rbp), %xmm6
1339 movaps 0x50(%rbp), %xmm7
1340 movaps 0x60(%rbp), %xmm8
1341 movaps 0x70(%rbp), %xmm9
1342 movaps 0x80(%rbp), %xmm10
1343 movaps 0x90(%rbp), %xmm11
1344 movaps 0xa0(%rbp), %xmm12
1345 movaps 0xb0(%rbp), %xmm13
1346 movaps 0xc0(%rbp), %xmm14
1347 movaps 0xd0(%rbp), %xmm15
1348 lea 0xa0(%rbp), %rsp
1351 mov 0x48(%rsp), %r15
1352 mov 0x50(%rsp), %r14
1353 mov 0x58(%rsp), %r13
1354 mov 0x60(%rsp), %r12
1355 mov 0x68(%rsp), %rbx
1356 mov 0x70(%rsp), %rax
1357 lea 0x78(%rsp), %rsp
1361 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1363 .globl bsaes_ecb_decrypt_blocks
1364 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1366 bsaes_ecb_decrypt_blocks:
1375 lea -0x48(%rsp),%rsp
1377 $code.=<<___ if ($win64);
1378 lea -0xa0(%rsp), %rsp
1379 movaps %xmm6, 0x40(%rsp)
1380 movaps %xmm7, 0x50(%rsp)
1381 movaps %xmm8, 0x60(%rsp)
1382 movaps %xmm9, 0x70(%rsp)
1383 movaps %xmm10, 0x80(%rsp)
1384 movaps %xmm11, 0x90(%rsp)
1385 movaps %xmm12, 0xa0(%rsp)
1386 movaps %xmm13, 0xb0(%rsp)
1387 movaps %xmm14, 0xc0(%rsp)
1388 movaps %xmm15, 0xd0(%rsp)
1392 mov %rsp,%rbp # backup %rsp
1393 mov 240($arg4),%eax # rounds
1394 mov $arg1,$inp # backup arguments
1401 mov %eax,%ebx # backup rounds
1402 shl \$7,%rax # 128 bytes per inner round key
1403 sub \$`128-32`,%rax # size of bit-sliced key schedule
1405 mov %rsp,%rax # pass key schedule
1406 mov $key,%rcx # pass key
1407 mov %ebx,%r10d # pass rounds
1408 call _bsaes_key_convert
1409 pxor (%rsp),%xmm7 # fix up 0 round key
1410 movdqa %xmm6,(%rax) # save last round key
1415 movdqu 0x00($inp), @XMM[0] # load input
1416 movdqu 0x10($inp), @XMM[1]
1417 movdqu 0x20($inp), @XMM[2]
1418 movdqu 0x30($inp), @XMM[3]
1419 movdqu 0x40($inp), @XMM[4]
1420 movdqu 0x50($inp), @XMM[5]
1421 mov %rsp, %rax # pass key schedule
1422 movdqu 0x60($inp), @XMM[6]
1423 mov %ebx,%r10d # pass rounds
1424 movdqu 0x70($inp), @XMM[7]
1425 lea 0x80($inp), $inp
1427 call _bsaes_decrypt8
1429 movdqu @XMM[0], 0x00($out) # write output
1430 movdqu @XMM[1], 0x10($out)
1431 movdqu @XMM[6], 0x20($out)
1432 movdqu @XMM[4], 0x30($out)
1433 movdqu @XMM[2], 0x40($out)
1434 movdqu @XMM[7], 0x50($out)
1435 movdqu @XMM[3], 0x60($out)
1436 movdqu @XMM[5], 0x70($out)
1437 lea 0x80($out), $out
1444 movdqu 0x00($inp), @XMM[0] # load input
1445 mov %rsp, %rax # pass key schedule
1446 mov %ebx,%r10d # pass rounds
1449 movdqu 0x10($inp), @XMM[1]
1451 movdqu 0x20($inp), @XMM[2]
1454 movdqu 0x30($inp), @XMM[3]
1456 movdqu 0x40($inp), @XMM[4]
1459 movdqu 0x50($inp), @XMM[5]
1461 movdqu 0x60($inp), @XMM[6]
1462 call _bsaes_decrypt8
1463 movdqu @XMM[0], 0x00($out) # write output
1464 movdqu @XMM[1], 0x10($out)
1465 movdqu @XMM[6], 0x20($out)
1466 movdqu @XMM[4], 0x30($out)
1467 movdqu @XMM[2], 0x40($out)
1468 movdqu @XMM[7], 0x50($out)
1469 movdqu @XMM[3], 0x60($out)
1473 call _bsaes_decrypt8
1474 movdqu @XMM[0], 0x00($out) # write output
1475 movdqu @XMM[1], 0x10($out)
1476 movdqu @XMM[6], 0x20($out)
1477 movdqu @XMM[4], 0x30($out)
1478 movdqu @XMM[2], 0x40($out)
1479 movdqu @XMM[7], 0x50($out)
1483 call _bsaes_decrypt8
1484 movdqu @XMM[0], 0x00($out) # write output
1485 movdqu @XMM[1], 0x10($out)
1486 movdqu @XMM[6], 0x20($out)
1487 movdqu @XMM[4], 0x30($out)
1488 movdqu @XMM[2], 0x40($out)
1492 call _bsaes_decrypt8
1493 movdqu @XMM[0], 0x00($out) # write output
1494 movdqu @XMM[1], 0x10($out)
1495 movdqu @XMM[6], 0x20($out)
1496 movdqu @XMM[4], 0x30($out)
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 movdqu @XMM[6], 0x20($out)
1507 call _bsaes_decrypt8
1508 movdqu @XMM[0], 0x00($out) # write output
1509 movdqu @XMM[1], 0x10($out)
1513 call _bsaes_decrypt8
1514 movdqu @XMM[0], 0x00($out) # write output
1521 call asm_AES_decrypt
1530 .Lecb_dec_bzero: # wipe key schedule [if any]
1531 movdqa %xmm0, 0x00(%rax)
1532 movdqa %xmm0, 0x10(%rax)
1533 lea 0x20(%rax), %rax
1537 lea (%rbp),%rsp # restore %rsp
1539 $code.=<<___ if ($win64);
1540 movaps 0x40(%rbp), %xmm6
1541 movaps 0x50(%rbp), %xmm7
1542 movaps 0x60(%rbp), %xmm8
1543 movaps 0x70(%rbp), %xmm9
1544 movaps 0x80(%rbp), %xmm10
1545 movaps 0x90(%rbp), %xmm11
1546 movaps 0xa0(%rbp), %xmm12
1547 movaps 0xb0(%rbp), %xmm13
1548 movaps 0xc0(%rbp), %xmm14
1549 movaps 0xd0(%rbp), %xmm15
1550 lea 0xa0(%rbp), %rsp
1553 mov 0x48(%rsp), %r15
1554 mov 0x50(%rsp), %r14
1555 mov 0x58(%rsp), %r13
1556 mov 0x60(%rsp), %r12
1557 mov 0x68(%rsp), %rbx
1558 mov 0x70(%rsp), %rax
1559 lea 0x78(%rsp), %rsp
1563 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1567 .extern asm_AES_cbc_encrypt
1568 .globl bsaes_cbc_encrypt
1569 .type bsaes_cbc_encrypt,\@abi-omnipotent
1573 $code.=<<___ if ($win64);
1574 mov 48(%rsp),$arg6 # pull direction flag
1578 jne asm_AES_cbc_encrypt
1580 jb asm_AES_cbc_encrypt
1590 lea -0x48(%rsp), %rsp
1592 $code.=<<___ if ($win64);
1593 mov 0xa0(%rsp),$arg5 # pull ivp
1594 lea -0xa0(%rsp), %rsp
1595 movaps %xmm6, 0x40(%rsp)
1596 movaps %xmm7, 0x50(%rsp)
1597 movaps %xmm8, 0x60(%rsp)
1598 movaps %xmm9, 0x70(%rsp)
1599 movaps %xmm10, 0x80(%rsp)
1600 movaps %xmm11, 0x90(%rsp)
1601 movaps %xmm12, 0xa0(%rsp)
1602 movaps %xmm13, 0xb0(%rsp)
1603 movaps %xmm14, 0xc0(%rsp)
1604 movaps %xmm15, 0xd0(%rsp)
1608 mov %rsp, %rbp # backup %rsp
1609 mov 240($arg4), %eax # rounds
1610 mov $arg1, $inp # backup arguments
1615 shr \$4, $len # bytes to blocks
1617 mov %eax, %edx # rounds
1618 shl \$7, %rax # 128 bytes per inner round key
1619 sub \$`128-32`, %rax # size of bit-sliced key schedule
1622 mov %rsp, %rax # pass key schedule
1623 mov $key, %rcx # pass key
1624 mov %edx, %r10d # pass rounds
1625 call _bsaes_key_convert
1626 pxor (%rsp),%xmm7 # fix up 0 round key
1627 movdqa %xmm6,(%rax) # save last round key
1630 movdqu (%rbx), @XMM[15] # load IV
1633 movdqu 0x00($inp), @XMM[0] # load input
1634 movdqu 0x10($inp), @XMM[1]
1635 movdqu 0x20($inp), @XMM[2]
1636 movdqu 0x30($inp), @XMM[3]
1637 movdqu 0x40($inp), @XMM[4]
1638 movdqu 0x50($inp), @XMM[5]
1639 mov %rsp, %rax # pass key schedule
1640 movdqu 0x60($inp), @XMM[6]
1641 mov %edx,%r10d # pass rounds
1642 movdqu 0x70($inp), @XMM[7]
1643 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1645 call _bsaes_decrypt8
1647 pxor 0x20(%rbp), @XMM[0] # ^= IV
1648 movdqu 0x00($inp), @XMM[8] # re-load input
1649 movdqu 0x10($inp), @XMM[9]
1650 pxor @XMM[8], @XMM[1]
1651 movdqu 0x20($inp), @XMM[10]
1652 pxor @XMM[9], @XMM[6]
1653 movdqu 0x30($inp), @XMM[11]
1654 pxor @XMM[10], @XMM[4]
1655 movdqu 0x40($inp), @XMM[12]
1656 pxor @XMM[11], @XMM[2]
1657 movdqu 0x50($inp), @XMM[13]
1658 pxor @XMM[12], @XMM[7]
1659 movdqu 0x60($inp), @XMM[14]
1660 pxor @XMM[13], @XMM[3]
1661 movdqu 0x70($inp), @XMM[15] # IV
1662 pxor @XMM[14], @XMM[5]
1663 movdqu @XMM[0], 0x00($out) # write output
1664 lea 0x80($inp), $inp
1665 movdqu @XMM[1], 0x10($out)
1666 movdqu @XMM[6], 0x20($out)
1667 movdqu @XMM[4], 0x30($out)
1668 movdqu @XMM[2], 0x40($out)
1669 movdqu @XMM[7], 0x50($out)
1670 movdqu @XMM[3], 0x60($out)
1671 movdqu @XMM[5], 0x70($out)
1672 lea 0x80($out), $out
1679 movdqu 0x00($inp), @XMM[0] # load input
1680 mov %rsp, %rax # pass key schedule
1681 mov %edx, %r10d # pass rounds
1684 movdqu 0x10($inp), @XMM[1]
1686 movdqu 0x20($inp), @XMM[2]
1689 movdqu 0x30($inp), @XMM[3]
1691 movdqu 0x40($inp), @XMM[4]
1694 movdqu 0x50($inp), @XMM[5]
1696 movdqu 0x60($inp), @XMM[6]
1697 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1699 pxor 0x20(%rbp), @XMM[0] # ^= IV
1700 movdqu 0x00($inp), @XMM[8] # re-load input
1701 movdqu 0x10($inp), @XMM[9]
1702 pxor @XMM[8], @XMM[1]
1703 movdqu 0x20($inp), @XMM[10]
1704 pxor @XMM[9], @XMM[6]
1705 movdqu 0x30($inp), @XMM[11]
1706 pxor @XMM[10], @XMM[4]
1707 movdqu 0x40($inp), @XMM[12]
1708 pxor @XMM[11], @XMM[2]
1709 movdqu 0x50($inp), @XMM[13]
1710 pxor @XMM[12], @XMM[7]
1711 movdqu 0x60($inp), @XMM[15] # IV
1712 pxor @XMM[13], @XMM[3]
1713 movdqu @XMM[0], 0x00($out) # write output
1714 movdqu @XMM[1], 0x10($out)
1715 movdqu @XMM[6], 0x20($out)
1716 movdqu @XMM[4], 0x30($out)
1717 movdqu @XMM[2], 0x40($out)
1718 movdqu @XMM[7], 0x50($out)
1719 movdqu @XMM[3], 0x60($out)
1723 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1724 call _bsaes_decrypt8
1725 pxor 0x20(%rbp), @XMM[0] # ^= IV
1726 movdqu 0x00($inp), @XMM[8] # re-load input
1727 movdqu 0x10($inp), @XMM[9]
1728 pxor @XMM[8], @XMM[1]
1729 movdqu 0x20($inp), @XMM[10]
1730 pxor @XMM[9], @XMM[6]
1731 movdqu 0x30($inp), @XMM[11]
1732 pxor @XMM[10], @XMM[4]
1733 movdqu 0x40($inp), @XMM[12]
1734 pxor @XMM[11], @XMM[2]
1735 movdqu 0x50($inp), @XMM[15] # IV
1736 pxor @XMM[12], @XMM[7]
1737 movdqu @XMM[0], 0x00($out) # write output
1738 movdqu @XMM[1], 0x10($out)
1739 movdqu @XMM[6], 0x20($out)
1740 movdqu @XMM[4], 0x30($out)
1741 movdqu @XMM[2], 0x40($out)
1742 movdqu @XMM[7], 0x50($out)
1746 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1747 call _bsaes_decrypt8
1748 pxor 0x20(%rbp), @XMM[0] # ^= IV
1749 movdqu 0x00($inp), @XMM[8] # re-load input
1750 movdqu 0x10($inp), @XMM[9]
1751 pxor @XMM[8], @XMM[1]
1752 movdqu 0x20($inp), @XMM[10]
1753 pxor @XMM[9], @XMM[6]
1754 movdqu 0x30($inp), @XMM[11]
1755 pxor @XMM[10], @XMM[4]
1756 movdqu 0x40($inp), @XMM[15] # IV
1757 pxor @XMM[11], @XMM[2]
1758 movdqu @XMM[0], 0x00($out) # write output
1759 movdqu @XMM[1], 0x10($out)
1760 movdqu @XMM[6], 0x20($out)
1761 movdqu @XMM[4], 0x30($out)
1762 movdqu @XMM[2], 0x40($out)
1766 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1767 call _bsaes_decrypt8
1768 pxor 0x20(%rbp), @XMM[0] # ^= IV
1769 movdqu 0x00($inp), @XMM[8] # re-load input
1770 movdqu 0x10($inp), @XMM[9]
1771 pxor @XMM[8], @XMM[1]
1772 movdqu 0x20($inp), @XMM[10]
1773 pxor @XMM[9], @XMM[6]
1774 movdqu 0x30($inp), @XMM[15] # IV
1775 pxor @XMM[10], @XMM[4]
1776 movdqu @XMM[0], 0x00($out) # write output
1777 movdqu @XMM[1], 0x10($out)
1778 movdqu @XMM[6], 0x20($out)
1779 movdqu @XMM[4], 0x30($out)
1783 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1784 call _bsaes_decrypt8
1785 pxor 0x20(%rbp), @XMM[0] # ^= IV
1786 movdqu 0x00($inp), @XMM[8] # re-load input
1787 movdqu 0x10($inp), @XMM[9]
1788 pxor @XMM[8], @XMM[1]
1789 movdqu 0x20($inp), @XMM[15] # IV
1790 pxor @XMM[9], @XMM[6]
1791 movdqu @XMM[0], 0x00($out) # write output
1792 movdqu @XMM[1], 0x10($out)
1793 movdqu @XMM[6], 0x20($out)
1797 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1798 call _bsaes_decrypt8
1799 pxor 0x20(%rbp), @XMM[0] # ^= IV
1800 movdqu 0x00($inp), @XMM[8] # re-load input
1801 movdqu 0x10($inp), @XMM[15] # IV
1802 pxor @XMM[8], @XMM[1]
1803 movdqu @XMM[0], 0x00($out) # write output
1804 movdqu @XMM[1], 0x10($out)
1809 lea 0x20(%rbp), $arg2 # buffer output
1811 call asm_AES_decrypt # doesn't touch %xmm
1812 pxor 0x20(%rbp), @XMM[15] # ^= IV
1813 movdqu @XMM[15], ($out) # write output
1814 movdqa @XMM[0], @XMM[15] # IV
1817 movdqu @XMM[15], (%rbx) # return IV
1820 .Lcbc_dec_bzero: # wipe key schedule [if any]
1821 movdqa %xmm0, 0x00(%rax)
1822 movdqa %xmm0, 0x10(%rax)
1823 lea 0x20(%rax), %rax
1827 lea (%rbp),%rsp # restore %rsp
1829 $code.=<<___ if ($win64);
1830 movaps 0x40(%rbp), %xmm6
1831 movaps 0x50(%rbp), %xmm7
1832 movaps 0x60(%rbp), %xmm8
1833 movaps 0x70(%rbp), %xmm9
1834 movaps 0x80(%rbp), %xmm10
1835 movaps 0x90(%rbp), %xmm11
1836 movaps 0xa0(%rbp), %xmm12
1837 movaps 0xb0(%rbp), %xmm13
1838 movaps 0xc0(%rbp), %xmm14
1839 movaps 0xd0(%rbp), %xmm15
1840 lea 0xa0(%rbp), %rsp
1843 mov 0x48(%rsp), %r15
1844 mov 0x50(%rsp), %r14
1845 mov 0x58(%rsp), %r13
1846 mov 0x60(%rsp), %r12
1847 mov 0x68(%rsp), %rbx
1848 mov 0x70(%rsp), %rax
1849 lea 0x78(%rsp), %rsp
1853 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
#                                 size_t len, const AES_KEY *key,
#                                 const unsigned char ivp[16]);
# Bit-sliced AES-CTR: the main loop encrypts 8 counter blocks per iteration
# via _bsaes_encrypt8_bitslice; any remaining tail (<8 blocks) falls back to
# one-block asm_AES_encrypt calls.  Only the low 32 bits of the counter are
# incremented (CTR32 semantics).
# NOTE(review): this extraction has lines elided (labels, heredoc
# terminators); the surviving lines below are kept verbatim.
1855 .globl bsaes_ctr32_encrypt_blocks
1856 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1858 bsaes_ctr32_encrypt_blocks:
1867 lea -0x48(%rsp), %rsp
# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them; the 5th argument
# (ivp) arrives on the stack rather than in a register.
1869 $code.=<<___ if ($win64);
1870 mov 0xa0(%rsp),$arg5 # pull ivp
1871 lea -0xa0(%rsp), %rsp
1872 movaps %xmm6, 0x40(%rsp)
1873 movaps %xmm7, 0x50(%rsp)
1874 movaps %xmm8, 0x60(%rsp)
1875 movaps %xmm9, 0x70(%rsp)
1876 movaps %xmm10, 0x80(%rsp)
1877 movaps %xmm11, 0x90(%rsp)
1878 movaps %xmm12, 0xa0(%rsp)
1879 movaps %xmm13, 0xb0(%rsp)
1880 movaps %xmm14, 0xc0(%rsp)
1881 movaps %xmm15, 0xd0(%rsp)
1885 mov %rsp, %rbp # backup %rsp
1886 movdqu ($arg5), %xmm0 # load counter
1887 mov 240($arg4), %eax # rounds
1888 mov $arg1, $inp # backup arguments
# The counter lives at 0x20(%rbp) for the rest of the routine; its last
# dword (the 32-bit block counter) is therefore at 0x2c(%rbp).
1892 movdqa %xmm0, 0x20(%rbp) # copy counter
1896 mov %eax, %ebx # rounds
1897 shl \$7, %rax # 128 bytes per inner round key
1898 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert the conventional AES key schedule into bit-sliced form on the
# stack; the fixed-up last round key is appended at (%rax).
1901 mov %rsp, %rax # pass key schedule
1902 mov $key, %rcx # pass key
1903 mov %ebx, %r10d # pass rounds
1904 call _bsaes_key_convert
1905 pxor %xmm6,%xmm7 # fix up last round key
1906 movdqa %xmm7,(%rax) # save last round key
1908 movdqa (%rsp), @XMM[9] # load round0 key
1909 lea .LADD1(%rip), %r11
1910 movdqa 0x20(%rbp), @XMM[0] # counter copy
1911 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1912 pshufb @XMM[8], @XMM[9] # byte swap upper part
1913 pshufb @XMM[8], @XMM[0]
1914 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1918 movdqa @XMM[0], 0x20(%rbp) # save counter
# Fan the byte-swapped counter out into 8 lanes, adding 1..7 from the
# .LADD1...LADD7 table so each lane holds a successive counter value.
1919 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1920 movdqa @XMM[0], @XMM[2]
1921 paddd 0x00(%r11), @XMM[1] # .LADD1
1922 movdqa @XMM[0], @XMM[3]
1923 paddd 0x10(%r11), @XMM[2] # .LADD2
1924 movdqa @XMM[0], @XMM[4]
1925 paddd 0x20(%r11), @XMM[3] # .LADD3
1926 movdqa @XMM[0], @XMM[5]
1927 paddd 0x30(%r11), @XMM[4] # .LADD4
1928 movdqa @XMM[0], @XMM[6]
1929 paddd 0x40(%r11), @XMM[5] # .LADD5
1930 movdqa @XMM[0], @XMM[7]
1931 paddd 0x50(%r11), @XMM[6] # .LADD6
1932 paddd 0x60(%r11), @XMM[7] # .LADD7
1934 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1935 # to flip byte order in 32-bit counter
1936 movdqa (%rsp), @XMM[9] # round 0 key
1937 lea 0x10(%rsp), %rax # pass key schedule
1938 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1939 pxor @XMM[9], @XMM[0] # xor with round0 key
1940 pxor @XMM[9], @XMM[1]
1941 pxor @XMM[9], @XMM[2]
1942 pxor @XMM[9], @XMM[3]
1943 pshufb @XMM[8], @XMM[0]
1944 pshufb @XMM[8], @XMM[1]
1945 pxor @XMM[9], @XMM[4]
1946 pxor @XMM[9], @XMM[5]
1947 pshufb @XMM[8], @XMM[2]
1948 pshufb @XMM[8], @XMM[3]
1949 pxor @XMM[9], @XMM[6]
1950 pxor @XMM[9], @XMM[7]
1951 pshufb @XMM[8], @XMM[4]
1952 pshufb @XMM[8], @XMM[5]
1953 pshufb @XMM[8], @XMM[6]
1954 pshufb @XMM[8], @XMM[7]
1955 lea .LBS0(%rip), %r11 # constants table
1956 mov %ebx,%r10d # pass rounds
1958 call _bsaes_encrypt8_bitslice
1961 jc .Lctr_enc_loop_done
# Full 8-block iteration: XOR the keystream (note _bsaes_encrypt8's
# permuted output order 0,1,4,6,3,7,2,5) into the input and advance.
1963 movdqu 0x00($inp), @XMM[8] # load input
1964 movdqu 0x10($inp), @XMM[9]
1965 movdqu 0x20($inp), @XMM[10]
1966 movdqu 0x30($inp), @XMM[11]
1967 movdqu 0x40($inp), @XMM[12]
1968 movdqu 0x50($inp), @XMM[13]
1969 movdqu 0x60($inp), @XMM[14]
1970 movdqu 0x70($inp), @XMM[15]
1972 pxor @XMM[0], @XMM[8]
1973 movdqa 0x20(%rbp), @XMM[0] # load counter
1974 pxor @XMM[9], @XMM[1]
1975 movdqu @XMM[8], 0x00($out) # write output
1976 pxor @XMM[10], @XMM[4]
1977 movdqu @XMM[1], 0x10($out)
1978 pxor @XMM[11], @XMM[6]
1979 movdqu @XMM[4], 0x20($out)
1980 pxor @XMM[12], @XMM[3]
1981 movdqu @XMM[6], 0x30($out)
1982 pxor @XMM[13], @XMM[7]
1983 movdqu @XMM[3], 0x40($out)
1984 pxor @XMM[14], @XMM[2]
1985 movdqu @XMM[7], 0x50($out)
1986 pxor @XMM[15], @XMM[5]
1987 movdqu @XMM[2], 0x60($out)
1988 lea .LADD1(%rip), %r11
1989 movdqu @XMM[5], 0x70($out)
1990 lea 0x80($out), $out
1991 paddd 0x70(%r11), @XMM[0] # .LADD8
1996 .Lctr_enc_loop_done:
# Tail: peel off however many of the 8 keystream blocks are still needed,
# one 16-byte block per step (fall-through structure; branches elided in
# this extraction).
1998 movdqu 0x00($inp), @XMM[8] # load input
1999 pxor @XMM[8], @XMM[0]
2000 movdqu @XMM[0], 0x00($out) # write output
2003 movdqu 0x10($inp), @XMM[9]
2004 pxor @XMM[9], @XMM[1]
2005 movdqu @XMM[1], 0x10($out)
2007 movdqu 0x20($inp), @XMM[10]
2008 pxor @XMM[10], @XMM[4]
2009 movdqu @XMM[4], 0x20($out)
2012 movdqu 0x30($inp), @XMM[11]
2013 pxor @XMM[11], @XMM[6]
2014 movdqu @XMM[6], 0x30($out)
2016 movdqu 0x40($inp), @XMM[12]
2017 pxor @XMM[12], @XMM[3]
2018 movdqu @XMM[3], 0x40($out)
2021 movdqu 0x50($inp), @XMM[13]
2022 pxor @XMM[13], @XMM[7]
2023 movdqu @XMM[7], 0x50($out)
2025 movdqu 0x60($inp), @XMM[14]
2026 pxor @XMM[14], @XMM[2]
2027 movdqu @XMM[2], 0x60($out)
# Short path: encrypt the counter block at 0x20(%rbp) with the scalar
# AES routine (output at 0x30(%rbp)) and XOR it into one input block.
2032 lea 0x20(%rbp), $arg1
2033 lea 0x30(%rbp), $arg2
2035 call asm_AES_encrypt
2036 movdqu ($inp), @XMM[1]
2038 mov 0x2c(%rbp), %eax # load 32-bit counter
2040 pxor 0x30(%rbp), @XMM[1]
2041 inc %eax # increment
2042 movdqu @XMM[1], ($out)
2045 mov %eax, 0x2c(%rbp) # save 32-bit counter
# ^ BUGFIX: was "0x2c(%rsp)".  The counter is kept at 0x2c(%rbp) (see the
#   load above and the copy at line 1892); %rsp points into the bit-sliced
#   key schedule here, so the old store both lost the increment and
#   corrupted the key schedule.
2052 .Lctr_enc_bzero: # wipe key schedule [if any]
2053 movdqa %xmm0, 0x00(%rax)
2054 movdqa %xmm0, 0x10(%rax)
2055 lea 0x20(%rax), %rax
2059 lea (%rbp),%rsp # restore %rsp
# Epilogue: restore Win64 callee-saved xmm registers and GPRs.
2061 $code.=<<___ if ($win64);
2062 movaps 0x40(%rbp), %xmm6
2063 movaps 0x50(%rbp), %xmm7
2064 movaps 0x60(%rbp), %xmm8
2065 movaps 0x70(%rbp), %xmm9
2066 movaps 0x80(%rbp), %xmm10
2067 movaps 0x90(%rbp), %xmm11
2068 movaps 0xa0(%rbp), %xmm12
2069 movaps 0xb0(%rbp), %xmm13
2070 movaps 0xc0(%rbp), %xmm14
2071 movaps 0xd0(%rbp), %xmm15
2072 lea 0xa0(%rbp), %rsp
2075 mov 0x48(%rsp), %r15
2076 mov 0x50(%rsp), %r14
2077 mov 0x58(%rsp), %r13
2078 mov 0x60(%rsp), %r12
2079 mov 0x68(%rsp), %rbx
2080 mov 0x70(%rsp), %rax
2081 lea 0x78(%rsp), %rsp
2085 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2087 ######################################################################
2088 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2089 # const AES_KEY *key1, const AES_KEY *key2,
2090 # const unsigned char iv[16]);
# XTS tweak-computation scratch registers shared by the encrypt and
# decrypt routines below (xmm13..xmm15).
2092 my ($twmask,$twres,$twtmp)=@XMM[13..15];
# void bsaes_xts_encrypt(const char *inp, char *out, size_t len,
#                        const AES_KEY *key1, const AES_KEY *key2,
#                        const unsigned char iv[16]);
# Bit-sliced AES-XTS encryption: 8 blocks per main-loop iteration, with
# shorter fall-through paths for 7..1 remaining blocks and ciphertext
# stealing for a trailing partial block.
2096 .globl bsaes_xts_encrypt
2097 .type bsaes_xts_encrypt,\@abi-omnipotent
2108 lea -0x48(%rsp), %rsp
# Win64 ABI: arguments 5 and 6 arrive on the stack; xmm6-15 are spilled.
2110 $code.=<<___ if ($win64);
2111 mov 0xa0(%rsp),$arg5 # pull key2
2112 mov 0xa8(%rsp),$arg6 # pull ivp
2113 lea -0xa0(%rsp), %rsp
2114 movaps %xmm6, 0x40(%rsp)
2115 movaps %xmm7, 0x50(%rsp)
2116 movaps %xmm8, 0x60(%rsp)
2117 movaps %xmm9, 0x70(%rsp)
2118 movaps %xmm10, 0x80(%rsp)
2119 movaps %xmm11, 0x90(%rsp)
2120 movaps %xmm12, 0xa0(%rsp)
2121 movaps %xmm13, 0xb0(%rsp)
2122 movaps %xmm14, 0xc0(%rsp)
2123 movaps %xmm15, 0xd0(%rsp)
2127 mov %rsp, %rbp # backup %rsp
2128 mov $arg1, $inp # backup arguments
# Encrypt the IV with key2 to obtain the initial tweak at 0x20(%rbp).
2134 lea 0x20(%rbp), $arg2
2136 call asm_AES_encrypt # generate initial tweak
2138 mov 240($key), %eax # rounds
2139 mov $len, %rbx # backup $len
2141 mov %eax, %edx # rounds
2142 shl \$7, %rax # 128 bytes per inner round key
2143 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert key1 to bit-sliced form on the stack.
2146 mov %rsp, %rax # pass key schedule
2147 mov $key, %rcx # pass key
2148 mov %edx, %r10d # pass rounds
2149 call _bsaes_key_convert
2150 pxor %xmm6, %xmm7 # fix up last round key
2151 movdqa %xmm7, (%rax) # save last round key
2154 sub \$0x80, %rsp # place for tweak[8]
2155 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# GF(2^128) tweak doubling uses .Lxts_magic (the x^128+x^7+x^2+x+1
# reduction constant) with pcmpgtd/pshufd to propagate the carry.
2158 movdqa .Lxts_magic(%rip), $twmask
2159 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Generate tweak[0..7], save each to the stack, and load/XOR the inputs
# as they become available (loop unrolled at script-generation time).
2168 for ($i=0;$i<7;$i++) {
2170 pshufd \$0x13, $twtmp, $twres
2172 movdqa @XMM[7], @XMM[$i]
2173 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2174 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2175 pand $twmask, $twres # isolate carry and residue
2176 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2177 pxor $twres, @XMM[7]
2179 $code.=<<___ if ($i>=1);
2180 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2182 $code.=<<___ if ($i>=2);
2183 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2187 movdqu 0x60($inp), @XMM[8+6]
2188 pxor @XMM[8+5], @XMM[5]
2189 movdqu 0x70($inp), @XMM[8+7]
2190 lea 0x80($inp), $inp
2191 movdqa @XMM[7], 0x70(%rsp)
2192 pxor @XMM[8+6], @XMM[6]
2193 lea 0x80(%rsp), %rax # pass key schedule
2194 pxor @XMM[8+7], @XMM[7]
2195 mov %edx, %r10d # pass rounds
2197 call _bsaes_encrypt8
# XOR the saved tweaks back in (note _bsaes_encrypt8's permuted output
# register order 0,1,4,6,3,7,2,5) and store 8 blocks of output.
2199 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2200 pxor 0x10(%rsp), @XMM[1]
2201 movdqu @XMM[0], 0x00($out) # write output
2202 pxor 0x20(%rsp), @XMM[4]
2203 movdqu @XMM[1], 0x10($out)
2204 pxor 0x30(%rsp), @XMM[6]
2205 movdqu @XMM[4], 0x20($out)
2206 pxor 0x40(%rsp), @XMM[3]
2207 movdqu @XMM[6], 0x30($out)
2208 pxor 0x50(%rsp), @XMM[7]
2209 movdqu @XMM[3], 0x40($out)
2210 pxor 0x60(%rsp), @XMM[2]
2211 movdqu @XMM[7], 0x50($out)
2212 pxor 0x70(%rsp), @XMM[5]
2213 movdqu @XMM[2], 0x60($out)
2214 movdqu @XMM[5], 0x70($out)
2215 lea 0x80($out), $out
2217 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2219 movdqa .Lxts_magic(%rip), $twmask
2220 pcmpgtd @XMM[7], $twtmp
2221 pshufd \$0x13, $twtmp, $twres
2223 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2224 pand $twmask, $twres # isolate carry and residue
2225 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2226 pxor $twres, @XMM[7]
# Short path for <8 remaining full blocks: same tweak generation, but
# with a length check per block (cmp lines survive in the extraction).
2235 for ($i=0;$i<7;$i++) {
2237 pshufd \$0x13, $twtmp, $twres
2239 movdqa @XMM[7], @XMM[$i]
2240 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2241 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2242 pand $twmask, $twres # isolate carry and residue
2243 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2244 pxor $twres, @XMM[7]
2246 $code.=<<___ if ($i>=1);
2247 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2248 cmp \$`0x10*$i`,$len
2251 $code.=<<___ if ($i>=2);
2252 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block tail.
2256 movdqu 0x60($inp), @XMM[8+6]
2257 pxor @XMM[8+5], @XMM[5]
2258 movdqa @XMM[7], 0x70(%rsp)
2259 lea 0x70($inp), $inp
2260 pxor @XMM[8+6], @XMM[6]
2261 lea 0x80(%rsp), %rax # pass key schedule
2262 mov %edx, %r10d # pass rounds
2264 call _bsaes_encrypt8
2266 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2267 pxor 0x10(%rsp), @XMM[1]
2268 movdqu @XMM[0], 0x00($out) # write output
2269 pxor 0x20(%rsp), @XMM[4]
2270 movdqu @XMM[1], 0x10($out)
2271 pxor 0x30(%rsp), @XMM[6]
2272 movdqu @XMM[4], 0x20($out)
2273 pxor 0x40(%rsp), @XMM[3]
2274 movdqu @XMM[6], 0x30($out)
2275 pxor 0x50(%rsp), @XMM[7]
2276 movdqu @XMM[3], 0x40($out)
2277 pxor 0x60(%rsp), @XMM[2]
2278 movdqu @XMM[7], 0x50($out)
2279 movdqu @XMM[2], 0x60($out)
2280 lea 0x70($out), $out
2282 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block tail.
2286 pxor @XMM[8+4], @XMM[4]
2287 lea 0x60($inp), $inp
2288 pxor @XMM[8+5], @XMM[5]
2289 lea 0x80(%rsp), %rax # pass key schedule
2290 mov %edx, %r10d # pass rounds
2292 call _bsaes_encrypt8
2294 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2295 pxor 0x10(%rsp), @XMM[1]
2296 movdqu @XMM[0], 0x00($out) # write output
2297 pxor 0x20(%rsp), @XMM[4]
2298 movdqu @XMM[1], 0x10($out)
2299 pxor 0x30(%rsp), @XMM[6]
2300 movdqu @XMM[4], 0x20($out)
2301 pxor 0x40(%rsp), @XMM[3]
2302 movdqu @XMM[6], 0x30($out)
2303 pxor 0x50(%rsp), @XMM[7]
2304 movdqu @XMM[3], 0x40($out)
2305 movdqu @XMM[7], 0x50($out)
2306 lea 0x60($out), $out
2308 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block tail.
2312 pxor @XMM[8+3], @XMM[3]
2313 lea 0x50($inp), $inp
2314 pxor @XMM[8+4], @XMM[4]
2315 lea 0x80(%rsp), %rax # pass key schedule
2316 mov %edx, %r10d # pass rounds
2318 call _bsaes_encrypt8
2320 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2321 pxor 0x10(%rsp), @XMM[1]
2322 movdqu @XMM[0], 0x00($out) # write output
2323 pxor 0x20(%rsp), @XMM[4]
2324 movdqu @XMM[1], 0x10($out)
2325 pxor 0x30(%rsp), @XMM[6]
2326 movdqu @XMM[4], 0x20($out)
2327 pxor 0x40(%rsp), @XMM[3]
2328 movdqu @XMM[6], 0x30($out)
2329 movdqu @XMM[3], 0x40($out)
2330 lea 0x50($out), $out
2332 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block tail.
2336 pxor @XMM[8+2], @XMM[2]
2337 lea 0x40($inp), $inp
2338 pxor @XMM[8+3], @XMM[3]
2339 lea 0x80(%rsp), %rax # pass key schedule
2340 mov %edx, %r10d # pass rounds
2342 call _bsaes_encrypt8
2344 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2345 pxor 0x10(%rsp), @XMM[1]
2346 movdqu @XMM[0], 0x00($out) # write output
2347 pxor 0x20(%rsp), @XMM[4]
2348 movdqu @XMM[1], 0x10($out)
2349 pxor 0x30(%rsp), @XMM[6]
2350 movdqu @XMM[4], 0x20($out)
2351 movdqu @XMM[6], 0x30($out)
2352 lea 0x40($out), $out
2354 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block tail.
2358 pxor @XMM[8+1], @XMM[1]
2359 lea 0x30($inp), $inp
2360 pxor @XMM[8+2], @XMM[2]
2361 lea 0x80(%rsp), %rax # pass key schedule
2362 mov %edx, %r10d # pass rounds
2364 call _bsaes_encrypt8
2366 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2367 pxor 0x10(%rsp), @XMM[1]
2368 movdqu @XMM[0], 0x00($out) # write output
2369 pxor 0x20(%rsp), @XMM[4]
2370 movdqu @XMM[1], 0x10($out)
2371 movdqu @XMM[4], 0x20($out)
2372 lea 0x30($out), $out
2374 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block tail.
2378 pxor @XMM[8+0], @XMM[0]
2379 lea 0x20($inp), $inp
2380 pxor @XMM[8+1], @XMM[1]
2381 lea 0x80(%rsp), %rax # pass key schedule
2382 mov %edx, %r10d # pass rounds
2384 call _bsaes_encrypt8
2386 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2387 pxor 0x10(%rsp), @XMM[1]
2388 movdqu @XMM[0], 0x00($out) # write output
2389 movdqu @XMM[1], 0x10($out)
2390 lea 0x20($out), $out
2392 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block tail: done with the scalar AES routine via the 0x20(%rbp)
# bounce buffer (the commented-out alternative used _bsaes_encrypt8).
2396 pxor @XMM[0], @XMM[8]
2397 lea 0x10($inp), $inp
2398 movdqa @XMM[8], 0x20(%rbp)
2399 lea 0x20(%rbp), $arg1
2400 lea 0x20(%rbp), $arg2
2402 call asm_AES_encrypt # doesn't touch %xmm
2403 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2404 #pxor @XMM[8], @XMM[0]
2405 #lea 0x80(%rsp), %rax # pass key schedule
2406 #mov %edx, %r10d # pass rounds
2407 #call _bsaes_encrypt8
2408 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2409 movdqu @XMM[0], 0x00($out) # write output
2410 lea 0x10($out), $out
2412 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing for a trailing partial block (steal-loop byte copy).
# NOTE(review): sibling steal loops in this file address through $out;
# confirm %rdx aliases the intended pointer here — this extraction has
# surrounding lines elided. TODO verify against the full source.
2421 movzb -16(%rdx), %ecx
# Re-encrypt the stolen block with the last tweak and store it back.
2429 movdqu -16($out), @XMM[0]
2430 lea 0x20(%rbp), $arg1
2431 pxor @XMM[7], @XMM[0]
2432 lea 0x20(%rbp), $arg2
2433 movdqa @XMM[0], 0x20(%rbp)
2435 call asm_AES_encrypt # doesn't touch %xmm
2436 pxor 0x20(%rbp), @XMM[7]
2437 movdqu @XMM[7], -16($out)
2442 .Lxts_enc_bzero: # wipe key schedule [if any]
2443 movdqa %xmm0, 0x00(%rax)
2444 movdqa %xmm0, 0x10(%rax)
2445 lea 0x20(%rax), %rax
2449 lea (%rbp),%rsp # restore %rsp
# Epilogue: restore Win64 callee-saved xmm registers and GPRs.
2451 $code.=<<___ if ($win64);
2452 movaps 0x40(%rbp), %xmm6
2453 movaps 0x50(%rbp), %xmm7
2454 movaps 0x60(%rbp), %xmm8
2455 movaps 0x70(%rbp), %xmm9
2456 movaps 0x80(%rbp), %xmm10
2457 movaps 0x90(%rbp), %xmm11
2458 movaps 0xa0(%rbp), %xmm12
2459 movaps 0xb0(%rbp), %xmm13
2460 movaps 0xc0(%rbp), %xmm14
2461 movaps 0xd0(%rbp), %xmm15
2462 lea 0xa0(%rbp), %rsp
2465 mov 0x48(%rsp), %r15
2466 mov 0x50(%rsp), %r14
2467 mov 0x58(%rsp), %r13
2468 mov 0x60(%rsp), %r12
2469 mov 0x68(%rsp), %rbx
2470 mov 0x70(%rsp), %rax
2471 lea 0x78(%rsp), %rsp
2475 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
# void bsaes_xts_decrypt(const char *inp, char *out, size_t len,
#                        const AES_KEY *key1, const AES_KEY *key2,
#                        const unsigned char iv[16]);
# Bit-sliced AES-XTS decryption: mirror of bsaes_xts_encrypt, but note the
# different _bsaes_decrypt8 output register order (0,1,6,4,2,7,3,5) and the
# extra final-tweak handling needed for ciphertext stealing on decrypt.
2477 .globl bsaes_xts_decrypt
2478 .type bsaes_xts_decrypt,\@abi-omnipotent
2489 lea -0x48(%rsp), %rsp
# Win64 ABI: arguments 5 and 6 arrive on the stack; xmm6-15 are spilled.
2491 $code.=<<___ if ($win64);
2492 mov 0xa0(%rsp),$arg5 # pull key2
2493 mov 0xa8(%rsp),$arg6 # pull ivp
2494 lea -0xa0(%rsp), %rsp
2495 movaps %xmm6, 0x40(%rsp)
2496 movaps %xmm7, 0x50(%rsp)
2497 movaps %xmm8, 0x60(%rsp)
2498 movaps %xmm9, 0x70(%rsp)
2499 movaps %xmm10, 0x80(%rsp)
2500 movaps %xmm11, 0x90(%rsp)
2501 movaps %xmm12, 0xa0(%rsp)
2502 movaps %xmm13, 0xb0(%rsp)
2503 movaps %xmm14, 0xc0(%rsp)
2504 movaps %xmm15, 0xd0(%rsp)
2508 mov %rsp, %rbp # backup %rsp
2509 mov $arg1, $inp # backup arguments
# The tweak is always generated by ENcrypting the IV with key2, even for
# decryption.
2515 lea 0x20(%rbp), $arg2
2517 call asm_AES_encrypt # generate initial tweak
2519 mov 240($key), %eax # rounds
2520 mov $len, %rbx # backup $len
2522 mov %eax, %edx # rounds
2523 shl \$7, %rax # 128 bytes per inner round key
2524 sub \$`128-32`, %rax # size of bit-sliced key schedule
# Convert key1; decrypt-direction fixup differs from encrypt: round-0 key
# is adjusted in place and the unmodified last key goes to the end.
2527 mov %rsp, %rax # pass key schedule
2528 mov $key, %rcx # pass key
2529 mov %edx, %r10d # pass rounds
2530 call _bsaes_key_convert
2531 pxor (%rsp), %xmm7 # fix up round 0 key
2532 movdqa %xmm6, (%rax) # save last round key
2533 movdqa %xmm7, (%rsp)
# For a partial trailing block, one full block is reserved for the
# ciphertext-stealing epilogue.
2535 xor %eax, %eax # if ($len%16) len-=16;
2542 sub \$0x80, %rsp # place for tweak[8]
2543 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# GF(2^128) tweak doubling, as in the encrypt path.
2546 movdqa .Lxts_magic(%rip), $twmask
2547 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Generate tweak[0..7] and load/XOR inputs (loop unrolled at
# script-generation time).
2556 for ($i=0;$i<7;$i++) {
2558 pshufd \$0x13, $twtmp, $twres
2560 movdqa @XMM[7], @XMM[$i]
2561 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2562 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2563 pand $twmask, $twres # isolate carry and residue
2564 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2565 pxor $twres, @XMM[7]
2567 $code.=<<___ if ($i>=1);
2568 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2570 $code.=<<___ if ($i>=2);
2571 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2575 movdqu 0x60($inp), @XMM[8+6]
2576 pxor @XMM[8+5], @XMM[5]
2577 movdqu 0x70($inp), @XMM[8+7]
2578 lea 0x80($inp), $inp
2579 movdqa @XMM[7], 0x70(%rsp)
2580 pxor @XMM[8+6], @XMM[6]
2581 lea 0x80(%rsp), %rax # pass key schedule
2582 pxor @XMM[8+7], @XMM[7]
2583 mov %edx, %r10d # pass rounds
2585 call _bsaes_decrypt8
# XOR the saved tweaks back in, in _bsaes_decrypt8's output register
# order (0,1,6,4,2,7,3,5), and store 8 blocks of plaintext.
2587 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2588 pxor 0x10(%rsp), @XMM[1]
2589 movdqu @XMM[0], 0x00($out) # write output
2590 pxor 0x20(%rsp), @XMM[6]
2591 movdqu @XMM[1], 0x10($out)
2592 pxor 0x30(%rsp), @XMM[4]
2593 movdqu @XMM[6], 0x20($out)
2594 pxor 0x40(%rsp), @XMM[2]
2595 movdqu @XMM[4], 0x30($out)
2596 pxor 0x50(%rsp), @XMM[7]
2597 movdqu @XMM[2], 0x40($out)
2598 pxor 0x60(%rsp), @XMM[3]
2599 movdqu @XMM[7], 0x50($out)
2600 pxor 0x70(%rsp), @XMM[5]
2601 movdqu @XMM[3], 0x60($out)
2602 movdqu @XMM[5], 0x70($out)
2603 lea 0x80($out), $out
2605 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2607 movdqa .Lxts_magic(%rip), $twmask
2608 pcmpgtd @XMM[7], $twtmp
2609 pshufd \$0x13, $twtmp, $twres
2611 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2612 pand $twmask, $twres # isolate carry and residue
2613 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2614 pxor $twres, @XMM[7]
# Short path for <8 remaining full blocks: tweak generation with a length
# check per block.
2623 for ($i=0;$i<7;$i++) {
2625 pshufd \$0x13, $twtmp, $twres
2627 movdqa @XMM[7], @XMM[$i]
2628 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2629 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2630 pand $twmask, $twres # isolate carry and residue
2631 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2632 pxor $twres, @XMM[7]
2634 $code.=<<___ if ($i>=1);
2635 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2636 cmp \$`0x10*$i`,$len
2639 $code.=<<___ if ($i>=2);
2640 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block tail.
2644 movdqu 0x60($inp), @XMM[8+6]
2645 pxor @XMM[8+5], @XMM[5]
2646 movdqa @XMM[7], 0x70(%rsp)
2647 lea 0x70($inp), $inp
2648 pxor @XMM[8+6], @XMM[6]
2649 lea 0x80(%rsp), %rax # pass key schedule
2650 mov %edx, %r10d # pass rounds
2652 call _bsaes_decrypt8
2654 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2655 pxor 0x10(%rsp), @XMM[1]
2656 movdqu @XMM[0], 0x00($out) # write output
2657 pxor 0x20(%rsp), @XMM[6]
2658 movdqu @XMM[1], 0x10($out)
2659 pxor 0x30(%rsp), @XMM[4]
2660 movdqu @XMM[6], 0x20($out)
2661 pxor 0x40(%rsp), @XMM[2]
2662 movdqu @XMM[4], 0x30($out)
2663 pxor 0x50(%rsp), @XMM[7]
2664 movdqu @XMM[2], 0x40($out)
2665 pxor 0x60(%rsp), @XMM[3]
2666 movdqu @XMM[7], 0x50($out)
2667 movdqu @XMM[3], 0x60($out)
2668 lea 0x70($out), $out
2670 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# 6-block tail.
2674 pxor @XMM[8+4], @XMM[4]
2675 lea 0x60($inp), $inp
2676 pxor @XMM[8+5], @XMM[5]
2677 lea 0x80(%rsp), %rax # pass key schedule
2678 mov %edx, %r10d # pass rounds
2680 call _bsaes_decrypt8
2682 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2683 pxor 0x10(%rsp), @XMM[1]
2684 movdqu @XMM[0], 0x00($out) # write output
2685 pxor 0x20(%rsp), @XMM[6]
2686 movdqu @XMM[1], 0x10($out)
2687 pxor 0x30(%rsp), @XMM[4]
2688 movdqu @XMM[6], 0x20($out)
2689 pxor 0x40(%rsp), @XMM[2]
2690 movdqu @XMM[4], 0x30($out)
2691 pxor 0x50(%rsp), @XMM[7]
2692 movdqu @XMM[2], 0x40($out)
2693 movdqu @XMM[7], 0x50($out)
2694 lea 0x60($out), $out
2696 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# 5-block tail.
2700 pxor @XMM[8+3], @XMM[3]
2701 lea 0x50($inp), $inp
2702 pxor @XMM[8+4], @XMM[4]
2703 lea 0x80(%rsp), %rax # pass key schedule
2704 mov %edx, %r10d # pass rounds
2706 call _bsaes_decrypt8
2708 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2709 pxor 0x10(%rsp), @XMM[1]
2710 movdqu @XMM[0], 0x00($out) # write output
2711 pxor 0x20(%rsp), @XMM[6]
2712 movdqu @XMM[1], 0x10($out)
2713 pxor 0x30(%rsp), @XMM[4]
2714 movdqu @XMM[6], 0x20($out)
2715 pxor 0x40(%rsp), @XMM[2]
2716 movdqu @XMM[4], 0x30($out)
2717 movdqu @XMM[2], 0x40($out)
2718 lea 0x50($out), $out
2720 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# 4-block tail.
2724 pxor @XMM[8+2], @XMM[2]
2725 lea 0x40($inp), $inp
2726 pxor @XMM[8+3], @XMM[3]
2727 lea 0x80(%rsp), %rax # pass key schedule
2728 mov %edx, %r10d # pass rounds
2730 call _bsaes_decrypt8
2732 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2733 pxor 0x10(%rsp), @XMM[1]
2734 movdqu @XMM[0], 0x00($out) # write output
2735 pxor 0x20(%rsp), @XMM[6]
2736 movdqu @XMM[1], 0x10($out)
2737 pxor 0x30(%rsp), @XMM[4]
2738 movdqu @XMM[6], 0x20($out)
2739 movdqu @XMM[4], 0x30($out)
2740 lea 0x40($out), $out
2742 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# 3-block tail.
2746 pxor @XMM[8+1], @XMM[1]
2747 lea 0x30($inp), $inp
2748 pxor @XMM[8+2], @XMM[2]
2749 lea 0x80(%rsp), %rax # pass key schedule
2750 mov %edx, %r10d # pass rounds
2752 call _bsaes_decrypt8
2754 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2755 pxor 0x10(%rsp), @XMM[1]
2756 movdqu @XMM[0], 0x00($out) # write output
2757 pxor 0x20(%rsp), @XMM[6]
2758 movdqu @XMM[1], 0x10($out)
2759 movdqu @XMM[6], 0x20($out)
2760 lea 0x30($out), $out
2762 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# 2-block tail.
2766 pxor @XMM[8+0], @XMM[0]
2767 lea 0x20($inp), $inp
2768 pxor @XMM[8+1], @XMM[1]
2769 lea 0x80(%rsp), %rax # pass key schedule
2770 mov %edx, %r10d # pass rounds
2772 call _bsaes_decrypt8
2774 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2775 pxor 0x10(%rsp), @XMM[1]
2776 movdqu @XMM[0], 0x00($out) # write output
2777 movdqu @XMM[1], 0x10($out)
2778 lea 0x20($out), $out
2780 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# 1-block tail via the scalar AES routine and the 0x20(%rbp) bounce
# buffer (commented-out alternative used _bsaes_decrypt8).
2784 pxor @XMM[0], @XMM[8]
2785 lea 0x10($inp), $inp
2786 movdqa @XMM[8], 0x20(%rbp)
2787 lea 0x20(%rbp), $arg1
2788 lea 0x20(%rbp), $arg2
2790 call asm_AES_decrypt # doesn't touch %xmm
2791 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2792 #pxor @XMM[8], @XMM[0]
2793 #lea 0x80(%rsp), %rax # pass key schedule
2794 #mov %edx, %r10d # pass rounds
2795 #call _bsaes_decrypt8
2796 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2797 movdqu @XMM[0], 0x00($out) # write output
2798 lea 0x10($out), $out
2800 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing on decrypt: the second-to-last block must be
# decrypted with the NEXT tweak (computed here into @XMM[7], keeping the
# current one in @XMM[6] for the final partial block).
2807 movdqa .Lxts_magic(%rip), $twmask
2808 pcmpgtd @XMM[7], $twtmp
2809 pshufd \$0x13, $twtmp, $twres
2810 movdqa @XMM[7], @XMM[6]
2811 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2812 pand $twmask, $twres # isolate carry and residue
2813 movdqu ($inp), @XMM[0]
2814 pxor $twres, @XMM[7]
2816 lea 0x20(%rbp), $arg1
2817 pxor @XMM[7], @XMM[0]
2818 lea 0x20(%rbp), $arg2
2819 movdqa @XMM[0], 0x20(%rbp)
2821 call asm_AES_decrypt # doesn't touch %xmm
2822 pxor 0x20(%rbp), @XMM[7]
2824 movdqu @XMM[7], ($out)
# Steal-loop byte copy between the partial tail and the stolen block.
2827 movzb 16($inp), %eax
# Decrypt the stolen block with the held-back tweak (@XMM[6]).
2836 movdqu ($out), @XMM[0]
2837 lea 0x20(%rbp), $arg1
2838 pxor @XMM[6], @XMM[0]
2839 lea 0x20(%rbp), $arg2
2840 movdqa @XMM[0], 0x20(%rbp)
2842 call asm_AES_decrypt # doesn't touch %xmm
2843 pxor 0x20(%rbp), @XMM[6]
2844 movdqu @XMM[6], ($out)
2849 .Lxts_dec_bzero: # wipe key schedule [if any]
2850 movdqa %xmm0, 0x00(%rax)
2851 movdqa %xmm0, 0x10(%rax)
2852 lea 0x20(%rax), %rax
2856 lea (%rbp),%rsp # restore %rsp
# Epilogue: restore Win64 callee-saved xmm registers and GPRs.
2858 $code.=<<___ if ($win64);
2859 movaps 0x40(%rbp), %xmm6
2860 movaps 0x50(%rbp), %xmm7
2861 movaps 0x60(%rbp), %xmm8
2862 movaps 0x70(%rbp), %xmm9
2863 movaps 0x80(%rbp), %xmm10
2864 movaps 0x90(%rbp), %xmm11
2865 movaps 0xa0(%rbp), %xmm12
2866 movaps 0xb0(%rbp), %xmm13
2867 movaps 0xc0(%rbp), %xmm14
2868 movaps 0xd0(%rbp), %xmm15
2869 lea 0xa0(%rbp), %rsp
2872 mov 0x48(%rsp), %r15
2873 mov 0x50(%rsp), %r14
2874 mov 0x58(%rsp), %r13
2875 mov 0x60(%rsp), %r12
2876 mov 0x68(%rsp), %rbx
2877 mov 0x70(%rsp), %rax
2878 lea 0x78(%rsp), %rsp
2882 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Read-only constant pool used by the bit-sliced routines: shuffle masks
# for (Inv)ShiftRows and byte-swapping, bit-slice masks, and CTR
# increment vectors.  NOTE(review): several label lines between entries
# were elided in this extraction — the unlabeled .quad pairs below belong
# to labels not visible here; do not infer their names.
2886 .type _bsaes_const,\@object
2889 .LM0ISR: # InvShiftRows constants
2890 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2892 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2894 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2895 .LBS0: # bit-slice constants
2896 .quad 0x5555555555555555, 0x5555555555555555
2898 .quad 0x3333333333333333, 0x3333333333333333
2900 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2901 .LSR: # shiftrows constants
2902 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2904 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2906 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2907 .LSWPUP: # byte-swap upper dword
2908 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2910 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2911 .LADD1: # counter increment constants
2912 .quad 0x0000000000000000, 0x0000000100000000
2914 .quad 0x0000000000000000, 0x0000000200000000
2916 .quad 0x0000000000000000, 0x0000000300000000
2918 .quad 0x0000000000000000, 0x0000000400000000
2920 .quad 0x0000000000000000, 0x0000000500000000
2922 .quad 0x0000000000000000, 0x0000000600000000
2924 .quad 0x0000000000000000, 0x0000000700000000
2926 .quad 0x0000000000000000, 0x0000000800000000
# Per-bit masks (0x01/0x02/0x04/0x08 repeated) — label elided above.
2930 .quad 0x0101010101010101, 0x0101010101010101
2931 .quad 0x0202020202020202, 0x0202020202020202
2932 .quad 0x0404040404040404, 0x0404040404040404
2933 .quad 0x0808080808080808, 0x0808080808080808
2935 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 repeated: the AES S-box affine constant, used in key conversion.
2937 .quad 0x6363636363636363, 0x6363636363636363
2938 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2940 .size _bsaes_const,.-_bsaes_const
# Win64 structured-exception handler shared by all entry points: when the
# fault lies between a routine's prologue and epilogue labels (taken from
# HandlerData[0..1]), it restores the callee-saved xmm and GPR state from
# the routine's frame so unwinding can proceed, then hands off to
# RtlVirtualUnwind.
2943 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2944 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2952 .extern __imp_RtlVirtualUnwind
2953 .type se_handler,\@abi-omnipotent
2967 mov 120($context),%rax # pull context->Rax
2968 mov 248($context),%rbx # pull context->Rip
2970 mov 8($disp),%rsi # disp->ImageBase
2971 mov 56($disp),%r11 # disp->HandlerData
# Before the prologue or past the epilogue the frame is not yet/no longer
# established, so no register restoration is needed (branches elided in
# this extraction).
2973 mov 0(%r11),%r10d # HandlerData[0]
2974 lea (%rsi,%r10),%r10 # prologue label
2975 cmp %r10,%rbx # context->Rip<prologue label
2978 mov 152($context),%rax # pull context->Rsp
2980 mov 4(%r11),%r10d # HandlerData[1]
2981 lea (%rsi,%r10),%r10 # epilogue label
2982 cmp %r10,%rbx # context->Rip>=epilogue label
2985 mov 160($context),%rax # pull context->Rbp
# In-body fault: copy xmm6-15 from the frame's save area back into the
# CONTEXT record, then recover the GPRs from above the frame.
2987 lea 0x40(%rax),%rsi # %xmm save area
2988 lea 512($context),%rdi # &context.Xmm6
2989 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2990 .long 0xa548f3fc # cld; rep movsq
2991 lea 0xa0(%rax),%rax # adjust stack pointer
2999 lea 0x78(%rax),%rax # adjust stack pointer
3000 mov %rbx,144($context) # restore context->Rbx
3001 mov %rbp,160($context) # restore context->Rbp
3002 mov %r12,216($context) # restore context->R12
3003 mov %r13,224($context) # restore context->R13
3004 mov %r14,232($context) # restore context->R14
3005 mov %r15,240($context) # restore context->R15
3008 mov %rax,152($context) # restore context->Rsp
# Copy the exception's ContextRecord over the current one, then call
# RtlVirtualUnwind with the standard eight-argument shadow-space layout.
3010 mov 40($disp),%rdi # disp->ContextRecord
3011 mov $context,%rsi # context
3012 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3013 .long 0xa548f3fc # cld; rep movsq
3016 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3017 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3018 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3019 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3020 mov 40(%rsi),%r10 # disp->ContextRecord
3021 lea 56(%rsi),%r11 # &disp->HandlerData
3022 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3023 mov %r10,32(%rsp) # arg5
3024 mov %r11,40(%rsp) # arg6
3025 mov %r12,48(%rsp) # arg7
3026 mov %rcx,56(%rsp) # arg8, (NULL)
3027 call *__imp_RtlVirtualUnwind(%rip)
3029 mov \$1,%eax # ExceptionContinueSearch
3041 .size se_handler,.-se_handler
# Win64 SEH tables: .pdata-style prologue/epilogue RVA pairs for each
# public routine (ECB entries emitted only when $ecb is set), followed by
# the per-routine HandlerData[] body/epilogue RVA pairs consumed by
# se_handler above.  The final statement is the standard perlasm
# postprocess: evaluate every `...` arithmetic expression embedded in the
# generated assembly.
3046 $code.=<<___ if ($ecb);
3047 .rva .Lecb_enc_prologue
3048 .rva .Lecb_enc_epilogue
3051 .rva .Lecb_dec_prologue
3052 .rva .Lecb_dec_epilogue
3056 .rva .Lcbc_dec_prologue
3057 .rva .Lcbc_dec_epilogue
3060 .rva .Lctr_enc_prologue
3061 .rva .Lctr_enc_epilogue
3064 .rva .Lxts_enc_prologue
3065 .rva .Lxts_enc_epilogue
3068 .rva .Lxts_dec_prologue
3069 .rva .Lxts_dec_epilogue
3075 $code.=<<___ if ($ecb);
3079 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3083 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3089 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3093 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3097 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3101 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# Expand backtick-quoted expressions (e.g. `0x10*$i`) via eval before the
# assembly text is emitted.
3105 $code =~ s/\`([^\`]*)\`/eval($1)/gem;