3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In an attempt to address the deterioration, sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
93 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
95 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
97 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100 die "can't locate x86_64-xlate.pl";
102 open STDOUT,"| $^X $xlate $flavour $output";
104 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
105 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
108 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
111 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
112 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
117 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
118 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
145 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
146 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
166 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
167 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
171 &InvInBasisChange (@b);
172 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
173 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
176 sub InvInBasisChange { # OutBasisChange in reverse
177 my @b=@_[5,1,2,6,3,7,0,4];
195 sub InvOutBasisChange { # InBasisChange in reverse
196 my @b=@_[2,5,7,3,6,1,0,4];
217 #;*************************************************************
218 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
219 #;*************************************************************
220 my ($x0,$x1,$y0,$y1,$t0)=@_;
233 sub Mul_GF4_N { # not used, see next subroutine
234 # multiply and scale by N
235 my ($x0,$x1,$y0,$y1,$t0)=@_;
249 # interleaved Mul_GF4_N and Mul_GF4
250 my ($x0,$x1,$y0,$y1,$t0,
251 $x2,$x3,$y2,$y3,$t1)=@_;
279 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
286 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
287 @x[2], @x[3], @y[2], @y[3], @t[2]);
299 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
300 @x[6], @x[7], @y[2], @y[3], @t[2]);
305 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314 #;********************************************************************
315 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
316 #;********************************************************************
320 # direct optimizations from hardware
375 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
377 # new smaller inversion
411 # output in s3, s2, s1, t1
413 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
416 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
418 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
421 # AES linear components
427 pxor 0x00($key),@x[0]
428 pxor 0x10($key),@x[1]
430 pxor 0x20($key),@x[2]
432 pxor 0x30($key),@x[3]
434 pxor 0x40($key),@x[4]
436 pxor 0x50($key),@x[5]
438 pxor 0x60($key),@x[6]
440 pxor 0x70($key),@x[7]
448 # modified to emit output in order suitable for feeding back to aesenc[last]
452 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
453 pshufd \$0x93, @x[1], @t[1]
454 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
455 pshufd \$0x93, @x[2], @t[2]
457 pshufd \$0x93, @x[3], @t[3]
459 pshufd \$0x93, @x[4], @t[4]
461 pshufd \$0x93, @x[5], @t[5]
463 pshufd \$0x93, @x[6], @t[6]
465 pshufd \$0x93, @x[7], @t[7]
472 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
474 pshufd \$0x4E, @x[1], @x[1]
480 pshufd \$0x4E, @x[4], @t[0]
482 pshufd \$0x4E, @x[5], @t[1]
484 pshufd \$0x4E, @x[3], @x[4]
486 pshufd \$0x4E, @x[7], @x[5]
488 pshufd \$0x4E, @x[6], @x[3]
490 pshufd \$0x4E, @x[2], @x[6]
507 # multiplication by 0x0e
508 pshufd \$0x93, @x[7], @t[7]
510 pxor @x[5], @x[7] # 7 5
511 pxor @x[5], @x[2] # 2 5
512 pshufd \$0x93, @x[0], @t[0]
514 pxor @x[0], @x[5] # 5 0 [1]
515 pxor @x[1], @x[0] # 0 1
516 pshufd \$0x93, @x[1], @t[1]
517 pxor @x[2], @x[1] # 1 25
518 pxor @x[6], @x[0] # 01 6 [2]
519 pxor @x[3], @x[1] # 125 3 [4]
520 pshufd \$0x93, @x[3], @t[3]
521 pxor @x[0], @x[2] # 25 016 [3]
522 pxor @x[7], @x[3] # 3 75
523 pxor @x[6], @x[7] # 75 6 [0]
524 pshufd \$0x93, @x[6], @t[6]
526 pxor @x[4], @x[6] # 6 4
527 pxor @x[3], @x[4] # 4 375 [6]
528 pxor @x[7], @x[3] # 375 756=36
529 pxor @t[5], @x[6] # 64 5 [7]
530 pxor @t[2], @x[3] # 36 2
531 pxor @t[4], @x[3] # 362 4 [5]
532 pshufd \$0x93, @t[5], @t[5]
534 my @y = @x[7,5,0,2,1,3,4,6];
536 # multiplication by 0x0b
540 pshufd \$0x93, @t[2], @t[2]
544 pshufd \$0x93, @t[4], @t[4]
545 pxor @t[6], @t[7] # clobber t[7]
549 pshufd \$0x93, @t[0], @t[0]
553 pshufd \$0x93, @t[1], @t[1]
557 pshufd \$0x93, @t[2], @t[2]
561 pshufd \$0x93, @t[3], @t[3]
567 pxor @t[5], @t[7] # clobber t[7] even more
570 pshufd \$0x93, @t[4], @t[4]
575 pshufd \$0x93, @t[5], @t[5]
576 pxor @t[6], @t[7] # restore t[7]
578 # multiplication by 0x0d
581 pshufd \$0x93, @t[6], @t[6]
585 pshufd \$0x93, @t[7], @t[7]
594 pshufd \$0x93, @t[0], @t[0]
598 pshufd \$0x93, @t[1], @t[1]
603 pshufd \$0x93, @t[2], @t[2]
605 pxor @t[3], @t[6] # clobber t[6]
612 pshufd \$0x93, @t[4], @t[4]
615 pxor @t[3], @t[6] # restore t[6]
617 pshufd \$0x93, @t[5], @t[5]
618 pshufd \$0x93, @t[6], @t[6]
619 pshufd \$0x93, @t[7], @t[7]
620 pshufd \$0x93, @t[3], @t[3]
622 # multiplication by 0x09
624 pxor @y[1], @t[1] # t[1]=y[1]
625 pxor @t[5], @t[0] # clobber t[0]
628 pxor @y[0], @t[0] # t[0]=y[0]
630 pxor @t[7], @t[6] # clobber t[6]
633 pxor @y[4], @t[4] # t[4]=y[4]
635 pxor @y[3], @t[3] # t[3]=y[3]
637 pxor @y[2], @t[2] # t[2]=y[2]
639 pxor @y[5], @t[5] # t[5]=y[5]
642 pxor @y[6], @t[6] # t[6]=y[6]
643 pxor @y[7], @t[7] # t[7]=y[7]
656 sub aesenc { # not used
660 movdqa 0x30($const),@t[0] # .LSR
662 &ShiftRows (@b,@t[0]);
664 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
667 sub aesenclast { # not used
671 movdqa 0x40($const),@t[0] # .LSRM0
673 &ShiftRows (@b,@t[0]);
676 pxor 0x00($key),@b[0]
677 pxor 0x10($key),@b[1]
678 pxor 0x20($key),@b[4]
679 pxor 0x30($key),@b[6]
680 pxor 0x40($key),@b[3]
681 pxor 0x50($key),@b[7]
682 pxor 0x60($key),@b[2]
683 pxor 0x70($key),@b[5]
688 my ($a,$b,$n,$mask,$t)=@_;
700 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
720 my @x=reverse(@_[0..7]);
721 my ($t0,$t1,$t2,$t3)=@_[8..11];
723 movdqa 0x00($const),$t0 # .LBS0
724 movdqa 0x10($const),$t1 # .LBS1
726 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
727 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
729 movdqa 0x20($const),$t0 # .LBS2
731 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
732 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
734 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
735 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
744 .type _bsaes_encrypt8,\@abi-omnipotent
747 lea .LBS0(%rip), $const # constants table
749 movdqa ($key), @XMM[9] # round 0 key
751 movdqa 0x60($const), @XMM[8] # .LM0SR
752 pxor @XMM[9], @XMM[0] # xor with round0 key
753 pxor @XMM[9], @XMM[1]
754 pshufb @XMM[8], @XMM[0]
755 pxor @XMM[9], @XMM[2]
756 pshufb @XMM[8], @XMM[1]
757 pxor @XMM[9], @XMM[3]
758 pshufb @XMM[8], @XMM[2]
759 pxor @XMM[9], @XMM[4]
760 pshufb @XMM[8], @XMM[3]
761 pxor @XMM[9], @XMM[5]
762 pshufb @XMM[8], @XMM[4]
763 pxor @XMM[9], @XMM[6]
764 pshufb @XMM[8], @XMM[5]
765 pxor @XMM[9], @XMM[7]
766 pshufb @XMM[8], @XMM[6]
767 pshufb @XMM[8], @XMM[7]
768 _bsaes_encrypt8_bitslice:
770 &bitslice (@XMM[0..7, 8..11]);
777 &ShiftRows (@XMM[0..7, 8]);
778 $code.=".Lenc_sbox:\n";
779 &Sbox (@XMM[0..7, 8..15]);
784 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
786 movdqa 0x30($const), @XMM[8] # .LSR
788 movdqa 0x40($const), @XMM[8] # .LSRM0
793 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
794 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
796 movdqa ($key), @XMM[8] # last round key
797 pxor @XMM[8], @XMM[4]
798 pxor @XMM[8], @XMM[6]
799 pxor @XMM[8], @XMM[3]
800 pxor @XMM[8], @XMM[7]
801 pxor @XMM[8], @XMM[2]
802 pxor @XMM[8], @XMM[5]
803 pxor @XMM[8], @XMM[0]
804 pxor @XMM[8], @XMM[1]
806 .size _bsaes_encrypt8,.-_bsaes_encrypt8
808 .type _bsaes_decrypt8,\@abi-omnipotent
811 lea .LBS0(%rip), $const # constants table
813 movdqa ($key), @XMM[9] # round 0 key
815 movdqa -0x30($const), @XMM[8] # .LM0ISR
816 pxor @XMM[9], @XMM[0] # xor with round0 key
817 pxor @XMM[9], @XMM[1]
818 pshufb @XMM[8], @XMM[0]
819 pxor @XMM[9], @XMM[2]
820 pshufb @XMM[8], @XMM[1]
821 pxor @XMM[9], @XMM[3]
822 pshufb @XMM[8], @XMM[2]
823 pxor @XMM[9], @XMM[4]
824 pshufb @XMM[8], @XMM[3]
825 pxor @XMM[9], @XMM[5]
826 pshufb @XMM[8], @XMM[4]
827 pxor @XMM[9], @XMM[6]
828 pshufb @XMM[8], @XMM[5]
829 pxor @XMM[9], @XMM[7]
830 pshufb @XMM[8], @XMM[6]
831 pshufb @XMM[8], @XMM[7]
833 &bitslice (@XMM[0..7, 8..11]);
840 &ShiftRows (@XMM[0..7, 8]);
841 $code.=".Ldec_sbox:\n";
842 &InvSbox (@XMM[0..7, 8..15]);
847 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
849 movdqa -0x10($const), @XMM[8] # .LISR
851 movdqa -0x20($const), @XMM[8] # .LISRM0
856 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
858 movdqa ($key), @XMM[8] # last round key
859 pxor @XMM[8], @XMM[6]
860 pxor @XMM[8], @XMM[4]
861 pxor @XMM[8], @XMM[2]
862 pxor @XMM[8], @XMM[7]
863 pxor @XMM[8], @XMM[3]
864 pxor @XMM[8], @XMM[5]
865 pxor @XMM[8], @XMM[0]
866 pxor @XMM[8], @XMM[1]
868 .size _bsaes_decrypt8,.-_bsaes_decrypt8
872 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
875 my @x=reverse(@_[0..7]);
876 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
878 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
880 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
884 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
886 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
888 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
894 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
895 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
899 .type _bsaes_key_convert,\@abi-omnipotent
902 lea .LBS1(%rip), $const
903 movdqu ($inp), %xmm7 # load round 0 key
904 movdqa -0x10($const), %xmm8 # .LBS0
905 movdqa 0x00($const), %xmm9 # .LBS1
906 movdqa 0x10($const), %xmm10 # .LBS2
907 movdqa 0x40($const), %xmm13 # .LM0
908 movdqa 0x60($const), %xmm14 # .LNOT
910 movdqu 0x10($inp), %xmm6 # load round 1 key
912 movdqa %xmm7, ($out) # save round 0 key
918 pshufb %xmm13, %xmm6 # .LM0
921 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
923 pxor %xmm14, %xmm5 # "pnot"
928 movdqa %xmm0, 0x00($out) # write bit-sliced round key
929 movdqa %xmm1, 0x10($out)
930 movdqa %xmm2, 0x20($out)
931 movdqa %xmm3, 0x30($out)
932 movdqa %xmm4, 0x40($out)
933 movdqa %xmm5, 0x50($out)
934 movdqa %xmm6, 0x60($out)
935 movdqa %xmm7, 0x70($out)
937 movdqu ($inp), %xmm6 # load next round key
941 movdqa 0x70($const), %xmm7 # .L63
942 #movdqa %xmm6, ($out) # don't save last round key
944 .size _bsaes_key_convert,.-_bsaes_key_convert
948 if (1 && !$win64) { # following four functions are unsupported interface
949 # used for benchmarking...
951 .globl bsaes_enc_key_convert
952 .type bsaes_enc_key_convert,\@function,2
954 bsaes_enc_key_convert:
955 mov 240($inp),%r10d # pass rounds
956 mov $inp,%rcx # pass key
957 mov $out,%rax # pass key schedule
958 call _bsaes_key_convert
959 pxor %xmm6,%xmm7 # fix up last round key
960 movdqa %xmm7,(%rax) # save last round key
962 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
964 .globl bsaes_encrypt_128
965 .type bsaes_encrypt_128,\@function,4
969 movdqu 0x00($inp), @XMM[0] # load input
970 movdqu 0x10($inp), @XMM[1]
971 movdqu 0x20($inp), @XMM[2]
972 movdqu 0x30($inp), @XMM[3]
973 movdqu 0x40($inp), @XMM[4]
974 movdqu 0x50($inp), @XMM[5]
975 movdqu 0x60($inp), @XMM[6]
976 movdqu 0x70($inp), @XMM[7]
977 mov $key, %rax # pass the $key
983 movdqu @XMM[0], 0x00($out) # write output
984 movdqu @XMM[1], 0x10($out)
985 movdqu @XMM[4], 0x20($out)
986 movdqu @XMM[6], 0x30($out)
987 movdqu @XMM[3], 0x40($out)
988 movdqu @XMM[7], 0x50($out)
989 movdqu @XMM[2], 0x60($out)
990 movdqu @XMM[5], 0x70($out)
995 .size bsaes_encrypt_128,.-bsaes_encrypt_128
997 .globl bsaes_dec_key_convert
998 .type bsaes_dec_key_convert,\@function,2
1000 bsaes_dec_key_convert:
1001 mov 240($inp),%r10d # pass rounds
1002 mov $inp,%rcx # pass key
1003 mov $out,%rax # pass key schedule
1004 call _bsaes_key_convert
1005 pxor ($out),%xmm7 # fix up round 0 key
1006 movdqa %xmm6,(%rax) # save last round key
1009 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1011 .globl bsaes_decrypt_128
1012 .type bsaes_decrypt_128,\@function,4
1016 movdqu 0x00($inp), @XMM[0] # load input
1017 movdqu 0x10($inp), @XMM[1]
1018 movdqu 0x20($inp), @XMM[2]
1019 movdqu 0x30($inp), @XMM[3]
1020 movdqu 0x40($inp), @XMM[4]
1021 movdqu 0x50($inp), @XMM[5]
1022 movdqu 0x60($inp), @XMM[6]
1023 movdqu 0x70($inp), @XMM[7]
1024 mov $key, %rax # pass the $key
1025 lea 0x80($inp), $inp
1028 call _bsaes_decrypt8
1030 movdqu @XMM[0], 0x00($out) # write output
1031 movdqu @XMM[1], 0x10($out)
1032 movdqu @XMM[6], 0x20($out)
1033 movdqu @XMM[4], 0x30($out)
1034 movdqu @XMM[2], 0x40($out)
1035 movdqu @XMM[7], 0x50($out)
1036 movdqu @XMM[3], 0x60($out)
1037 movdqu @XMM[5], 0x70($out)
1038 lea 0x80($out), $out
1042 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1046 ######################################################################
1050 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1051 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1052 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1054 if (0) { # suppress unreferenced ECB subroutines, spare some space...
1056 .globl bsaes_ecb_encrypt_blocks
1057 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1059 bsaes_ecb_encrypt_blocks:
1066 lea -0x48(%rsp),%rsp
1068 $code.=<<___ if ($win64);
1069 lea -0xa0(%rsp), %rsp
1070 movaps %xmm6, 0x40(%rsp)
1071 movaps %xmm7, 0x50(%rsp)
1072 movaps %xmm8, 0x60(%rsp)
1073 movaps %xmm9, 0x70(%rsp)
1074 movaps %xmm10, 0x80(%rsp)
1075 movaps %xmm11, 0x90(%rsp)
1076 movaps %xmm12, 0xa0(%rsp)
1077 movaps %xmm13, 0xb0(%rsp)
1078 movaps %xmm14, 0xc0(%rsp)
1079 movaps %xmm15, 0xd0(%rsp)
1083 mov %rsp,%rbp # backup %rsp
1084 mov 240($arg4),%eax # rounds
1085 mov $arg1,$inp # backup arguments
1092 mov %eax,%ebx # backup rounds
1093 shl \$7,%rax # 128 bytes per inner round key
1094 sub \$`128-32`,%rax # size of bit-sliced key schedule
1096 mov %rsp,%rax # pass key schedule
1097 mov $key,%rcx # pass key
1098 mov %ebx,%r10d # pass rounds
1099 call _bsaes_key_convert
1100 pxor %xmm6,%xmm7 # fix up last round key
1101 movdqa %xmm7,(%rax) # save last round key
1105 movdqu 0x00($inp), @XMM[0] # load input
1106 movdqu 0x10($inp), @XMM[1]
1107 movdqu 0x20($inp), @XMM[2]
1108 movdqu 0x30($inp), @XMM[3]
1109 movdqu 0x40($inp), @XMM[4]
1110 movdqu 0x50($inp), @XMM[5]
1111 mov %rsp, %rax # pass key schedule
1112 movdqu 0x60($inp), @XMM[6]
1113 mov %ebx,%r10d # pass rounds
1114 movdqu 0x70($inp), @XMM[7]
1115 lea 0x80($inp), $inp
1117 call _bsaes_encrypt8
1119 movdqu @XMM[0], 0x00($out) # write output
1120 movdqu @XMM[1], 0x10($out)
1121 movdqu @XMM[4], 0x20($out)
1122 movdqu @XMM[6], 0x30($out)
1123 movdqu @XMM[3], 0x40($out)
1124 movdqu @XMM[7], 0x50($out)
1125 movdqu @XMM[2], 0x60($out)
1126 movdqu @XMM[5], 0x70($out)
1127 lea 0x80($out), $out
1134 movdqu 0x00($inp), @XMM[0] # load input
1135 mov %rsp, %rax # pass key schedule
1136 mov %ebx,%r10d # pass rounds
1139 movdqu 0x10($inp), @XMM[1]
1141 movdqu 0x20($inp), @XMM[2]
1144 movdqu 0x30($inp), @XMM[3]
1146 movdqu 0x40($inp), @XMM[4]
1149 movdqu 0x50($inp), @XMM[5]
1151 movdqu 0x60($inp), @XMM[6]
1152 call _bsaes_encrypt8
1153 movdqu @XMM[0], 0x00($out) # write output
1154 movdqu @XMM[1], 0x10($out)
1155 movdqu @XMM[4], 0x20($out)
1156 movdqu @XMM[6], 0x30($out)
1157 movdqu @XMM[3], 0x40($out)
1158 movdqu @XMM[7], 0x50($out)
1159 movdqu @XMM[2], 0x60($out)
1163 call _bsaes_encrypt8
1164 movdqu @XMM[0], 0x00($out) # write output
1165 movdqu @XMM[1], 0x10($out)
1166 movdqu @XMM[4], 0x20($out)
1167 movdqu @XMM[6], 0x30($out)
1168 movdqu @XMM[3], 0x40($out)
1169 movdqu @XMM[7], 0x50($out)
1173 call _bsaes_encrypt8
1174 movdqu @XMM[0], 0x00($out) # write output
1175 movdqu @XMM[1], 0x10($out)
1176 movdqu @XMM[4], 0x20($out)
1177 movdqu @XMM[6], 0x30($out)
1178 movdqu @XMM[3], 0x40($out)
1182 call _bsaes_encrypt8
1183 movdqu @XMM[0], 0x00($out) # write output
1184 movdqu @XMM[1], 0x10($out)
1185 movdqu @XMM[4], 0x20($out)
1186 movdqu @XMM[6], 0x30($out)
1190 call _bsaes_encrypt8
1191 movdqu @XMM[0], 0x00($out) # write output
1192 movdqu @XMM[1], 0x10($out)
1193 movdqu @XMM[4], 0x20($out)
1197 call _bsaes_encrypt8
1198 movdqu @XMM[0], 0x00($out) # write output
1199 movdqu @XMM[1], 0x10($out)
1203 call _bsaes_encrypt8
1204 movdqu @XMM[0], 0x00($out) # write output
1220 .Lecb_enc_bzero: # wipe key schedule [if any]
1221 movdqa %xmm0, 0x00(%rax)
1222 movdqa %xmm0, 0x10(%rax)
1223 lea 0x20(%rax), %rax
1227 lea (%rbp),%rsp # restore %rsp
1229 $code.=<<___ if ($win64);
1230 movaps 0x40(%rbp), %xmm6
1231 movaps 0x50(%rbp), %xmm7
1232 movaps 0x60(%rbp), %xmm8
1233 movaps 0x70(%rbp), %xmm9
1234 movaps 0x80(%rbp), %xmm10
1235 movaps 0x90(%rbp), %xmm11
1236 movaps 0xa0(%rbp), %xmm12
1237 movaps 0xb0(%rbp), %xmm13
1238 movaps 0xc0(%rbp), %xmm14
1239 movaps 0xd0(%rbp), %xmm15
1240 lea 0xa0(%rbp), %rsp
1243 mov 0x48(%rsp), %r15
1244 mov 0x50(%rsp), %r14
1245 mov 0x58(%rsp), %r13
1246 mov 0x60(%rsp), %r12
1247 mov 0x68(%rsp), %rbx
1248 mov 0x70(%rsp), %rbp
1249 lea 0x78(%rsp), %rsp
1252 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1254 .globl bsaes_ecb_decrypt_blocks
1255 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1257 bsaes_ecb_decrypt_blocks:
1264 lea -0x48(%rsp),%rsp
1266 $code.=<<___ if ($win64);
1267 lea -0xa0(%rsp), %rsp
1268 movaps %xmm6, 0x40(%rsp)
1269 movaps %xmm7, 0x50(%rsp)
1270 movaps %xmm8, 0x60(%rsp)
1271 movaps %xmm9, 0x70(%rsp)
1272 movaps %xmm10, 0x80(%rsp)
1273 movaps %xmm11, 0x90(%rsp)
1274 movaps %xmm12, 0xa0(%rsp)
1275 movaps %xmm13, 0xb0(%rsp)
1276 movaps %xmm14, 0xc0(%rsp)
1277 movaps %xmm15, 0xd0(%rsp)
1281 mov %rsp,%rbp # backup %rsp
1282 mov 240($arg4),%eax # rounds
1283 mov $arg1,$inp # backup arguments
1290 mov %eax,%ebx # backup rounds
1291 shl \$7,%rax # 128 bytes per inner round key
1292 sub \$`128-32`,%rax # size of bit-sliced key schedule
1294 mov %rsp,%rax # pass key schedule
1295 mov $key,%rcx # pass key
1296 mov %ebx,%r10d # pass rounds
1297 call _bsaes_key_convert
1298 pxor (%rsp),%xmm7 # fix up 0 round key
1299 movdqa %xmm6,(%rax) # save last round key
1304 movdqu 0x00($inp), @XMM[0] # load input
1305 movdqu 0x10($inp), @XMM[1]
1306 movdqu 0x20($inp), @XMM[2]
1307 movdqu 0x30($inp), @XMM[3]
1308 movdqu 0x40($inp), @XMM[4]
1309 movdqu 0x50($inp), @XMM[5]
1310 mov %rsp, %rax # pass key schedule
1311 movdqu 0x60($inp), @XMM[6]
1312 mov %ebx,%r10d # pass rounds
1313 movdqu 0x70($inp), @XMM[7]
1314 lea 0x80($inp), $inp
1316 call _bsaes_decrypt8
1318 movdqu @XMM[0], 0x00($out) # write output
1319 movdqu @XMM[1], 0x10($out)
1320 movdqu @XMM[6], 0x20($out)
1321 movdqu @XMM[4], 0x30($out)
1322 movdqu @XMM[2], 0x40($out)
1323 movdqu @XMM[7], 0x50($out)
1324 movdqu @XMM[3], 0x60($out)
1325 movdqu @XMM[5], 0x70($out)
1326 lea 0x80($out), $out
1333 movdqu 0x00($inp), @XMM[0] # load input
1334 mov %rsp, %rax # pass key schedule
1335 mov %ebx,%r10d # pass rounds
1338 movdqu 0x10($inp), @XMM[1]
1340 movdqu 0x20($inp), @XMM[2]
1343 movdqu 0x30($inp), @XMM[3]
1345 movdqu 0x40($inp), @XMM[4]
1348 movdqu 0x50($inp), @XMM[5]
1350 movdqu 0x60($inp), @XMM[6]
1351 call _bsaes_decrypt8
1352 movdqu @XMM[0], 0x00($out) # write output
1353 movdqu @XMM[1], 0x10($out)
1354 movdqu @XMM[6], 0x20($out)
1355 movdqu @XMM[4], 0x30($out)
1356 movdqu @XMM[2], 0x40($out)
1357 movdqu @XMM[7], 0x50($out)
1358 movdqu @XMM[3], 0x60($out)
1362 call _bsaes_decrypt8
1363 movdqu @XMM[0], 0x00($out) # write output
1364 movdqu @XMM[1], 0x10($out)
1365 movdqu @XMM[6], 0x20($out)
1366 movdqu @XMM[4], 0x30($out)
1367 movdqu @XMM[2], 0x40($out)
1368 movdqu @XMM[7], 0x50($out)
1372 call _bsaes_decrypt8
1373 movdqu @XMM[0], 0x00($out) # write output
1374 movdqu @XMM[1], 0x10($out)
1375 movdqu @XMM[6], 0x20($out)
1376 movdqu @XMM[4], 0x30($out)
1377 movdqu @XMM[2], 0x40($out)
1381 call _bsaes_decrypt8
1382 movdqu @XMM[0], 0x00($out) # write output
1383 movdqu @XMM[1], 0x10($out)
1384 movdqu @XMM[6], 0x20($out)
1385 movdqu @XMM[4], 0x30($out)
1389 call _bsaes_decrypt8
1390 movdqu @XMM[0], 0x00($out) # write output
1391 movdqu @XMM[1], 0x10($out)
1392 movdqu @XMM[6], 0x20($out)
1396 call _bsaes_decrypt8
1397 movdqu @XMM[0], 0x00($out) # write output
1398 movdqu @XMM[1], 0x10($out)
1402 call _bsaes_decrypt8
1403 movdqu @XMM[0], 0x00($out) # write output
1419 .Lecb_dec_bzero: # wipe key schedule [if any]
1420 movdqa %xmm0, 0x00(%rax)
1421 movdqa %xmm0, 0x10(%rax)
1422 lea 0x20(%rax), %rax
1426 lea (%rbp),%rsp # restore %rsp
1428 $code.=<<___ if ($win64);
1429 movaps 0x40(%rbp), %xmm6
1430 movaps 0x50(%rbp), %xmm7
1431 movaps 0x60(%rbp), %xmm8
1432 movaps 0x70(%rbp), %xmm9
1433 movaps 0x80(%rbp), %xmm10
1434 movaps 0x90(%rbp), %xmm11
1435 movaps 0xa0(%rbp), %xmm12
1436 movaps 0xb0(%rbp), %xmm13
1437 movaps 0xc0(%rbp), %xmm14
1438 movaps 0xd0(%rbp), %xmm15
1439 lea 0xa0(%rbp), %rsp
1442 mov 0x48(%rsp), %r15
1443 mov 0x50(%rsp), %r14
1444 mov 0x58(%rsp), %r13
1445 mov 0x60(%rsp), %r12
1446 mov 0x68(%rsp), %rbx
1447 mov 0x70(%rsp), %rbp
1448 lea 0x78(%rsp), %rsp
1451 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1455 .extern AES_cbc_encrypt
1456 .globl bsaes_cbc_encrypt
1457 .type bsaes_cbc_encrypt,\@abi-omnipotent
1461 $code.=<<___ if ($win64);
1462 mov 48(%rsp),$arg6 # pull direction flag
1476 lea -0x48(%rsp), %rsp
1478 $code.=<<___ if ($win64);
1479 mov 0xa0(%rsp),$arg5 # pull ivp
1480 lea -0xa0(%rsp), %rsp
1481 movaps %xmm6, 0x40(%rsp)
1482 movaps %xmm7, 0x50(%rsp)
1483 movaps %xmm8, 0x60(%rsp)
1484 movaps %xmm9, 0x70(%rsp)
1485 movaps %xmm10, 0x80(%rsp)
1486 movaps %xmm11, 0x90(%rsp)
1487 movaps %xmm12, 0xa0(%rsp)
1488 movaps %xmm13, 0xb0(%rsp)
1489 movaps %xmm14, 0xc0(%rsp)
1490 movaps %xmm15, 0xd0(%rsp)
1494 mov %rsp, %rbp # backup %rsp
1495 mov 240($arg4), %eax # rounds
1496 mov $arg1, $inp # backup arguments
1501 shr \$4, $len # bytes to blocks
1503 mov %eax, %ebx # rounds
1504 shl \$7, %rax # 128 bytes per inner round key
1505 sub \$`128-32`, %rax # size of bit-sliced key schedule
1508 mov %rsp, %rax # pass key schedule
1509 mov $key, %rcx # pass key
1510 mov %ebx, %r10d # pass rounds
1511 call _bsaes_key_convert
1512 pxor (%rsp),%xmm7 # fix up 0 round key
1513 movdqa %xmm6,(%rax) # save last round key
1516 movdqu (%rdx), @XMM[15] # load IV
1519 movdqu 0x00($inp), @XMM[0] # load input
1520 movdqu 0x10($inp), @XMM[1]
1521 movdqu 0x20($inp), @XMM[2]
1522 movdqu 0x30($inp), @XMM[3]
1523 movdqu 0x40($inp), @XMM[4]
1524 movdqu 0x50($inp), @XMM[5]
1525 mov %rsp, %rax # pass key schedule
1526 movdqu 0x60($inp), @XMM[6]
1527 mov %ebx,%r10d # pass rounds
1528 movdqu 0x70($inp), @XMM[7]
1529 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1531 call _bsaes_decrypt8
1533 pxor 0x20(%rbp), @XMM[0] # ^= IV
1534 movdqu 0x00($inp), @XMM[8] # re-load input
1535 movdqu 0x10($inp), @XMM[9]
1536 pxor @XMM[8], @XMM[1]
1537 movdqu 0x20($inp), @XMM[10]
1538 pxor @XMM[9], @XMM[6]
1539 movdqu 0x30($inp), @XMM[11]
1540 pxor @XMM[10], @XMM[4]
1541 movdqu 0x40($inp), @XMM[12]
1542 pxor @XMM[11], @XMM[2]
1543 movdqu 0x50($inp), @XMM[13]
1544 pxor @XMM[12], @XMM[7]
1545 movdqu 0x60($inp), @XMM[14]
1546 pxor @XMM[13], @XMM[3]
1547 movdqu 0x70($inp), @XMM[15] # IV
1548 pxor @XMM[14], @XMM[5]
1549 movdqu @XMM[0], 0x00($out) # write output
1550 lea 0x80($inp), $inp
1551 movdqu @XMM[1], 0x10($out)
1552 movdqu @XMM[6], 0x20($out)
1553 movdqu @XMM[4], 0x30($out)
1554 movdqu @XMM[2], 0x40($out)
1555 movdqu @XMM[7], 0x50($out)
1556 movdqu @XMM[3], 0x60($out)
1557 movdqu @XMM[5], 0x70($out)
1558 lea 0x80($out), $out
1565 movdqu 0x00($inp), @XMM[0] # load input
1566 mov %rsp, %rax # pass key schedule
1567 mov %ebx, %r10d # pass rounds
1570 movdqu 0x10($inp), @XMM[1]
1572 movdqu 0x20($inp), @XMM[2]
1575 movdqu 0x30($inp), @XMM[3]
1577 movdqu 0x40($inp), @XMM[4]
1580 movdqu 0x50($inp), @XMM[5]
1582 movdqu 0x60($inp), @XMM[6]
1583 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1584 call _bsaes_decrypt8
1585 pxor 0x20(%rbp), @XMM[0] # ^= IV
1586 movdqu 0x00($inp), @XMM[8] # re-load input
1587 movdqu 0x10($inp), @XMM[9]
1588 pxor @XMM[8], @XMM[1]
1589 movdqu 0x20($inp), @XMM[10]
1590 pxor @XMM[9], @XMM[6]
1591 movdqu 0x30($inp), @XMM[11]
1592 pxor @XMM[10], @XMM[4]
1593 movdqu 0x40($inp), @XMM[12]
1594 pxor @XMM[11], @XMM[2]
1595 movdqu 0x50($inp), @XMM[13]
1596 pxor @XMM[12], @XMM[7]
1597 movdqu 0x60($inp), @XMM[15] # IV
1598 pxor @XMM[13], @XMM[3]
1599 movdqu @XMM[0], 0x00($out) # write output
1600 movdqu @XMM[1], 0x10($out)
1601 movdqu @XMM[6], 0x20($out)
1602 movdqu @XMM[4], 0x30($out)
1603 movdqu @XMM[2], 0x40($out)
1604 movdqu @XMM[7], 0x50($out)
1605 movdqu @XMM[3], 0x60($out)
1609 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1610 call _bsaes_decrypt8
1611 pxor 0x20(%rbp), @XMM[0] # ^= IV
1612 movdqu 0x00($inp), @XMM[8] # re-load input
1613 movdqu 0x10($inp), @XMM[9]
1614 pxor @XMM[8], @XMM[1]
1615 movdqu 0x20($inp), @XMM[10]
1616 pxor @XMM[9], @XMM[6]
1617 movdqu 0x30($inp), @XMM[11]
1618 pxor @XMM[10], @XMM[4]
1619 movdqu 0x40($inp), @XMM[12]
1620 pxor @XMM[11], @XMM[2]
1621 movdqu 0x50($inp), @XMM[15] # IV
1622 pxor @XMM[12], @XMM[7]
1623 movdqu @XMM[0], 0x00($out) # write output
1624 movdqu @XMM[1], 0x10($out)
1625 movdqu @XMM[6], 0x20($out)
1626 movdqu @XMM[4], 0x30($out)
1627 movdqu @XMM[2], 0x40($out)
1628 movdqu @XMM[7], 0x50($out)
1632 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1633 call _bsaes_decrypt8
1634 pxor 0x20(%rbp), @XMM[0] # ^= IV
1635 movdqu 0x00($inp), @XMM[8] # re-load input
1636 movdqu 0x10($inp), @XMM[9]
1637 pxor @XMM[8], @XMM[1]
1638 movdqu 0x20($inp), @XMM[10]
1639 pxor @XMM[9], @XMM[6]
1640 movdqu 0x30($inp), @XMM[11]
1641 pxor @XMM[10], @XMM[4]
1642 movdqu 0x40($inp), @XMM[15] # IV
1643 pxor @XMM[11], @XMM[2]
1644 movdqu @XMM[0], 0x00($out) # write output
1645 movdqu @XMM[1], 0x10($out)
1646 movdqu @XMM[6], 0x20($out)
1647 movdqu @XMM[4], 0x30($out)
1648 movdqu @XMM[2], 0x40($out)
1652 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1653 call _bsaes_decrypt8
1654 pxor 0x20(%rbp), @XMM[0] # ^= IV
1655 movdqu 0x00($inp), @XMM[8] # re-load input
1656 movdqu 0x10($inp), @XMM[9]
1657 pxor @XMM[8], @XMM[1]
1658 movdqu 0x20($inp), @XMM[10]
1659 pxor @XMM[9], @XMM[6]
1660 movdqu 0x30($inp), @XMM[15] # IV
1661 pxor @XMM[10], @XMM[4]
1662 movdqu @XMM[0], 0x00($out) # write output
1663 movdqu @XMM[1], 0x10($out)
1664 movdqu @XMM[6], 0x20($out)
1665 movdqu @XMM[4], 0x30($out)
1669 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1670 call _bsaes_decrypt8
1671 pxor 0x20(%rbp), @XMM[0] # ^= IV
1672 movdqu 0x00($inp), @XMM[8] # re-load input
1673 movdqu 0x10($inp), @XMM[9]
1674 pxor @XMM[8], @XMM[1]
1675 movdqu 0x20($inp), @XMM[15] # IV
1676 pxor @XMM[9], @XMM[6]
1677 movdqu @XMM[0], 0x00($out) # write output
1678 movdqu @XMM[1], 0x10($out)
1679 movdqu @XMM[6], 0x20($out)
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[15] # IV
1688 pxor @XMM[8], @XMM[1]
1689 movdqu @XMM[0], 0x00($out) # write output
1690 movdqu @XMM[1], 0x10($out)
1694 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1695 call _bsaes_decrypt8
1696 pxor 0x20(%rbp), @XMM[0] # ^= IV
1697 movdqu 0x00($inp), @XMM[15] # IV
1698 movdqu @XMM[0], 0x00($out) # write output
1701 movdqu @XMM[15], (%rdx) # return IV
1704 .Lcbc_dec_bzero: # wipe key schedule [if any]
1705 movdqa %xmm0, 0x00(%rax)
1706 movdqa %xmm0, 0x10(%rax)
1707 lea 0x20(%rax), %rax
1711 lea (%rbp),%rsp # restore %rsp
1713 $code.=<<___ if ($win64);
1714 movaps 0x40(%rbp), %xmm6
1715 movaps 0x50(%rbp), %xmm7
1716 movaps 0x60(%rbp), %xmm8
1717 movaps 0x70(%rbp), %xmm9
1718 movaps 0x80(%rbp), %xmm10
1719 movaps 0x90(%rbp), %xmm11
1720 movaps 0xa0(%rbp), %xmm12
1721 movaps 0xb0(%rbp), %xmm13
1722 movaps 0xc0(%rbp), %xmm14
1723 movaps 0xd0(%rbp), %xmm15
1724 lea 0xa0(%rbp), %rsp
1727 mov 0x48(%rsp), %r15
1728 mov 0x50(%rsp), %r14
1729 mov 0x58(%rsp), %r13
1730 mov 0x60(%rsp), %r12
1731 mov 0x68(%rsp), %rbx
1732 mov 0x70(%rsp), %rbp
1733 lea 0x78(%rsp), %rsp
1736 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# CTR mode with a 32-bit counter increment (only the low 32 bits of
# the counter block are ever incremented - hence "ctr32").
1738 .globl bsaes_ctr32_encrypt_blocks
1739 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1741 bsaes_ctr32_encrypt_blocks:
# Callee-saved GPR pushes fall in lines not visible in this chunk.
1748 lea -0x48(%rsp), %rsp
1750 $code.=<<___ if ($win64);
# Win64 passes only four arguments in registers: fetch the fifth
# (ivp) from the caller stack, then save non-volatile XMM registers.
1751 mov 0xa0(%rsp),$arg5 # pull ivp
1752 lea -0xa0(%rsp), %rsp
1753 movaps %xmm6, 0x40(%rsp)
1754 movaps %xmm7, 0x50(%rsp)
1755 movaps %xmm8, 0x60(%rsp)
1756 movaps %xmm9, 0x70(%rsp)
1757 movaps %xmm10, 0x80(%rsp)
1758 movaps %xmm11, 0x90(%rsp)
1759 movaps %xmm12, 0xa0(%rsp)
1760 movaps %xmm13, 0xb0(%rsp)
1761 movaps %xmm14, 0xc0(%rsp)
1762 movaps %xmm15, 0xd0(%rsp)
# Frame setup and conversion of the conventional key schedule into
# bit-sliced form; %rbp keeps the pre-allocation %rsp so frame slots
# (counter at 0x20) stay addressable after %rsp drops below it.
1766 mov %rsp, %rbp # backup %rsp
1767 movdqu ($arg5), %xmm0 # load counter
1768 mov 240($arg4), %eax # rounds (AES_KEY.rounds lives at offset 240)
1769 mov $arg1, $inp # backup arguments
1773 movdqa %xmm0, 0x20(%rbp) # copy counter
1777 mov %eax, %ebx # rounds
1778 shl \$7, %rax # 128 bytes per inner round key
1779 sub \$`128-32`, %rax # size of bit-sliced key schedule
# The actual stack adjustment by that size occurs in lines not
# visible in this chunk; afterwards %rsp points at the schedule.
1782 mov %rsp, %rax # pass key schedule
1783 mov $key, %rcx # pass key
1784 mov %ebx, %r10d # pass rounds
1785 call _bsaes_key_convert
# The xor with %xmm6 completes the last round key left in %xmm7 by
# _bsaes_key_convert.
1786 pxor %xmm6,%xmm7 # fix up last round key
1787 movdqa %xmm7,(%rax) # save last round key
# Pre-swap the upper-half byte order of both the round-0 key and the
# counter (.LSWPUP sits 0x20 below .LADD1) so that the paddd-based
# counter increments below operate on the right lane.
1789 movdqa (%rsp), @XMM[9] # load round0 key
1790 lea .LADD1(%rip), %r11
1791 movdqa 0x20(%rbp), @XMM[0] # counter copy
1792 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1793 pshufb @XMM[8], @XMM[9] # byte swap upper part
1794 pshufb @XMM[8], @XMM[0]
1795 movdqa @XMM[9], (%rsp) # save adjusted round0 key
# Top of the 8-block loop (loop label falls in lines not visible
# here): materialize eight consecutive counter values, registers 1..7
# getting counter+1 .. counter+7 via the .LADD1 .. .LADD7 constants.
1799 movdqa @XMM[0], 0x20(%rbp) # save counter
1800 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1801 movdqa @XMM[0], @XMM[2]
1802 paddd 0x00(%r11), @XMM[1] # .LADD1
1803 movdqa @XMM[0], @XMM[3]
1804 paddd 0x10(%r11), @XMM[2] # .LADD2
1805 movdqa @XMM[0], @XMM[4]
1806 paddd 0x20(%r11), @XMM[3] # .LADD3
1807 movdqa @XMM[0], @XMM[5]
1808 paddd 0x30(%r11), @XMM[4] # .LADD4
1809 movdqa @XMM[0], @XMM[6]
1810 paddd 0x40(%r11), @XMM[5] # .LADD5
1811 movdqa @XMM[0], @XMM[7]
1812 paddd 0x50(%r11), @XMM[6] # .LADD6
1813 paddd 0x60(%r11), @XMM[7] # .LADD7
1815 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1816 # to flip byte order in 32-bit counter
# .LSWPUPM0SR is a combined shuffle: it undoes the .LSWPUP byte swap
# applied above and applies the input permutation the bit-sliced
# cipher expects, letting us enter _bsaes_encrypt8_bitslice directly
# and skip that routine's own prologue.
1817 movdqa (%rsp), @XMM[9] # round 0 key
1818 lea 0x10(%rsp), %rax # pass key schedule
1819 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1820 pxor @XMM[9], @XMM[0] # xor with round0 key
1821 pxor @XMM[9], @XMM[1]
1822 pshufb @XMM[8], @XMM[0]
1823 pxor @XMM[9], @XMM[2]
1824 pshufb @XMM[8], @XMM[1]
1825 pxor @XMM[9], @XMM[3]
1826 pshufb @XMM[8], @XMM[2]
1827 pxor @XMM[9], @XMM[4]
1828 pshufb @XMM[8], @XMM[3]
1829 pxor @XMM[9], @XMM[5]
1830 pshufb @XMM[8], @XMM[4]
1831 pxor @XMM[9], @XMM[6]
1832 pshufb @XMM[8], @XMM[5]
1833 pxor @XMM[9], @XMM[7]
1834 pshufb @XMM[8], @XMM[6]
1835 lea .LBS0(%rip), %r11 # constants table
1836 pshufb @XMM[8], @XMM[7]
1837 mov %ebx,%r10d # pass rounds
1839 call _bsaes_encrypt8_bitslice
# Carry is set when fewer than 8 blocks remain (the length arithmetic
# that sets it falls in lines not visible here).
1842 jc .Lctr_enc_loop_done
# Full 8-block iteration: xor keystream into input.  The bit-sliced
# cipher returns its outputs in register order 0,1,4,6,3,7,2,5, which
# is why the pxor/store pairs below are interleaved in that order.
1844 movdqu 0x00($inp), @XMM[8] # load input
1845 movdqu 0x10($inp), @XMM[9]
1846 movdqu 0x20($inp), @XMM[10]
1847 movdqu 0x30($inp), @XMM[11]
1848 movdqu 0x40($inp), @XMM[12]
1849 movdqu 0x50($inp), @XMM[13]
1850 movdqu 0x60($inp), @XMM[14]
1851 movdqu 0x70($inp), @XMM[15]
1853 pxor @XMM[0], @XMM[8]
1854 movdqa 0x20(%rbp), @XMM[0] # reload counter for next iteration
1855 pxor @XMM[9], @XMM[1]
1856 movdqu @XMM[8], 0x00($out) # write output
1857 pxor @XMM[10], @XMM[4]
1858 movdqu @XMM[1], 0x10($out)
1859 pxor @XMM[11], @XMM[6]
1860 movdqu @XMM[4], 0x20($out)
1861 pxor @XMM[12], @XMM[3]
1862 movdqu @XMM[6], 0x30($out)
1863 pxor @XMM[13], @XMM[7]
1864 movdqu @XMM[3], 0x40($out)
1865 pxor @XMM[14], @XMM[2]
1866 movdqu @XMM[7], 0x50($out)
1867 pxor @XMM[15], @XMM[5]
1868 movdqu @XMM[2], 0x60($out)
1869 lea .LADD1(%rip), %r11
1870 movdqu @XMM[5], 0x70($out)
1871 lea 0x80($out), $out
# .LADD8 (0x70 past .LADD1) bumps the counter by the 8 blocks just
# produced; the back-branch falls in lines not visible here.
1872 paddd 0x70(%r11), @XMM[0] # .LADD8
1877 .Lctr_enc_loop_done:
# Tail for 1..7 remaining blocks.  Each step below emits one more
# block, walking the keystream registers in the bit-sliced output
# order 0,1,4,6,3,7,2; the compare/branch chain that stops after the
# right number of blocks falls in lines not visible in this chunk.
1878 movdqu 0x00($inp), @XMM[8] # load input
1879 pxor @XMM[8], @XMM[0]
1880 movdqu @XMM[0], 0x00($out) # write output
1883 movdqu 0x10($inp), @XMM[9]
1884 pxor @XMM[9], @XMM[1]
1885 movdqu @XMM[1], 0x10($out)
1887 movdqu 0x20($inp), @XMM[10]
1888 pxor @XMM[10], @XMM[4]
1889 movdqu @XMM[4], 0x20($out)
1892 movdqu 0x30($inp), @XMM[11]
1893 pxor @XMM[11], @XMM[6]
1894 movdqu @XMM[6], 0x30($out)
1896 movdqu 0x40($inp), @XMM[12]
1897 pxor @XMM[12], @XMM[3]
1898 movdqu @XMM[3], 0x40($out)
1901 movdqu 0x50($inp), @XMM[13]
1902 pxor @XMM[13], @XMM[7]
1903 movdqu @XMM[7], 0x50($out)
1905 movdqu 0x60($inp), @XMM[14]
1906 pxor @XMM[14], @XMM[2]
1907 movdqu @XMM[2], 0x60($out)
# Short-input path: encrypt one counter block at a time with the
# regular (non-bit-sliced) AES routine - the call itself falls in
# lines not visible in this chunk.  The frame slot at 0x20(%rbp)
# holds the counter block and 0x30(%rbp) receives its encryption;
# 0x2c(%rbp) is the 32-bit counter word within the stashed block.
1912 lea 0x20(%rbp), $arg1
1913 lea 0x30(%rbp), $arg2
1916 movdqu ($inp), @XMM[1]
1918 mov 0x2c(%rbp), %eax # load 32-bit counter
1920 pxor 0x30(%rbp), @XMM[1] # ^= encrypted counter block
1921 inc %eax # increment
1922 movdqu @XMM[1], ($out)
# FIX: store the incremented counter back to the slot it was loaded
# from, 0x2c(%rbp).  The original wrote to 0x2c(%rsp); at this point
# %rsp has been lowered below %rbp to hold the bit-sliced key
# schedule (the round-0 key lives at (%rsp)), so that store both
# corrupted the schedule and left the counter at 0x2c(%rbp)
# unchanged for the next short-path iteration.
1925 mov %eax, 0x2c(%rbp) # save 32-bit counter
1932 .Lctr_enc_bzero: # wipe key schedule [if any]
# Scrub 32 bytes of the on-stack key schedule per iteration; the
# loop-termination compare/branch falls in lines not visible here.
1933 movdqa %xmm0, 0x00(%rax)
1934 movdqa %xmm0, 0x10(%rax)
1935 lea 0x20(%rax), %rax
1939 lea (%rbp),%rsp # restore %rsp
1941 $code.=<<___ if ($win64);
# Win64 only: reload the ten non-volatile XMM registers saved by the
# prologue above (offsets 0x40..0xd0 match the movaps saves there).
1942 movaps 0x40(%rbp), %xmm6
1943 movaps 0x50(%rbp), %xmm7
1944 movaps 0x60(%rbp), %xmm8
1945 movaps 0x70(%rbp), %xmm9
1946 movaps 0x80(%rbp), %xmm10
1947 movaps 0x90(%rbp), %xmm11
1948 movaps 0xa0(%rbp), %xmm12
1949 movaps 0xb0(%rbp), %xmm13
1950 movaps 0xc0(%rbp), %xmm14
1951 movaps 0xd0(%rbp), %xmm15
1952 lea 0xa0(%rbp), %rsp
# Restore callee-saved GPRs and release the frame.
1955 mov 0x48(%rsp), %r15
1956 mov 0x50(%rsp), %r14
1957 mov 0x58(%rsp), %r13
1958 mov 0x60(%rsp), %r12
1959 mov 0x68(%rsp), %rbx
1960 mov 0x70(%rsp), %rbp
1961 lea 0x78(%rsp), %rsp
1964 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1968 .type _bsaes_const,\@object
# Read-only constant pool, 16 bytes per entry.  Several label lines
# fall in gaps of this chunk, so some .quad pairs below appear
# unlabeled here; only constants under visible labels are annotated.
1971 .LM0ISR: # InvShiftRows constants
1972 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1974 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1976 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
1977 .LBS0: # bit-slice constants
1978 .quad 0x5555555555555555, 0x5555555555555555
1980 .quad 0x3333333333333333, 0x3333333333333333
1982 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1983 .LSR: # shiftrows constants
1984 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1986 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1988 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1990 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1991 .LNOT: # magic constants
1992 .quad 0xffffffffffffffff, 0xffffffffffffffff
1994 .quad 0x6363636363636363, 0x6363636363636363
1995 .LSWPUP: # byte-swap upper dword
1996 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1998 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
# .LADD1 .. .LADD8: per-dword addends of 1..8 placed in the lane the
# .LSWPUP shuffle moves the 32-bit counter into.
1999 .LADD1: # counter increment constants
2000 .quad 0x0000000000000000, 0x0000000100000000
2002 .quad 0x0000000000000000, 0x0000000200000000
2004 .quad 0x0000000000000000, 0x0000000300000000
2006 .quad 0x0000000000000000, 0x0000000400000000
2008 .quad 0x0000000000000000, 0x0000000500000000
2010 .quad 0x0000000000000000, 0x0000000600000000
2012 .quad 0x0000000000000000, 0x0000000700000000
2014 .quad 0x0000000000000000, 0x0000000800000000
2015 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2017 .size _bsaes_const,.-_bsaes_const
2020 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # perlasm convention: evaluate backtick-quoted compile-time expressions in the generated code