3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In attempt to address deterioration sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
# Command-line handling per the perlasm convention: the first argument is
# the assembler "flavour" (elf, macosx, mingw64, nasm, ...), the second the
# output file.  A lone dotted argument is the output file with no flavour.
93 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Windows targets are recognized either by flavour or by a .asm output name.
95 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator relative to this script's own path,
# trying the script directory first, then the shared perlasm directory.
97 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100 die "can't locate x86_64-xlate.pl";
# Pipe STDOUT through the translator so everything printed below is emitted
# as flavour-specific assembly.
# NOTE(review): the open() result is not checked and the paths are not
# quoted -- later OpenSSL versions add `or die` and quoting; TODO confirm
# whether that hardening should be backported here.
102 open STDOUT,"| $^X $xlate $flavour $output";
# Argument registers (SysV AMD64 calling convention) used by the public
# entry points.  Only four values are supplied for five names, so $ivp is
# left undef here -- NOTE(review): presumably the IV pointer is picked up
# elsewhere (e.g. fifth argument handling in the CTR path); confirm against
# the full file.
104 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# XMM register rotation; starting the map at %xmm15 measured fastest on Atom.
105 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
# Scratch registers used inside the internal (non-ABI) block subroutines;
# note this lexically shadows the $key defined above within its scope.
108 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
111 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
112 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
117 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
118 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
145 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
146 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
166 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
167 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
171 &InvInBasisChange (@b);
172 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
173 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
176 sub InvInBasisChange { # OutBasisChange in reverse
177 my @b=@_[5,1,2,6,3,7,0,4];
195 sub InvOutBasisChange { # InBasisChange in reverse
196 my @b=@_[2,5,7,3,6,1,0,4];
217 #;*************************************************************
218 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
219 #;*************************************************************
220 my ($x0,$x1,$y0,$y1,$t0)=@_;
233 sub Mul_GF4_N { # not used, see next subroutine
234 # multiply and scale by N
235 my ($x0,$x1,$y0,$y1,$t0)=@_;
249 # interleaved Mul_GF4_N and Mul_GF4
250 my ($x0,$x1,$y0,$y1,$t0,
251 $x2,$x3,$y2,$y3,$t1)=@_;
279 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
286 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
287 @x[2], @x[3], @y[2], @y[3], @t[2]);
299 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
300 @x[6], @x[7], @y[2], @y[3], @t[2]);
305 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314 #;********************************************************************
315 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
316 #;********************************************************************
320 # direct optimizations from hardware
375 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
377 # new smaller inversion
411 # output in s3, s2, s1, t1
413 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
416 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
418 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
421 # AES linear components
427 pxor 0x00($key),@x[0]
428 pxor 0x10($key),@x[1]
430 pxor 0x20($key),@x[2]
432 pxor 0x30($key),@x[3]
434 pxor 0x40($key),@x[4]
436 pxor 0x50($key),@x[5]
438 pxor 0x60($key),@x[6]
440 pxor 0x70($key),@x[7]
448 # modified to emit output in order suitable for feeding back to aesenc[last]
452 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
453 pshufd \$0x93, @x[1], @t[1]
454 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
455 pshufd \$0x93, @x[2], @t[2]
457 pshufd \$0x93, @x[3], @t[3]
459 pshufd \$0x93, @x[4], @t[4]
461 pshufd \$0x93, @x[5], @t[5]
463 pshufd \$0x93, @x[6], @t[6]
465 pshufd \$0x93, @x[7], @t[7]
472 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
474 pshufd \$0x4E, @x[1], @x[1]
480 pshufd \$0x4E, @x[4], @t[0]
482 pshufd \$0x4E, @x[5], @t[1]
484 pshufd \$0x4E, @x[3], @x[4]
486 pshufd \$0x4E, @x[7], @x[5]
488 pshufd \$0x4E, @x[6], @x[3]
490 pshufd \$0x4E, @x[2], @x[6]
507 # multiplication by 0x0e
508 pshufd \$0x93, @x[7], @t[7]
510 pxor @x[5], @x[7] # 7 5
511 pxor @x[5], @x[2] # 2 5
512 pshufd \$0x93, @x[0], @t[0]
514 pxor @x[0], @x[5] # 5 0 [1]
515 pxor @x[1], @x[0] # 0 1
516 pshufd \$0x93, @x[1], @t[1]
517 pxor @x[2], @x[1] # 1 25
518 pxor @x[6], @x[0] # 01 6 [2]
519 pxor @x[3], @x[1] # 125 3 [4]
520 pshufd \$0x93, @x[3], @t[3]
521 pxor @x[0], @x[2] # 25 016 [3]
522 pxor @x[7], @x[3] # 3 75
523 pxor @x[6], @x[7] # 75 6 [0]
524 pshufd \$0x93, @x[6], @t[6]
526 pxor @x[4], @x[6] # 6 4
527 pxor @x[3], @x[4] # 4 375 [6]
528 pxor @x[7], @x[3] # 375 756=36
529 pxor @t[5], @x[6] # 64 5 [7]
530 pxor @t[2], @x[3] # 36 2
531 pxor @t[4], @x[3] # 362 4 [5]
532 pshufd \$0x93, @t[5], @t[5]
534 my @y = @x[7,5,0,2,1,3,4,6];
536 # multiplication by 0x0b
540 pshufd \$0x93, @t[2], @t[2]
544 pshufd \$0x93, @t[4], @t[4]
545 pxor @t[6], @t[7] # clobber t[7]
549 pshufd \$0x93, @t[0], @t[0]
553 pshufd \$0x93, @t[1], @t[1]
557 pshufd \$0x93, @t[2], @t[2]
561 pshufd \$0x93, @t[3], @t[3]
567 pxor @t[5], @t[7] # clobber t[7] even more
570 pshufd \$0x93, @t[4], @t[4]
575 pshufd \$0x93, @t[5], @t[5]
576 pxor @t[6], @t[7] # restore t[7]
578 # multiplication by 0x0d
581 pshufd \$0x93, @t[6], @t[6]
585 pshufd \$0x93, @t[7], @t[7]
594 pshufd \$0x93, @t[0], @t[0]
598 pshufd \$0x93, @t[1], @t[1]
603 pshufd \$0x93, @t[2], @t[2]
605 pxor @t[3], @t[6] # clobber t[6]
612 pshufd \$0x93, @t[4], @t[4]
615 pxor @t[3], @t[6] # restore t[6]
617 pshufd \$0x93, @t[5], @t[5]
618 pshufd \$0x93, @t[6], @t[6]
619 pshufd \$0x93, @t[7], @t[7]
620 pshufd \$0x93, @t[3], @t[3]
622 # multiplication by 0x09
624 pxor @y[1], @t[1] # t[1]=y[1]
625 pxor @t[5], @t[0] # clobber t[0]
628 pxor @y[0], @t[0] # t[0]=y[0]
630 pxor @t[7], @t[6] # clobber t[6]
633 pxor @y[4], @t[4] # t[4]=y[4]
635 pxor @y[3], @t[3] # t[3]=y[3]
637 pxor @y[2], @t[2] # t[2]=y[2]
639 pxor @y[5], @t[5] # t[5]=y[5]
642 pxor @y[6], @t[6] # t[6]=y[6]
643 pxor @y[7], @t[7] # t[7]=y[7]
656 sub aesenc { # not used
660 movdqa 0x30($const),@t[0] # .LSR
662 &ShiftRows (@b,@t[0]);
664 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
667 sub aesenclast { # not used
671 movdqa 0x40($const),@t[0] # .LSRM0
673 &ShiftRows (@b,@t[0]);
676 pxor 0x00($key),@b[0]
677 pxor 0x10($key),@b[1]
678 pxor 0x20($key),@b[4]
679 pxor 0x30($key),@b[6]
680 pxor 0x40($key),@b[3]
681 pxor 0x50($key),@b[7]
682 pxor 0x60($key),@b[2]
683 pxor 0x70($key),@b[5]
688 my ($a,$b,$n,$mask,$t)=@_;
700 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
720 my @x=reverse(@_[0..7]);
721 my ($t0,$t1,$t2,$t3)=@_[8..11];
723 movdqa 0x00($const),$t0 # .LBS0
724 movdqa 0x10($const),$t1 # .LBS1
726 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
727 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
729 movdqa 0x20($const),$t0 # .LBS2
731 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
732 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
734 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
735 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
744 .type _bsaes_encrypt8,\@abi-omnipotent
747 lea .LBS0(%rip), $const # constants table
749 movdqa ($key), @XMM[9] # round 0 key
751 movdqa 0x60($const), @XMM[8] # .LM0SR
752 pxor @XMM[9], @XMM[0] # xor with round0 key
753 pxor @XMM[9], @XMM[1]
754 pshufb @XMM[8], @XMM[0]
755 pxor @XMM[9], @XMM[2]
756 pshufb @XMM[8], @XMM[1]
757 pxor @XMM[9], @XMM[3]
758 pshufb @XMM[8], @XMM[2]
759 pxor @XMM[9], @XMM[4]
760 pshufb @XMM[8], @XMM[3]
761 pxor @XMM[9], @XMM[5]
762 pshufb @XMM[8], @XMM[4]
763 pxor @XMM[9], @XMM[6]
764 pshufb @XMM[8], @XMM[5]
765 pxor @XMM[9], @XMM[7]
766 pshufb @XMM[8], @XMM[6]
767 pshufb @XMM[8], @XMM[7]
768 _bsaes_encrypt8_bitslice:
770 &bitslice (@XMM[0..7, 8..11]);
777 &ShiftRows (@XMM[0..7, 8]);
778 $code.=".Lenc_sbox:\n";
779 &Sbox (@XMM[0..7, 8..15]);
784 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
786 movdqa 0x30($const), @XMM[8] # .LSR
788 movdqa 0x40($const), @XMM[8] # .LSRM0
793 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
794 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
796 movdqa ($key), @XMM[8] # last round key
797 pxor @XMM[8], @XMM[4]
798 pxor @XMM[8], @XMM[6]
799 pxor @XMM[8], @XMM[3]
800 pxor @XMM[8], @XMM[7]
801 pxor @XMM[8], @XMM[2]
802 pxor @XMM[8], @XMM[5]
803 pxor @XMM[8], @XMM[0]
804 pxor @XMM[8], @XMM[1]
806 .size _bsaes_encrypt8,.-_bsaes_encrypt8
808 .type _bsaes_decrypt8,\@abi-omnipotent
811 lea .LBS0(%rip), $const # constants table
813 movdqa ($key), @XMM[9] # round 0 key
815 movdqa -0x30($const), @XMM[8] # .LM0ISR
816 pxor @XMM[9], @XMM[0] # xor with round0 key
817 pxor @XMM[9], @XMM[1]
818 pshufb @XMM[8], @XMM[0]
819 pxor @XMM[9], @XMM[2]
820 pshufb @XMM[8], @XMM[1]
821 pxor @XMM[9], @XMM[3]
822 pshufb @XMM[8], @XMM[2]
823 pxor @XMM[9], @XMM[4]
824 pshufb @XMM[8], @XMM[3]
825 pxor @XMM[9], @XMM[5]
826 pshufb @XMM[8], @XMM[4]
827 pxor @XMM[9], @XMM[6]
828 pshufb @XMM[8], @XMM[5]
829 pxor @XMM[9], @XMM[7]
830 pshufb @XMM[8], @XMM[6]
831 pshufb @XMM[8], @XMM[7]
833 &bitslice (@XMM[0..7, 8..11]);
840 &ShiftRows (@XMM[0..7, 8]);
841 $code.=".Ldec_sbox:\n";
842 &InvSbox (@XMM[0..7, 8..15]);
847 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
849 movdqa -0x10($const), @XMM[8] # .LISR
851 movdqa -0x20($const), @XMM[8] # .LISRM0
856 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
858 movdqa ($key), @XMM[8] # last round key
859 pxor @XMM[8], @XMM[6]
860 pxor @XMM[8], @XMM[4]
861 pxor @XMM[8], @XMM[2]
862 pxor @XMM[8], @XMM[7]
863 pxor @XMM[8], @XMM[3]
864 pxor @XMM[8], @XMM[5]
865 pxor @XMM[8], @XMM[0]
866 pxor @XMM[8], @XMM[1]
868 .size _bsaes_decrypt8,.-_bsaes_decrypt8
872 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
875 my @x=reverse(@_[0..7]);
876 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
878 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
880 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
884 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
886 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
888 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
894 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
895 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
899 .type _bsaes_key_convert,\@abi-omnipotent
902 lea .LBS1(%rip), $const
903 movdqu ($inp), %xmm7 # load round 0 key
904 movdqa -0x10($const), %xmm8 # .LBS0
905 movdqa 0x00($const), %xmm9 # .LBS1
906 movdqa 0x10($const), %xmm10 # .LBS2
907 movdqa 0x40($const), %xmm13 # .LM0
908 movdqa 0x60($const), %xmm14 # .LNOT
910 movdqu 0x10($inp), %xmm6 # load round 1 key
912 movdqa %xmm7, ($out) # save round 0 key
918 pshufb %xmm13, %xmm6 # .LM0
921 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
923 pxor %xmm14, %xmm5 # "pnot"
928 movdqa %xmm0, 0x00($out) # write bit-sliced round key
929 movdqa %xmm1, 0x10($out)
930 movdqa %xmm2, 0x20($out)
931 movdqa %xmm3, 0x30($out)
932 movdqa %xmm4, 0x40($out)
933 movdqa %xmm5, 0x50($out)
934 movdqa %xmm6, 0x60($out)
935 movdqa %xmm7, 0x70($out)
937 movdqu ($inp), %xmm6 # load next round key
941 movdqa 0x70($const), %xmm7 # .L63
942 #movdqa %xmm6, ($out) # don't save last round key
944 .size _bsaes_key_convert,.-_bsaes_key_convert
# Benchmark-only wrappers: exported so standalone timing harnesses can call
# the raw 8x-block primitives directly; not part of the supported interface
# and omitted entirely on Win64 builds.
948 if (1 && !$win64) { # following four functions are unsupported interface
949 # used for benchmarking...
# bsaes_enc_key_convert(out, in): convert a conventional AES key schedule
# at `in` into the bit-sliced encryption schedule at `out`.
951 .globl bsaes_enc_key_convert
952 .type bsaes_enc_key_convert,\@function,2
954 bsaes_enc_key_convert:
# _bsaes_key_convert takes rounds in %r10d, key in %rcx, schedule in %rax.
955 mov 240($inp),%r10d # pass rounds
956 mov $inp,%rcx # pass key
957 mov $out,%rax # pass key schedule
958 call _bsaes_key_convert
# The converter leaves the last round key split across %xmm6/%xmm7;
# combine and store it to complete the encryption schedule.
959 pxor %xmm6,%xmm7 # fix up last round key
960 movdqa %xmm7,(%rax) # save last round key
962 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
964 .globl bsaes_encrypt_128
965 .type bsaes_encrypt_128,\@function,4
969 movdqu 0x00($inp), @XMM[0] # load input
970 movdqu 0x10($inp), @XMM[1]
971 movdqu 0x20($inp), @XMM[2]
972 movdqu 0x30($inp), @XMM[3]
973 movdqu 0x40($inp), @XMM[4]
974 movdqu 0x50($inp), @XMM[5]
975 movdqu 0x60($inp), @XMM[6]
976 movdqu 0x70($inp), @XMM[7]
977 mov $key, %rax # pass the $key
983 movdqu @XMM[0], 0x00($out) # write output
984 movdqu @XMM[1], 0x10($out)
985 movdqu @XMM[4], 0x20($out)
986 movdqu @XMM[6], 0x30($out)
987 movdqu @XMM[3], 0x40($out)
988 movdqu @XMM[7], 0x50($out)
989 movdqu @XMM[2], 0x60($out)
990 movdqu @XMM[5], 0x70($out)
995 .size bsaes_encrypt_128,.-bsaes_encrypt_128
# bsaes_dec_key_convert(out, in): convert a conventional AES key schedule
# at `in` into the bit-sliced decryption schedule at `out`.  Benchmark-only
# companion to bsaes_enc_key_convert above; same register contract.
997 .globl bsaes_dec_key_convert
998 .type bsaes_dec_key_convert,\@function,2
1000 bsaes_dec_key_convert:
1001 mov 240($inp),%r10d # pass rounds
1002 mov $inp,%rcx # pass key
1003 mov $out,%rax # pass key schedule
1004 call _bsaes_key_convert
# Decryption walks the schedule in reverse: adjust the round-0 key in place
# at the start of the schedule and store the untouched last round key at
# the position _bsaes_key_convert left in %rax.
1005 pxor ($out),%xmm7 # fix up round 0 key
1006 movdqa %xmm6,(%rax) # save last round key
1009 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1011 .globl bsaes_decrypt_128
1012 .type bsaes_decrypt_128,\@function,4
1016 movdqu 0x00($inp), @XMM[0] # load input
1017 movdqu 0x10($inp), @XMM[1]
1018 movdqu 0x20($inp), @XMM[2]
1019 movdqu 0x30($inp), @XMM[3]
1020 movdqu 0x40($inp), @XMM[4]
1021 movdqu 0x50($inp), @XMM[5]
1022 movdqu 0x60($inp), @XMM[6]
1023 movdqu 0x70($inp), @XMM[7]
1024 mov $key, %rax # pass the $key
1025 lea 0x80($inp), $inp
1028 call _bsaes_decrypt8
1030 movdqu @XMM[0], 0x00($out) # write output
1031 movdqu @XMM[1], 0x10($out)
1032 movdqu @XMM[6], 0x20($out)
1033 movdqu @XMM[4], 0x30($out)
1034 movdqu @XMM[2], 0x40($out)
1035 movdqu @XMM[7], 0x50($out)
1036 movdqu @XMM[3], 0x60($out)
1037 movdqu @XMM[5], 0x70($out)
1038 lea 0x80($out), $out
1042 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1046 ######################################################################
1050 my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64 ? ("%rcx","%rdx","%r8","%r9","%r10")
1051 : ("%rdi","%rsi","%rdx","%rcx","%r8");
1052 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1055 .globl bsaes_ecb_encrypt_blocks
1056 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1058 bsaes_ecb_encrypt_blocks:
1065 lea -0x48(%rsp),%rsp
1067 $code.=<<___ if ($win64);
1068 lea -0xa0(%rsp), %rsp
1069 movaps %xmm6, 0x40(%rsp)
1070 movaps %xmm7, 0x50(%rsp)
1071 movaps %xmm8, 0x60(%rsp)
1072 movaps %xmm9, 0x70(%rsp)
1073 movaps %xmm10, 0x80(%rsp)
1074 movaps %xmm11, 0x90(%rsp)
1075 movaps %xmm12, 0xa0(%rsp)
1076 movaps %xmm13, 0xb0(%rsp)
1077 movaps %xmm14, 0xc0(%rsp)
1078 movaps %xmm15, 0xd0(%rsp)
1082 mov %rsp,%rbp # backup %rsp
1083 mov 240($arg4),%eax # rounds
1084 mov $arg1,$inp # backup arguments
1091 mov %eax,%ebx # backup rounds
1092 shl \$7,%rax # 128 bytes per inner round key
1093 sub \$`128-32`,%rax # size of bit-sliced key schedule
1095 mov %rsp,%rax # pass key schedule
1096 mov $key,%rcx # pass key
1097 mov %ebx,%r10d # pass rounds
1098 call _bsaes_key_convert
1099 pxor %xmm6,%xmm7 # fix up last round key
1100 movdqa %xmm7,(%rax) # save last round key
1104 movdqu 0x00($inp), @XMM[0] # load input
1105 movdqu 0x10($inp), @XMM[1]
1106 movdqu 0x20($inp), @XMM[2]
1107 movdqu 0x30($inp), @XMM[3]
1108 movdqu 0x40($inp), @XMM[4]
1109 movdqu 0x50($inp), @XMM[5]
1110 mov %rsp, %rax # pass key schedule
1111 movdqu 0x60($inp), @XMM[6]
1112 mov %ebx,%r10d # pass rounds
1113 movdqu 0x70($inp), @XMM[7]
1114 lea 0x80($inp), $inp
1116 call _bsaes_encrypt8
1118 movdqu @XMM[0], 0x00($out) # write output
1119 movdqu @XMM[1], 0x10($out)
1120 movdqu @XMM[4], 0x20($out)
1121 movdqu @XMM[6], 0x30($out)
1122 movdqu @XMM[3], 0x40($out)
1123 movdqu @XMM[7], 0x50($out)
1124 movdqu @XMM[2], 0x60($out)
1125 movdqu @XMM[5], 0x70($out)
1126 lea 0x80($out), $out
1133 movdqu 0x00($inp), @XMM[0] # load input
1134 mov %rsp, %rax # pass key schedule
1135 mov %ebx,%r10d # pass rounds
1138 movdqu 0x10($inp), @XMM[1]
1140 movdqu 0x20($inp), @XMM[2]
1143 movdqu 0x30($inp), @XMM[3]
1145 movdqu 0x40($inp), @XMM[4]
1148 movdqu 0x50($inp), @XMM[5]
1150 movdqu 0x60($inp), @XMM[6]
1151 call _bsaes_encrypt8
1152 movdqu @XMM[0], 0x00($out) # write output
1153 movdqu @XMM[1], 0x10($out)
1154 movdqu @XMM[4], 0x20($out)
1155 movdqu @XMM[6], 0x30($out)
1156 movdqu @XMM[3], 0x40($out)
1157 movdqu @XMM[7], 0x50($out)
1158 movdqu @XMM[2], 0x60($out)
1162 call _bsaes_encrypt8
1163 movdqu @XMM[0], 0x00($out) # write output
1164 movdqu @XMM[1], 0x10($out)
1165 movdqu @XMM[4], 0x20($out)
1166 movdqu @XMM[6], 0x30($out)
1167 movdqu @XMM[3], 0x40($out)
1168 movdqu @XMM[7], 0x50($out)
1172 call _bsaes_encrypt8
1173 movdqu @XMM[0], 0x00($out) # write output
1174 movdqu @XMM[1], 0x10($out)
1175 movdqu @XMM[4], 0x20($out)
1176 movdqu @XMM[6], 0x30($out)
1177 movdqu @XMM[3], 0x40($out)
1181 call _bsaes_encrypt8
1182 movdqu @XMM[0], 0x00($out) # write output
1183 movdqu @XMM[1], 0x10($out)
1184 movdqu @XMM[4], 0x20($out)
1185 movdqu @XMM[6], 0x30($out)
1189 call _bsaes_encrypt8
1190 movdqu @XMM[0], 0x00($out) # write output
1191 movdqu @XMM[1], 0x10($out)
1192 movdqu @XMM[4], 0x20($out)
1196 call _bsaes_encrypt8
1197 movdqu @XMM[0], 0x00($out) # write output
1198 movdqu @XMM[1], 0x10($out)
1202 call _bsaes_encrypt8
1203 movdqu @XMM[0], 0x00($out) # write output
1219 .Lecb_enc_bzero: # wipe key schedule [if any]
1220 movdqa %xmm0, 0x00(%rax)
1221 movdqa %xmm0, 0x10(%rax)
1222 lea 0x20(%rax), %rax
1226 lea (%rbp),%rsp # restore %rsp
1228 $code.=<<___ if ($win64);
1229 movaps 0x40(%rbp), %xmm6
1230 movaps 0x50(%rbp), %xmm7
1231 movaps 0x60(%rbp), %xmm8
1232 movaps 0x70(%rbp), %xmm9
1233 movaps 0x80(%rbp), %xmm10
1234 movaps 0x90(%rbp), %xmm11
1235 movaps 0xa0(%rbp), %xmm12
1236 movaps 0xb0(%rbp), %xmm13
1237 movaps 0xc0(%rbp), %xmm14
1238 movaps 0xd0(%rbp), %xmm15
1239 lea 0xa0(%rbp), %rsp
1242 mov 0x48(%rsp), %r15
1243 mov 0x50(%rsp), %r14
1244 mov 0x58(%rsp), %r13
1245 mov 0x60(%rsp), %r12
1246 mov 0x68(%rsp), %rbx
1247 mov 0x70(%rsp), %rbp
1248 lea 0x78(%rsp), %rsp
1251 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1253 .globl bsaes_ctr32_encrypt_blocks
1254 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1256 bsaes_ctr32_encrypt_blocks:
1263 lea -0x48(%rsp), %rsp
1265 $code.=<<___ if ($win64);
1266 mov 0xa0(%rsp),$arg5 # pull ivp
1267 lea -0xa0(%rsp), %rsp
1268 movaps %xmm6, 0x40(%rsp)
1269 movaps %xmm7, 0x50(%rsp)
1270 movaps %xmm8, 0x60(%rsp)
1271 movaps %xmm9, 0x70(%rsp)
1272 movaps %xmm10, 0x80(%rsp)
1273 movaps %xmm11, 0x90(%rsp)
1274 movaps %xmm12, 0xa0(%rsp)
1275 movaps %xmm13, 0xb0(%rsp)
1276 movaps %xmm14, 0xc0(%rsp)
1277 movaps %xmm15, 0xd0(%rsp)
1281 mov %rsp, %rbp # backup %rsp
1282 movdqu ($arg5), %xmm0 # load counter
1283 mov 240($arg4), %eax # rounds
1284 mov $arg1, $inp # backup arguments
1288 movdqa %xmm0, 0x20(%rbp) # copy counter
1292 mov %eax, %ebx # rounds
1293 shl \$7, %rax # 128 bytes per inner round key
1294 sub \$`128-32`, %rax # size of bit-sliced key schedule
1297 mov %rsp, %rax # pass key schedule
1298 mov $key, %rcx # pass key
1299 mov %ebx, %r10d # pass rounds
1300 call _bsaes_key_convert
1301 pxor %xmm6,%xmm7 # fix up last round key
1302 movdqa %xmm7,(%rax) # save last round key
1304 movdqa (%rsp), @XMM[9] # load round0 key
1305 lea .LADD1(%rip), %r11
1306 movdqa 0x20(%rbp), @XMM[0] # counter copy
1307 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1308 pshufb @XMM[8], @XMM[9] # byte swap upper part
1309 pshufb @XMM[8], @XMM[0]
1310 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1314 movdqa @XMM[0], 0x20(%rbp) # save counter
1315 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1316 movdqa @XMM[0], @XMM[2]
1317 paddd 0x00(%r11), @XMM[1] # .LADD1
1318 movdqa @XMM[0], @XMM[3]
1319 paddd 0x10(%r11), @XMM[2] # .LADD2
1320 movdqa @XMM[0], @XMM[4]
1321 paddd 0x20(%r11), @XMM[3] # .LADD3
1322 movdqa @XMM[0], @XMM[5]
1323 paddd 0x30(%r11), @XMM[4] # .LADD4
1324 movdqa @XMM[0], @XMM[6]
1325 paddd 0x40(%r11), @XMM[5] # .LADD5
1326 movdqa @XMM[0], @XMM[7]
1327 paddd 0x50(%r11), @XMM[6] # .LADD6
1328 paddd 0x60(%r11), @XMM[7] # .LADD7
1330 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1331 # to flip byte order in 32-bit counter
1332 movdqa (%rsp), @XMM[9] # round 0 key
1333 lea 0x10(%rsp), %rax # pass key schedule
1334 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1335 pxor @XMM[9], @XMM[0] # xor with round0 key
1336 pxor @XMM[9], @XMM[1]
1337 pshufb @XMM[8], @XMM[0]
1338 pxor @XMM[9], @XMM[2]
1339 pshufb @XMM[8], @XMM[1]
1340 pxor @XMM[9], @XMM[3]
1341 pshufb @XMM[8], @XMM[2]
1342 pxor @XMM[9], @XMM[4]
1343 pshufb @XMM[8], @XMM[3]
1344 pxor @XMM[9], @XMM[5]
1345 pshufb @XMM[8], @XMM[4]
1346 pxor @XMM[9], @XMM[6]
1347 pshufb @XMM[8], @XMM[5]
1348 pxor @XMM[9], @XMM[7]
1349 pshufb @XMM[8], @XMM[6]
1350 lea .LBS0(%rip), %r11 # constants table
1351 pshufb @XMM[8], @XMM[7]
1352 mov %ebx,%r10d # pass rounds
1354 call _bsaes_encrypt8_bitslice
1357 jc .Lctr_enc_loop_done
1359 movdqu 0x00($inp), @XMM[8] # load input
1360 movdqu 0x10($inp), @XMM[9]
1361 movdqu 0x20($inp), @XMM[10]
1362 movdqu 0x30($inp), @XMM[11]
1363 movdqu 0x40($inp), @XMM[12]
1364 movdqu 0x50($inp), @XMM[13]
1365 movdqu 0x60($inp), @XMM[14]
1366 movdqu 0x70($inp), @XMM[15]
1368 pxor @XMM[0], @XMM[8]
1369 movdqa 0x20(%rbp), @XMM[0] # load counter
1370 pxor @XMM[9], @XMM[1]
1371 movdqu @XMM[8], 0x00($out) # write output
1372 pxor @XMM[10], @XMM[4]
1373 movdqu @XMM[1], 0x10($out)
1374 pxor @XMM[11], @XMM[6]
1375 movdqu @XMM[4], 0x20($out)
1376 pxor @XMM[12], @XMM[3]
1377 movdqu @XMM[6], 0x30($out)
1378 pxor @XMM[13], @XMM[7]
1379 movdqu @XMM[3], 0x40($out)
1380 pxor @XMM[14], @XMM[2]
1381 movdqu @XMM[7], 0x50($out)
1382 pxor @XMM[15], @XMM[5]
1383 movdqu @XMM[2], 0x60($out)
1384 lea .LADD1(%rip), %r11
1385 movdqu @XMM[5], 0x70($out)
1386 lea 0x80($out), $out
1387 paddd 0x70(%r11), @XMM[0] # .LADD8
1392 .Lctr_enc_loop_done:
1393 movdqu 0x00($inp), @XMM[8] # load input
1394 pxor @XMM[8], @XMM[0]
1395 movdqu @XMM[0], 0x00($out) # write output
1398 movdqu 0x10($inp), @XMM[9]
1399 pxor @XMM[9], @XMM[1]
1400 movdqu @XMM[1], 0x10($out)
1402 movdqu 0x20($inp), @XMM[10]
1403 pxor @XMM[10], @XMM[4]
1404 movdqu @XMM[4], 0x20($out)
1407 movdqu 0x30($inp), @XMM[11]
1408 pxor @XMM[11], @XMM[6]
1409 movdqu @XMM[6], 0x30($out)
1411 movdqu 0x40($inp), @XMM[12]
1412 pxor @XMM[12], @XMM[3]
1413 movdqu @XMM[3], 0x40($out)
1416 movdqu 0x50($inp), @XMM[13]
1417 pxor @XMM[13], @XMM[7]
1418 movdqu @XMM[7], 0x50($out)
1420 movdqu 0x60($inp), @XMM[14]
1421 pxor @XMM[14], @XMM[2]
1422 movdqu @XMM[2], 0x60($out)
1427 lea 0x20(%rbp), $arg1
1428 lea 0x30(%rbp), $arg2
1431 movdqu ($inp), @XMM[1]
1433 mov 0x2c(%rbp), %eax # load 32-bit counter
1435 pxor 0x30(%rbp), @XMM[1]
1436 inc %eax # increment
1437 movdqu @XMM[1], ($out)
1440 mov %eax, 0x2c(%rsp) # save 32-bit counter
1447 .Lctr_enc_bzero: # wipe key schedule [if any]
1448 movdqa %xmm0, 0x00(%rax)
1449 movdqa %xmm0, 0x10(%rax)
1450 lea 0x20(%rax), %rax
1454 lea (%rbp),%rsp # restore %rsp
1456 $code.=<<___ if ($win64);
1457 movaps 0x40(%rbp), %xmm6
1458 movaps 0x50(%rbp), %xmm7
1459 movaps 0x60(%rbp), %xmm8
1460 movaps 0x70(%rbp), %xmm9
1461 movaps 0x80(%rbp), %xmm10
1462 movaps 0x90(%rbp), %xmm11
1463 movaps 0xa0(%rbp), %xmm12
1464 movaps 0xb0(%rbp), %xmm13
1465 movaps 0xc0(%rbp), %xmm14
1466 movaps 0xd0(%rbp), %xmm15
1467 lea 0xa0(%rbp), %rsp
1470 mov 0x48(%rsp), %r15
1471 mov 0x50(%rsp), %r14
1472 mov 0x58(%rsp), %r13
1473 mov 0x60(%rsp), %r12
1474 mov 0x68(%rsp), %rbx
1475 mov 0x70(%rsp), %rbp
1476 lea 0x78(%rsp), %rsp
1479 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1484 .LM0ISR: # InvShiftRows constants
1485 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1487 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1489 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
1490 .LBS0: # bit-slice constants
1491 .quad 0x5555555555555555, 0x5555555555555555
1493 .quad 0x3333333333333333, 0x3333333333333333
1495 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1496 .LSR: # shiftrows constants
1497 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1499 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1501 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1503 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1504 .LNOT: # magic constants
1505 .quad 0xffffffffffffffff, 0xffffffffffffffff
1507 .quad 0x6363636363636363, 0x6363636363636363
1508 .LSWPUP: # byte-swap upper dword
1509 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1511 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
1512 .LADD1: # counter increment constants
1513 .quad 0x0000000000000000, 0x0000000100000000
1515 .quad 0x0000000000000000, 0x0000000200000000
1517 .quad 0x0000000000000000, 0x0000000300000000
1519 .quad 0x0000000000000000, 0x0000000400000000
1521 .quad 0x0000000000000000, 0x0000000500000000
1523 .quad 0x0000000000000000, 0x0000000600000000
1525 .quad 0x0000000000000000, 0x0000000700000000
1527 .quad 0x0000000000000000, 0x0000000800000000
1528 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
1532 $code =~ s/\`([^\`]*)\`/eval($1)/gem;