3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In an attempt to address the deterioration, sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure.
# Preamble: resolve the output/flavour command-line arguments, detect
# Windows x64 targets, locate the perlasm translator, and pipe all
# generated assembly through it.
# NOTE(review): the stray leading numbers on each code line come from a
# line-numbered extraction of this file; code is preserved byte-for-byte.
# A lone argument containing "." is the output file name, not a flavour.
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Windows x64 ABI is selected by nasm/masm/mingw64 flavours or an .asm output.
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Derive this script's directory, then search for x86_64-xlate.pl next to
# the script and in the conventional ../../perlasm location.
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
# All subsequent prints to STDOUT are filtered through the translator,
# which emits assembler syntax appropriate for $flavour.
98 open STDOUT,"| $^X $xlate $flavour $output";
# Register bindings used by the code-emitting subroutines below.
# NOTE(review): five names are bound to only four values here, leaving
# $ivp undef in this list — confirm against the original source; the
# CTR entry point later pulls its ivp through $arg5 instead.
100 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# xmm register rotation starting at %xmm15; comment below records the
# measured Atom benefit of this particular ordering.
101 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
# Scratch-register names for the inner assembly subroutines; note this
# re-binds $key (shadowing the %rcx binding above) to %rax.
104 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
107 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
108 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
113 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
114 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
118 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
141 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
142 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
162 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
163 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
167 &InvInBasisChange (@b);
168 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
169 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
172 sub InvInBasisChange { # OutBasisChange in reverse
173 my @b=@_[5,1,2,6,3,7,0,4];
191 sub InvOutBasisChange { # InBasisChange in reverse
192 my @b=@_[2,5,7,3,6,1,0,4];
213 #;*************************************************************
214 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
215 #;*************************************************************
216 my ($x0,$x1,$y0,$y1,$t0)=@_;
229 sub Mul_GF4_N { # not used, see next subroutine
230 # multiply and scale by N
231 my ($x0,$x1,$y0,$y1,$t0)=@_;
245 # interleaved Mul_GF4_N and Mul_GF4
246 my ($x0,$x1,$y0,$y1,$t0,
247 $x2,$x3,$y2,$y3,$t1)=@_;
275 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
282 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
283 @x[2], @x[3], @y[2], @y[3], @t[2]);
295 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
296 @x[6], @x[7], @y[2], @y[3], @t[2]);
301 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
310 #;********************************************************************
311 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
312 #;********************************************************************
316 # direct optimizations from hardware
371 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
373 # new smaller inversion
407 # output in s3, s2, s1, t1
409 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
411 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
412 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
414 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
417 # AES linear components
423 pxor 0x00($key),@x[0]
424 pxor 0x10($key),@x[1]
426 pxor 0x20($key),@x[2]
428 pxor 0x30($key),@x[3]
430 pxor 0x40($key),@x[4]
432 pxor 0x50($key),@x[5]
434 pxor 0x60($key),@x[6]
436 pxor 0x70($key),@x[7]
444 # modified to emit output in order suitable for feeding back to aesenc[last]
448 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
449 pshufd \$0x93, @x[1], @t[1]
450 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
451 pshufd \$0x93, @x[2], @t[2]
453 pshufd \$0x93, @x[3], @t[3]
455 pshufd \$0x93, @x[4], @t[4]
457 pshufd \$0x93, @x[5], @t[5]
459 pshufd \$0x93, @x[6], @t[6]
461 pshufd \$0x93, @x[7], @t[7]
468 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
470 pshufd \$0x4E, @x[1], @x[1]
476 pshufd \$0x4E, @x[4], @t[0]
478 pshufd \$0x4E, @x[5], @t[1]
480 pshufd \$0x4E, @x[3], @x[4]
482 pshufd \$0x4E, @x[7], @x[5]
484 pshufd \$0x4E, @x[6], @x[3]
486 pshufd \$0x4E, @x[2], @x[6]
503 pshufd \$0x93, @x[0], @t[0]
504 pshufd \$0x93, @x[1], @t[1]
506 pshufd \$0x93, @x[3], @t[3]
509 pshufd \$0x93, @x[6], @t[6]
510 pshufd \$0x93, @x[7], @t[7]
512 # multiplication by 0x0e
513 pxor @x[5], @x[7] # 7 5
514 pxor @x[5], @x[2] # 2 5
515 pxor @x[0], @x[5] # 5 0 [1]
516 pxor @x[1], @x[0] # 0 1
517 pxor @x[2], @x[1] # 1 25
518 pxor @x[6], @x[0] # 01 6 [2]
519 pxor @x[3], @x[1] # 125 3 [4]
520 pxor @x[0], @x[2] # 25 016 [3]
521 pxor @x[7], @x[3] # 3 75
522 pxor @x[6], @x[7] # 75 6 [0]
523 pxor @x[4], @x[6] # 6 4
524 pxor @x[3], @x[4] # 4 375 [6]
525 pxor @x[7], @x[3] # 375 756=36
526 pxor @t[5], @x[6] # 64 5 [7]
527 pshufd \$0x93, @t[5], @t[5]
528 pxor @t[2], @x[3] # 36 2
529 pshufd \$0x93, @t[2], @t[2]
530 pxor @t[4], @x[3] # 362 4 [5]
531 pshufd \$0x93, @t[4], @t[4]
533 my @y = @x[7,5,0,2,1,3,4,6];
535 # multiplication by 0x0b
539 pxor @t[7], @y[0] # 0^=057
540 pxor @y[0], @y[1] # 1^=057
542 pxor @t[6], @y[1] # 1^=057 16
544 pxor @t[6], @t[7] # clobber t[7]
548 pxor @t[7], @y[2] # 2^=12 67
553 pxor @t[5], @y[3] # 3^=0235
556 pxor @t[4], @y[7] # 7^=4 67
558 pxor @t[5], @t[7] # clobber t[7] even more
561 pxor @t[7], @y[6] # 6^=3 567
563 pxor @t[7], @y[5] # 5^=567
564 pxor @t[7], @y[4] # 4^=567
567 pxor @t[6], @t[7] # restore t[7]
570 pxor @t[4], @y[5] # 5^=24 567
574 pxor @t[4], @y[4] # 4^=134 567
576 pshufd \$0x93, @t[0], @t[0]
577 pshufd \$0x93, @t[1], @t[1]
578 pshufd \$0x93, @t[2], @t[2]
579 pshufd \$0x93, @t[3], @t[3]
580 pshufd \$0x93, @t[4], @t[4]
581 pshufd \$0x93, @t[5], @t[5]
582 pshufd \$0x93, @t[6], @t[6]
583 pshufd \$0x93, @t[7], @t[7]
585 # multiplication by 0x0d
588 pxor @t[6], @y[0] # 0^=056
593 pxor @t[7], @y[1] # 1^=157
594 pxor @y[1], @y[3] # 3^=157
598 pxor @t[6], @y[2] # 2^=026
600 pxor @t[3], @t[6] # clobber t[6]
603 pxor @t[6], @y[3] # 3^=0 36 157
608 pxor @t[7], @y[7] # 7^=457
609 pxor @y[7], @y[4] # 4^=457
611 pxor @t[2], @y[4] # 4^=12 457
615 pxor @t[6], @y[5] # 5^=25 36
619 pxor @t[7], @y[6] # 6^=47 36
620 pxor @t[3], @t[6] # restore t[6]
622 pshufd \$0x93, @t[0], @t[0]
623 pshufd \$0x93, @t[1], @t[1]
624 pshufd \$0x93, @t[2], @t[2]
625 pshufd \$0x93, @t[3], @t[3]
626 pshufd \$0x93, @t[4], @t[4]
627 pshufd \$0x93, @t[5], @t[5]
628 pshufd \$0x93, @t[6], @t[6]
629 pshufd \$0x93, @t[7], @t[7]
631 # multiplication by 0x09
635 pxor @t[6], @y[1] # 1^=156
636 pxor @y[1], @y[4] # 4^=156
637 pxor @t[4], @y[4] # 4^=4 156
639 pxor @t[7], @t[6] # clobber t[6]
640 pxor @t[5], @t[0] # clobber t[0]
642 pxor @t[0], @y[0] # 0^=05
645 pxor @t[7], @y[3] # 3^=05 37
648 pxor @t[6], @y[2] # 2^=2 67
652 pxor @t[6], @y[5] # 5^=25 67
655 pxor @t[6], @y[6] # 6^=3 67
658 pxor @t[7], @y[7] # 7^=47
680 sub aesenc { # not used
684 movdqa 0x30($const),@t[0] # .LSR
686 &ShiftRows (@b,@t[0]);
688 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
691 sub aesenclast { # not used
695 movdqa 0x40($const),@t[0] # .LSRM0
697 &ShiftRows (@b,@t[0]);
700 pxor 0x00($key),@b[0]
701 pxor 0x10($key),@b[1]
702 pxor 0x20($key),@b[4]
703 pxor 0x30($key),@b[6]
704 pxor 0x40($key),@b[3]
705 pxor 0x50($key),@b[7]
706 pxor 0x60($key),@b[2]
707 pxor 0x70($key),@b[5]
712 my ($a,$b,$n,$mask,$t)=@_;
724 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
744 my @x=reverse(@_[0..7]);
745 my ($t0,$t1,$t2,$t3)=@_[8..11];
747 movdqa 0x00($const),$t0 # .LBS0
748 movdqa 0x10($const),$t1 # .LBS1
750 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
751 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
753 movdqa 0x20($const),$t0 # .LBS2
755 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
756 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
758 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
759 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
768 .type _bsaes_encrypt8,\@abi-omnipotent
771 lea .LBS0(%rip), $const # constants table
773 movdqa ($key), @XMM[9] # round 0 key
775 movdqa 0x60($const), @XMM[8] # .LM0SR
776 pxor @XMM[9], @XMM[0] # xor with round0 key
777 pxor @XMM[9], @XMM[1]
778 pshufb @XMM[8], @XMM[0]
779 pxor @XMM[9], @XMM[2]
780 pshufb @XMM[8], @XMM[1]
781 pxor @XMM[9], @XMM[3]
782 pshufb @XMM[8], @XMM[2]
783 pxor @XMM[9], @XMM[4]
784 pshufb @XMM[8], @XMM[3]
785 pxor @XMM[9], @XMM[5]
786 pshufb @XMM[8], @XMM[4]
787 pxor @XMM[9], @XMM[6]
788 pshufb @XMM[8], @XMM[5]
789 pxor @XMM[9], @XMM[7]
790 pshufb @XMM[8], @XMM[6]
791 pshufb @XMM[8], @XMM[7]
792 _bsaes_encrypt8_bitslice:
794 &bitslice (@XMM[0..7, 8..11]);
801 &ShiftRows (@XMM[0..7, 8]);
802 $code.=".Lenc_sbox:\n";
803 &Sbox (@XMM[0..7, 8..15]);
808 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
810 movdqa 0x30($const), @XMM[8] # .LSR
812 movdqa 0x40($const), @XMM[8] # .LSRM0
817 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
818 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
820 movdqa ($key), @XMM[8] # last round key
821 pxor @XMM[8], @XMM[4]
822 pxor @XMM[8], @XMM[6]
823 pxor @XMM[8], @XMM[3]
824 pxor @XMM[8], @XMM[7]
825 pxor @XMM[8], @XMM[2]
826 pxor @XMM[8], @XMM[5]
827 pxor @XMM[8], @XMM[0]
828 pxor @XMM[8], @XMM[1]
830 .size _bsaes_encrypt8,.-_bsaes_encrypt8
832 .type _bsaes_decrypt8,\@abi-omnipotent
835 lea .LBS0(%rip), $const # constants table
837 movdqa ($key), @XMM[9] # round 0 key
839 movdqa -0x30($const), @XMM[8] # .LM0ISR
840 pxor @XMM[9], @XMM[0] # xor with round0 key
841 pxor @XMM[9], @XMM[1]
842 pshufb @XMM[8], @XMM[0]
843 pxor @XMM[9], @XMM[2]
844 pshufb @XMM[8], @XMM[1]
845 pxor @XMM[9], @XMM[3]
846 pshufb @XMM[8], @XMM[2]
847 pxor @XMM[9], @XMM[4]
848 pshufb @XMM[8], @XMM[3]
849 pxor @XMM[9], @XMM[5]
850 pshufb @XMM[8], @XMM[4]
851 pxor @XMM[9], @XMM[6]
852 pshufb @XMM[8], @XMM[5]
853 pxor @XMM[9], @XMM[7]
854 pshufb @XMM[8], @XMM[6]
855 pshufb @XMM[8], @XMM[7]
857 &bitslice (@XMM[0..7, 8..11]);
864 &ShiftRows (@XMM[0..7, 8]);
865 $code.=".Ldec_sbox:\n";
866 &InvSbox (@XMM[0..7, 8..15]);
871 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
873 movdqa -0x10($const), @XMM[8] # .LISR
875 movdqa -0x20($const), @XMM[8] # .LISRM0
880 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
882 movdqa ($key), @XMM[8] # last round key
883 pxor @XMM[8], @XMM[6]
884 pxor @XMM[8], @XMM[4]
885 pxor @XMM[8], @XMM[2]
886 pxor @XMM[8], @XMM[7]
887 pxor @XMM[8], @XMM[3]
888 pxor @XMM[8], @XMM[5]
889 pxor @XMM[8], @XMM[0]
890 pxor @XMM[8], @XMM[1]
892 .size _bsaes_decrypt8,.-_bsaes_decrypt8
896 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
899 my @x=reverse(@_[0..7]);
900 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
902 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
904 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
908 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
910 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
912 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
918 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
919 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
923 .type _bsaes_key_convert,\@abi-omnipotent
926 lea .LBS1(%rip), $const
927 movdqu ($inp), %xmm7 # load round 0 key
928 movdqa -0x10($const), %xmm8 # .LBS0
929 movdqa 0x00($const), %xmm9 # .LBS1
930 movdqa 0x10($const), %xmm10 # .LBS2
931 movdqa 0x40($const), %xmm13 # .LM0
932 movdqa 0x60($const), %xmm14 # .LNOT
934 movdqu 0x10($inp), %xmm6 # load round 1 key
936 movdqa %xmm7, ($out) # save round 0 key
942 pshufb %xmm13, %xmm6 # .LM0
945 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
947 pxor %xmm14, %xmm5 # "pnot"
952 movdqa %xmm0, 0x00($out) # write bit-sliced round key
953 movdqa %xmm1, 0x10($out)
954 movdqa %xmm2, 0x20($out)
955 movdqa %xmm3, 0x30($out)
956 movdqa %xmm4, 0x40($out)
957 movdqa %xmm5, 0x50($out)
958 movdqa %xmm6, 0x60($out)
959 movdqa %xmm7, 0x70($out)
961 movdqu ($inp), %xmm6 # load next round key
965 movdqa 0x70($const), %xmm7 # .L63
966 #movdqa %xmm6, ($out) # don't save last round key
968 .size _bsaes_key_convert,.-_bsaes_key_convert
972 if (1 && !$win64) { # following four functions are unsupported interface
973 # used for benchmarking...
975 .globl bsaes_enc_key_convert
976 .type bsaes_enc_key_convert,\@function,2
978 bsaes_enc_key_convert:
979 mov 240($inp),%r10d # pass rounds
980 mov $inp,%rcx # pass key
981 mov $out,%rax # pass key schedule
982 call _bsaes_key_convert
983 pxor %xmm6,%xmm7 # fix up last round key
984 movdqa %xmm7,(%rax) # save last round key
986 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
988 .globl bsaes_encrypt_128
989 .type bsaes_encrypt_128,\@function,4
993 movdqu 0x00($inp), @XMM[0] # load input
994 movdqu 0x10($inp), @XMM[1]
995 movdqu 0x20($inp), @XMM[2]
996 movdqu 0x30($inp), @XMM[3]
997 movdqu 0x40($inp), @XMM[4]
998 movdqu 0x50($inp), @XMM[5]
999 movdqu 0x60($inp), @XMM[6]
1000 movdqu 0x70($inp), @XMM[7]
1001 mov $key, %rax # pass the $key
1002 lea 0x80($inp), $inp
1005 call _bsaes_encrypt8
1007 movdqu @XMM[0], 0x00($out) # write output
1008 movdqu @XMM[1], 0x10($out)
1009 movdqu @XMM[4], 0x20($out)
1010 movdqu @XMM[6], 0x30($out)
1011 movdqu @XMM[3], 0x40($out)
1012 movdqu @XMM[7], 0x50($out)
1013 movdqu @XMM[2], 0x60($out)
1014 movdqu @XMM[5], 0x70($out)
1015 lea 0x80($out), $out
1019 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1021 .globl bsaes_dec_key_convert
1022 .type bsaes_dec_key_convert,\@function,2
1024 bsaes_dec_key_convert:
1025 mov 240($inp),%r10d # pass rounds
1026 mov $inp,%rcx # pass key
1027 mov $out,%rax # pass key schedule
1028 call _bsaes_key_convert
1029 pxor ($out),%xmm7 # fix up round 0 key
1030 movdqa %xmm6,(%rax) # save last round key
1033 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1035 .globl bsaes_decrypt_128
1036 .type bsaes_decrypt_128,\@function,4
1040 movdqu 0x00($inp), @XMM[0] # load input
1041 movdqu 0x10($inp), @XMM[1]
1042 movdqu 0x20($inp), @XMM[2]
1043 movdqu 0x30($inp), @XMM[3]
1044 movdqu 0x40($inp), @XMM[4]
1045 movdqu 0x50($inp), @XMM[5]
1046 movdqu 0x60($inp), @XMM[6]
1047 movdqu 0x70($inp), @XMM[7]
1048 mov $key, %rax # pass the $key
1049 lea 0x80($inp), $inp
1052 call _bsaes_decrypt8
1054 movdqu @XMM[0], 0x00($out) # write output
1055 movdqu @XMM[1], 0x10($out)
1056 movdqu @XMM[6], 0x20($out)
1057 movdqu @XMM[4], 0x30($out)
1058 movdqu @XMM[2], 0x40($out)
1059 movdqu @XMM[7], 0x50($out)
1060 movdqu @XMM[3], 0x60($out)
1061 movdqu @XMM[5], 0x70($out)
1062 lea 0x80($out), $out
1066 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1070 ######################################################################
1074 my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64 ? ("%rcx","%rdx","%r8","%r9","%r10")
1075 : ("%rdi","%rsi","%rdx","%rcx","%r8");
1076 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1079 .globl bsaes_ecb_encrypt_blocks
1080 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1082 bsaes_ecb_encrypt_blocks:
1089 lea -0x48(%rsp),%rsp
1091 $code.=<<___ if ($win64);
1092 lea -0xa0(%rsp), %rsp
1093 movaps %xmm6, 0x40(%rsp)
1094 movaps %xmm7, 0x50(%rsp)
1095 movaps %xmm8, 0x60(%rsp)
1096 movaps %xmm9, 0x70(%rsp)
1097 movaps %xmm10, 0x80(%rsp)
1098 movaps %xmm11, 0x90(%rsp)
1099 movaps %xmm12, 0xa0(%rsp)
1100 movaps %xmm13, 0xb0(%rsp)
1101 movaps %xmm14, 0xc0(%rsp)
1102 movaps %xmm15, 0xd0(%rsp)
1106 mov %rsp,%rbp # backup %rsp
1107 mov 240($arg4),%eax # rounds
1108 mov $arg1,$inp # backup arguments
1115 mov %eax,%ebx # backup rounds
1116 shl \$7,%rax # 128 bytes per inner round key
1117 sub \$`128-32`,%rax # size of bit-sliced key schedule
1119 mov %rsp,%rax # pass key schedule
1120 mov $key,%rcx # pass key
1121 mov %ebx,%r10d # pass rounds
1122 call _bsaes_key_convert
1123 pxor %xmm6,%xmm7 # fix up last round key
1124 movdqa %xmm7,(%rax) # save last round key
1128 movdqu 0x00($inp), @XMM[0] # load input
1129 movdqu 0x10($inp), @XMM[1]
1130 movdqu 0x20($inp), @XMM[2]
1131 movdqu 0x30($inp), @XMM[3]
1132 movdqu 0x40($inp), @XMM[4]
1133 movdqu 0x50($inp), @XMM[5]
1134 mov %rsp, %rax # pass key schedule
1135 movdqu 0x60($inp), @XMM[6]
1136 mov %ebx,%r10d # pass rounds
1137 movdqu 0x70($inp), @XMM[7]
1138 lea 0x80($inp), $inp
1140 call _bsaes_encrypt8
1142 movdqu @XMM[0], 0x00($out) # write output
1143 movdqu @XMM[1], 0x10($out)
1144 movdqu @XMM[4], 0x20($out)
1145 movdqu @XMM[6], 0x30($out)
1146 movdqu @XMM[3], 0x40($out)
1147 movdqu @XMM[7], 0x50($out)
1148 movdqu @XMM[2], 0x60($out)
1149 movdqu @XMM[5], 0x70($out)
1150 lea 0x80($out), $out
1157 movdqu 0x00($inp), @XMM[0] # load input
1158 mov %rsp, %rax # pass key schedule
1159 mov %ebx,%r10d # pass rounds
1162 movdqu 0x10($inp), @XMM[1]
1164 movdqu 0x20($inp), @XMM[2]
1167 movdqu 0x30($inp), @XMM[3]
1169 movdqu 0x40($inp), @XMM[4]
1172 movdqu 0x50($inp), @XMM[5]
1174 movdqu 0x60($inp), @XMM[6]
1175 call _bsaes_encrypt8
1176 movdqu @XMM[0], 0x00($out) # write output
1177 movdqu @XMM[1], 0x10($out)
1178 movdqu @XMM[4], 0x20($out)
1179 movdqu @XMM[6], 0x30($out)
1180 movdqu @XMM[3], 0x40($out)
1181 movdqu @XMM[7], 0x50($out)
1182 movdqu @XMM[2], 0x60($out)
1186 call _bsaes_encrypt8
1187 movdqu @XMM[0], 0x00($out) # write output
1188 movdqu @XMM[1], 0x10($out)
1189 movdqu @XMM[4], 0x20($out)
1190 movdqu @XMM[6], 0x30($out)
1191 movdqu @XMM[3], 0x40($out)
1192 movdqu @XMM[7], 0x50($out)
1196 call _bsaes_encrypt8
1197 movdqu @XMM[0], 0x00($out) # write output
1198 movdqu @XMM[1], 0x10($out)
1199 movdqu @XMM[4], 0x20($out)
1200 movdqu @XMM[6], 0x30($out)
1201 movdqu @XMM[3], 0x40($out)
1205 call _bsaes_encrypt8
1206 movdqu @XMM[0], 0x00($out) # write output
1207 movdqu @XMM[1], 0x10($out)
1208 movdqu @XMM[4], 0x20($out)
1209 movdqu @XMM[6], 0x30($out)
1213 call _bsaes_encrypt8
1214 movdqu @XMM[0], 0x00($out) # write output
1215 movdqu @XMM[1], 0x10($out)
1216 movdqu @XMM[4], 0x20($out)
1220 call _bsaes_encrypt8
1221 movdqu @XMM[0], 0x00($out) # write output
1222 movdqu @XMM[1], 0x10($out)
1226 call _bsaes_encrypt8
1227 movdqu @XMM[0], 0x00($out) # write output
1243 .Lecb_enc_bzero: # wipe key schedule [if any]
1244 movdqa %xmm0, 0x00(%rax)
1245 movdqa %xmm0, 0x10(%rax)
1246 lea 0x20(%rax), %rax
1250 lea (%rbp),%rsp # restore %rsp
1252 $code.=<<___ if ($win64);
1253 movaps 0x40(%rbp), %xmm6
1254 movaps 0x50(%rbp), %xmm7
1255 movaps 0x60(%rbp), %xmm8
1256 movaps 0x70(%rbp), %xmm9
1257 movaps 0x80(%rbp), %xmm10
1258 movaps 0x90(%rbp), %xmm11
1259 movaps 0xa0(%rbp), %xmm12
1260 movaps 0xb0(%rbp), %xmm13
1261 movaps 0xc0(%rbp), %xmm14
1262 movaps 0xd0(%rbp), %xmm15
1263 lea 0xa0(%rbp), %rsp
1266 mov 0x48(%rsp), %r15
1267 mov 0x50(%rsp), %r14
1268 mov 0x58(%rsp), %r13
1269 mov 0x60(%rsp), %r12
1270 mov 0x68(%rsp), %rbx
1271 mov 0x70(%rsp), %rbp
1272 lea 0x78(%rsp), %rsp
1275 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1277 .globl bsaes_ctr32_encrypt_blocks
1278 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1280 bsaes_ctr32_encrypt_blocks:
1287 lea -0x48(%rsp), %rsp
1289 $code.=<<___ if ($win64);
1290 mov 0xa0(%rsp),$arg5 # pull ivp
1291 lea -0xa0(%rsp), %rsp
1292 movaps %xmm6, 0x40(%rsp)
1293 movaps %xmm7, 0x50(%rsp)
1294 movaps %xmm8, 0x60(%rsp)
1295 movaps %xmm9, 0x70(%rsp)
1296 movaps %xmm10, 0x80(%rsp)
1297 movaps %xmm11, 0x90(%rsp)
1298 movaps %xmm12, 0xa0(%rsp)
1299 movaps %xmm13, 0xb0(%rsp)
1300 movaps %xmm14, 0xc0(%rsp)
1301 movaps %xmm15, 0xd0(%rsp)
1305 mov %rsp, %rbp # backup %rsp
1306 movdqu ($arg5), %xmm0 # load counter
1307 mov 240($arg4), %eax # rounds
1308 mov $arg1, $inp # backup arguments
1312 movdqa %xmm0, 0x20(%rbp) # copy counter
1316 mov %eax, %ebx # rounds
1317 shl \$7, %rax # 128 bytes per inner round key
1318 sub \$`128-32`, %rax # size of bit-sliced key schedule
1321 mov %rsp, %rax # pass key schedule
1322 mov $key, %rcx # pass key
1323 mov %ebx, %r10d # pass rounds
1324 call _bsaes_key_convert
1325 pxor %xmm6,%xmm7 # fix up last round key
1326 movdqa %xmm7,(%rax) # save last round key
1328 movdqa (%rsp), @XMM[9] # load round0 key
1329 lea .LADD1(%rip), %r11
1330 movdqa 0x20(%rbp), @XMM[0] # counter copy
1331 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1332 pshufb @XMM[8], @XMM[9] # byte swap upper part
1333 pshufb @XMM[8], @XMM[0]
1334 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1338 movdqa @XMM[0], 0x20(%rbp) # save counter
1339 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1340 movdqa @XMM[0], @XMM[2]
1341 paddd 0x00(%r11), @XMM[1] # .LADD1
1342 movdqa @XMM[0], @XMM[3]
1343 paddd 0x10(%r11), @XMM[2] # .LADD2
1344 movdqa @XMM[0], @XMM[4]
1345 paddd 0x20(%r11), @XMM[3] # .LADD3
1346 movdqa @XMM[0], @XMM[5]
1347 paddd 0x30(%r11), @XMM[4] # .LADD4
1348 movdqa @XMM[0], @XMM[6]
1349 paddd 0x40(%r11), @XMM[5] # .LADD5
1350 movdqa @XMM[0], @XMM[7]
1351 paddd 0x50(%r11), @XMM[6] # .LADD6
1352 paddd 0x60(%r11), @XMM[7] # .LADD7
1354 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1355 # to flip byte order in 32-bit counter
1356 movdqa (%rsp), @XMM[9] # round 0 key
1357 lea 0x10(%rsp), %rax # pass key schedule
1358 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1359 pxor @XMM[9], @XMM[0] # xor with round0 key
1360 pxor @XMM[9], @XMM[1]
1361 pshufb @XMM[8], @XMM[0]
1362 pxor @XMM[9], @XMM[2]
1363 pshufb @XMM[8], @XMM[1]
1364 pxor @XMM[9], @XMM[3]
1365 pshufb @XMM[8], @XMM[2]
1366 pxor @XMM[9], @XMM[4]
1367 pshufb @XMM[8], @XMM[3]
1368 pxor @XMM[9], @XMM[5]
1369 pshufb @XMM[8], @XMM[4]
1370 pxor @XMM[9], @XMM[6]
1371 pshufb @XMM[8], @XMM[5]
1372 pxor @XMM[9], @XMM[7]
1373 pshufb @XMM[8], @XMM[6]
1374 lea .LBS0(%rip), %r11 # constants table
1375 pshufb @XMM[8], @XMM[7]
1376 mov %ebx,%r10d # pass rounds
1378 call _bsaes_encrypt8_bitslice
1381 jc .Lctr_enc_loop_done
1383 movdqu 0x00($inp), @XMM[8] # load input
1384 movdqu 0x10($inp), @XMM[9]
1385 movdqu 0x20($inp), @XMM[10]
1386 movdqu 0x30($inp), @XMM[11]
1387 movdqu 0x40($inp), @XMM[12]
1388 movdqu 0x50($inp), @XMM[13]
1389 movdqu 0x60($inp), @XMM[14]
1390 movdqu 0x70($inp), @XMM[15]
1392 pxor @XMM[0], @XMM[8]
1393 movdqa 0x20(%rbp), @XMM[0] # load counter
1394 pxor @XMM[9], @XMM[1]
1395 movdqu @XMM[8], 0x00($out) # write output
1396 pxor @XMM[10], @XMM[4]
1397 movdqu @XMM[1], 0x10($out)
1398 pxor @XMM[11], @XMM[6]
1399 movdqu @XMM[4], 0x20($out)
1400 pxor @XMM[12], @XMM[3]
1401 movdqu @XMM[6], 0x30($out)
1402 pxor @XMM[13], @XMM[7]
1403 movdqu @XMM[3], 0x40($out)
1404 pxor @XMM[14], @XMM[2]
1405 movdqu @XMM[7], 0x50($out)
1406 pxor @XMM[15], @XMM[5]
1407 movdqu @XMM[2], 0x60($out)
1408 lea .LADD1(%rip), %r11
1409 movdqu @XMM[5], 0x70($out)
1410 lea 0x80($out), $out
1411 paddd 0x70(%r11), @XMM[0] # .LADD8
1416 .Lctr_enc_loop_done:
1417 movdqu 0x00($inp), @XMM[8] # load input
1418 pxor @XMM[8], @XMM[0]
1419 movdqu @XMM[0], 0x00($out) # write output
1422 movdqu 0x10($inp), @XMM[9]
1423 pxor @XMM[9], @XMM[1]
1424 movdqu @XMM[1], 0x10($out)
1426 movdqu 0x20($inp), @XMM[10]
1427 pxor @XMM[10], @XMM[4]
1428 movdqu @XMM[4], 0x20($out)
1431 movdqu 0x30($inp), @XMM[11]
1432 pxor @XMM[11], @XMM[6]
1433 movdqu @XMM[6], 0x30($out)
1435 movdqu 0x40($inp), @XMM[12]
1436 pxor @XMM[12], @XMM[3]
1437 movdqu @XMM[3], 0x40($out)
1440 movdqu 0x50($inp), @XMM[13]
1441 pxor @XMM[13], @XMM[7]
1442 movdqu @XMM[7], 0x50($out)
1444 movdqu 0x60($inp), @XMM[14]
1445 pxor @XMM[14], @XMM[2]
1446 movdqu @XMM[2], 0x60($out)
1451 lea 0x20(%rbp), $arg1
1452 lea 0x30(%rbp), $arg2
1455 movdqu ($inp), @XMM[1]
1457 mov 0x2c(%rbp), %eax # load 32-bit counter
1459 pxor 0x30(%rbp), @XMM[1]
1460 inc %eax # increment
1461 movdqu @XMM[1], ($out)
1464 mov %eax, 0x2c(%rsp) # save 32-bit counter
1471 .Lctr_enc_bzero: # wipe key schedule [if any]
1472 movdqa %xmm0, 0x00(%rax)
1473 movdqa %xmm0, 0x10(%rax)
1474 lea 0x20(%rax), %rax
1478 lea (%rbp),%rsp # restore %rsp
1480 $code.=<<___ if ($win64);
1481 movaps 0x40(%rbp), %xmm6
1482 movaps 0x50(%rbp), %xmm7
1483 movaps 0x60(%rbp), %xmm8
1484 movaps 0x70(%rbp), %xmm9
1485 movaps 0x80(%rbp), %xmm10
1486 movaps 0x90(%rbp), %xmm11
1487 movaps 0xa0(%rbp), %xmm12
1488 movaps 0xb0(%rbp), %xmm13
1489 movaps 0xc0(%rbp), %xmm14
1490 movaps 0xd0(%rbp), %xmm15
1491 lea 0xa0(%rbp), %rsp
1494 mov 0x48(%rsp), %r15
1495 mov 0x50(%rsp), %r14
1496 mov 0x58(%rsp), %r13
1497 mov 0x60(%rsp), %r12
1498 mov 0x68(%rsp), %rbx
1499 mov 0x70(%rsp), %rbp
1500 lea 0x78(%rsp), %rsp
1503 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1508 .LM0ISR: # InvShiftRows constants
1509 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1511 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1513 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
1514 .LBS0: # bit-slice constants
1515 .quad 0x5555555555555555, 0x5555555555555555
1517 .quad 0x3333333333333333, 0x3333333333333333
1519 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1520 .LSR: # shiftrows constants
1521 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1523 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1525 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1527 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1528 .LNOT: # magic constants
1529 .quad 0xffffffffffffffff, 0xffffffffffffffff
1531 .quad 0x6363636363636363, 0x6363636363636363
1532 .LSWPUP: # byte-swap upper dword
1533 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1535 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
1536 .LADD1: # counter increment constants
1537 .quad 0x0000000000000000, 0x0000000100000000
1539 .quad 0x0000000000000000, 0x0000000200000000
1541 .quad 0x0000000000000000, 0x0000000300000000
1543 .quad 0x0000000000000000, 0x0000000400000000
1545 .quad 0x0000000000000000, 0x0000000500000000
1547 .quad 0x0000000000000000, 0x0000000600000000
1549 .quad 0x0000000000000000, 0x0000000700000000
1551 .quad 0x0000000000000000, 0x0000000800000000
1552 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
1556 $code =~ s/\`([^\`]*)\`/eval($1)/gem;