3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as a transliteration of the original code to "perlasm", it has
18 # undergone the following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - the above was possible thanks to a mixcolumns() modification that
24 # allows its output to be fed back to aesenc[last]; this was
25 # achieved at the cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement a key setup subroutine; instead it
28 # relies on conversion of the "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which made it
31 # possible to skip one shiftrows(), reduce the bit-sliced key schedule
32 # and speed up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of a 4096-byte buffer with a 128-bit key is:
38 #              Emilia's    this(*)    difference
40 # Core 2       9.30        8.69       +7%
41 # Nehalem(**)  7.63        6.98       +9%
42 # Atom         17.1        17.4       -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter value calculation
46 # and xor-ing of input, as in Emilia's CTR implementation, is
47 # performed. However, the CTR calculations account for no more
48 # than 1% of total time, so the comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) The slowdown on Atom is rather strange per se, because the original
54 # implementation has a number of 9+-byte instructions, which
55 # are bad for the Atom front-end, and which I eliminated completely.
56 # In an attempt to address the deterioration, sbox() was tested in the FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While this resulted in a nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for the key schedule conversion subroutine: the interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally has an
63 # impact on performance, especially for short inputs. Conversion time in
64 # CPU cycles and its ratio to CPU cycles spent in the 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks 12-16% slower, 384-byte blocks 8-11%,
74 # etc. Also keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially the shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272-byte ones 29% slower, 400-byte ones 22%, etc. Yet, despite all
78 # these "shortcomings", it's still faster than the ["hyper-threading-safe"
79 # code path in] aes-x86_64.pl for all lengths above 64 bytes...
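#
# A rough way to read these numbers (an illustrative model only, not a
# measurement; "convert" and "block8" stand for the one-off key schedule
# conversion cost and the cost of one 8-block pass):
#
#	cycles(len) ~= convert + ceil(len/128)*block8
#
# i.e. the fixed conversion cost is amortized over the number of 128-byte
# chunks a single call covers, while a trailing partial chunk costs about
# as much as a full one, which is where the extra penalty for sizes such
# as 144 or 272 bytes comes from.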
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of a 4096-byte buffer with a 128-bit key is:
91 # Add bsaes_xts_[en|de]crypt. Small-block performance is suboptimal,
92 # but XTS is meant to be used with larger blocks...
98 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
103 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
104 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
105 die "can't locate x86_64-xlate.pl";
107 open STDOUT,"| $^X $xlate $flavour $output";
109 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
110 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
117 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
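# For orientation: the S-box follows the Käsper-Schwabe construction --
# InBasisChange maps each byte into the tower-field representation used by
# Inv_GF256, the inversion is performed there, and OutBasisChange maps back;
# the permuted output order above merely records which register ends up
# holding which bit after the in-place computation, and is compensated for
# by the callers.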
122 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
123 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
128 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
150 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
151 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
171 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
172 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176 &InvInBasisChange (@b);
177 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
178 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
181 sub InvInBasisChange { # OutBasisChange in reverse
182 my @b=@_[5,1,2,6,3,7,0,4];
200 sub InvOutBasisChange { # InBasisChange in reverse
201 my @b=@_[2,5,7,3,6,1,0,4];
222 #;*************************************************************
223 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
224 #;*************************************************************
225 my ($x0,$x1,$y0,$y1,$t0)=@_;
238 sub Mul_GF4_N { # not used, see next subroutine
239 # multiply and scale by N
240 my ($x0,$x1,$y0,$y1,$t0)=@_;
254 # interleaved Mul_GF4_N and Mul_GF4
255 my ($x0,$x1,$y0,$y1,$t0,
256 $x2,$x3,$y2,$y3,$t1)=@_;
284 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
291 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
292 @x[2], @x[3], @y[2], @y[3], @t[2]);
304 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
305 @x[6], @x[7], @y[2], @y[3], @t[2]);
310 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
319 #;********************************************************************
320 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
321 #;********************************************************************
325 # direct optimizations from hardware
380 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
382 # new smaller inversion
416 # output in s3, s2, s1, t1
418 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
420 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
421 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
423 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426 # AES linear components
432 pxor 0x00($key),@x[0]
433 pxor 0x10($key),@x[1]
435 pxor 0x20($key),@x[2]
437 pxor 0x30($key),@x[3]
439 pxor 0x40($key),@x[4]
441 pxor 0x50($key),@x[5]
443 pxor 0x60($key),@x[6]
445 pxor 0x70($key),@x[7]
453 # modified to emit output in order suitable for feeding back to aesenc[last]
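# A rough reading of the code below: every 128-bit register holds one bit
# position of all sixteen state bytes, packed so that rotating a slice by
# 32 bits (pshufd 0x93) aligns each byte with the next byte of its
# MixColumns column, and rotating by 64 bits (pshufd 0x4E) with the byte
# two positions further.  MixColumns then reduces to xors of such rotated
# slices, the multiply-by-2 (xtime) terms appearing as xors between
# neighbouring bit slices; the modification mentioned above only changes
# which register receives which output slice.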
457 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
458 pshufd \$0x93, @x[1], @t[1]
459 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
460 pshufd \$0x93, @x[2], @t[2]
462 pshufd \$0x93, @x[3], @t[3]
464 pshufd \$0x93, @x[4], @t[4]
466 pshufd \$0x93, @x[5], @t[5]
468 pshufd \$0x93, @x[6], @t[6]
470 pshufd \$0x93, @x[7], @t[7]
477 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
479 pshufd \$0x4E, @x[1], @x[1]
485 pshufd \$0x4E, @x[4], @t[0]
487 pshufd \$0x4E, @x[5], @t[1]
489 pshufd \$0x4E, @x[3], @x[4]
491 pshufd \$0x4E, @x[7], @x[5]
493 pshufd \$0x4E, @x[6], @x[3]
495 pshufd \$0x4E, @x[2], @x[6]
512 # multiplication by 0x0e
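# (InvMixColumns multiplies every state column by the circulant matrix
# built from 0e 0b 0d 09; it is carried out coefficient by coefficient --
# this 0x0e pass, then the 0x0b, 0x0d and 0x09 passes below -- each
# expressed as xors of 32-bit-rotated slices, analogously to MixColumns.)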
513 pshufd \$0x93, @x[7], @t[7]
515 pxor @x[5], @x[7] # 7 5
516 pxor @x[5], @x[2] # 2 5
517 pshufd \$0x93, @x[0], @t[0]
519 pxor @x[0], @x[5] # 5 0 [1]
520 pxor @x[1], @x[0] # 0 1
521 pshufd \$0x93, @x[1], @t[1]
522 pxor @x[2], @x[1] # 1 25
523 pxor @x[6], @x[0] # 01 6 [2]
524 pxor @x[3], @x[1] # 125 3 [4]
525 pshufd \$0x93, @x[3], @t[3]
526 pxor @x[0], @x[2] # 25 016 [3]
527 pxor @x[7], @x[3] # 3 75
528 pxor @x[6], @x[7] # 75 6 [0]
529 pshufd \$0x93, @x[6], @t[6]
531 pxor @x[4], @x[6] # 6 4
532 pxor @x[3], @x[4] # 4 375 [6]
533 pxor @x[7], @x[3] # 375 756=36
534 pxor @t[5], @x[6] # 64 5 [7]
535 pxor @t[2], @x[3] # 36 2
536 pxor @t[4], @x[3] # 362 4 [5]
537 pshufd \$0x93, @t[5], @t[5]
539 my @y = @x[7,5,0,2,1,3,4,6];
541 # multiplication by 0x0b
545 pshufd \$0x93, @t[2], @t[2]
549 pshufd \$0x93, @t[4], @t[4]
550 pxor @t[6], @t[7] # clobber t[7]
554 pshufd \$0x93, @t[0], @t[0]
558 pshufd \$0x93, @t[1], @t[1]
562 pshufd \$0x93, @t[2], @t[2]
566 pshufd \$0x93, @t[3], @t[3]
572 pxor @t[5], @t[7] # clobber t[7] even more
575 pshufd \$0x93, @t[4], @t[4]
580 pshufd \$0x93, @t[5], @t[5]
581 pxor @t[6], @t[7] # restore t[7]
583 # multiplication by 0x0d
586 pshufd \$0x93, @t[6], @t[6]
590 pshufd \$0x93, @t[7], @t[7]
599 pshufd \$0x93, @t[0], @t[0]
603 pshufd \$0x93, @t[1], @t[1]
608 pshufd \$0x93, @t[2], @t[2]
610 pxor @t[3], @t[6] # clobber t[6]
617 pshufd \$0x93, @t[4], @t[4]
620 pxor @t[3], @t[6] # restore t[6]
622 pshufd \$0x93, @t[5], @t[5]
623 pshufd \$0x93, @t[6], @t[6]
624 pshufd \$0x93, @t[7], @t[7]
625 pshufd \$0x93, @t[3], @t[3]
627 # multiplication by 0x09
629 pxor @y[1], @t[1] # t[1]=y[1]
630 pxor @t[5], @t[0] # clobber t[0]
633 pxor @y[0], @t[0] # t[0]=y[0]
635 pxor @t[7], @t[6] # clobber t[6]
638 pxor @y[4], @t[4] # t[4]=y[4]
640 pxor @y[3], @t[3] # t[3]=y[3]
642 pxor @y[2], @t[2] # t[2]=y[2]
644 pxor @y[5], @t[5] # t[5]=y[5]
647 pxor @y[6], @t[6] # t[6]=y[6]
648 pxor @y[7], @t[7] # t[7]=y[7]
661 sub aesenc { # not used
665 movdqa 0x30($const),@t[0] # .LSR
667 &ShiftRows (@b,@t[0]);
669 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
672 sub aesenclast { # not used
676 movdqa 0x40($const),@t[0] # .LSRM0
678 &ShiftRows (@b,@t[0]);
681 pxor 0x00($key),@b[0]
682 pxor 0x10($key),@b[1]
683 pxor 0x20($key),@b[4]
684 pxor 0x30($key),@b[6]
685 pxor 0x40($key),@b[3]
686 pxor 0x50($key),@b[7]
687 pxor 0x60($key),@b[2]
688 pxor 0x70($key),@b[5]
693 my ($a,$b,$n,$mask,$t)=@_;
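# What swapmove computes, assuming the usual swapmove idiom (sketch only):
#
#	t  = ((b >> n) ^ a) & mask;	# differing bits, restricted to mask
#	a ^= t;
#	b ^= t << n;
#
# i.e. the mask-selected bits of a are exchanged with the corresponding
# bits of b shifted by n -- the elementary step of the bit-matrix
# transposition performed by bitslice().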
705 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
725 my @x=reverse(@_[0..7]);
726 my ($t0,$t1,$t2,$t3)=@_[8..11];
728 movdqa 0x00($const),$t0 # .LBS0
729 movdqa 0x10($const),$t1 # .LBS1
731 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
732 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
734 movdqa 0x20($const),$t0 # .LBS2
736 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
737 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
739 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
740 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
749 .type _bsaes_encrypt8,\@abi-omnipotent
752 lea .LBS0(%rip), $const # constants table
754 movdqa ($key), @XMM[9] # round 0 key
756 movdqa 0x60($const), @XMM[8] # .LM0SR
757 pxor @XMM[9], @XMM[0] # xor with round0 key
758 pxor @XMM[9], @XMM[1]
759 pshufb @XMM[8], @XMM[0]
760 pxor @XMM[9], @XMM[2]
761 pshufb @XMM[8], @XMM[1]
762 pxor @XMM[9], @XMM[3]
763 pshufb @XMM[8], @XMM[2]
764 pxor @XMM[9], @XMM[4]
765 pshufb @XMM[8], @XMM[3]
766 pxor @XMM[9], @XMM[5]
767 pshufb @XMM[8], @XMM[4]
768 pxor @XMM[9], @XMM[6]
769 pshufb @XMM[8], @XMM[5]
770 pxor @XMM[9], @XMM[7]
771 pshufb @XMM[8], @XMM[6]
772 pshufb @XMM[8], @XMM[7]
773 _bsaes_encrypt8_bitslice:
775 &bitslice (@XMM[0..7, 8..11]);
782 &ShiftRows (@XMM[0..7, 8]);
783 $code.=".Lenc_sbox:\n";
784 &Sbox (@XMM[0..7, 8..15]);
789 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
791 movdqa 0x30($const), @XMM[8] # .LSR
793 movdqa 0x40($const), @XMM[8] # .LSRM0
798 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
799 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
801 movdqa ($key), @XMM[8] # last round key
802 pxor @XMM[8], @XMM[4]
803 pxor @XMM[8], @XMM[6]
804 pxor @XMM[8], @XMM[3]
805 pxor @XMM[8], @XMM[7]
806 pxor @XMM[8], @XMM[2]
807 pxor @XMM[8], @XMM[5]
808 pxor @XMM[8], @XMM[0]
809 pxor @XMM[8], @XMM[1]
811 .size _bsaes_encrypt8,.-_bsaes_encrypt8
813 .type _bsaes_decrypt8,\@abi-omnipotent
816 lea .LBS0(%rip), $const # constants table
818 movdqa ($key), @XMM[9] # round 0 key
820 movdqa -0x30($const), @XMM[8] # .LM0ISR
821 pxor @XMM[9], @XMM[0] # xor with round0 key
822 pxor @XMM[9], @XMM[1]
823 pshufb @XMM[8], @XMM[0]
824 pxor @XMM[9], @XMM[2]
825 pshufb @XMM[8], @XMM[1]
826 pxor @XMM[9], @XMM[3]
827 pshufb @XMM[8], @XMM[2]
828 pxor @XMM[9], @XMM[4]
829 pshufb @XMM[8], @XMM[3]
830 pxor @XMM[9], @XMM[5]
831 pshufb @XMM[8], @XMM[4]
832 pxor @XMM[9], @XMM[6]
833 pshufb @XMM[8], @XMM[5]
834 pxor @XMM[9], @XMM[7]
835 pshufb @XMM[8], @XMM[6]
836 pshufb @XMM[8], @XMM[7]
838 &bitslice (@XMM[0..7, 8..11]);
845 &ShiftRows (@XMM[0..7, 8]);
846 $code.=".Ldec_sbox:\n";
847 &InvSbox (@XMM[0..7, 8..15]);
852 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
854 movdqa -0x10($const), @XMM[8] # .LISR
856 movdqa -0x20($const), @XMM[8] # .LISRM0
861 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
863 movdqa ($key), @XMM[8] # last round key
864 pxor @XMM[8], @XMM[6]
865 pxor @XMM[8], @XMM[4]
866 pxor @XMM[8], @XMM[2]
867 pxor @XMM[8], @XMM[7]
868 pxor @XMM[8], @XMM[3]
869 pxor @XMM[8], @XMM[5]
870 pxor @XMM[8], @XMM[0]
871 pxor @XMM[8], @XMM[1]
873 .size _bsaes_decrypt8,.-_bsaes_decrypt8
877 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880 my @x=reverse(@_[0..7]);
881 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
883 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
885 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
889 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
891 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
893 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
899 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
900 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
904 .type _bsaes_key_convert,\@abi-omnipotent
907 lea .LBS1(%rip), $const
908 movdqu ($inp), %xmm7 # load round 0 key
909 movdqa -0x10($const), %xmm8 # .LBS0
910 movdqa 0x00($const), %xmm9 # .LBS1
911 movdqa 0x10($const), %xmm10 # .LBS2
912 movdqa 0x40($const), %xmm13 # .LM0
913 movdqa 0x60($const), %xmm14 # .LNOT
915 movdqu 0x10($inp), %xmm6 # load round 1 key
917 movdqa %xmm7, ($out) # save round 0 key
923 pshufb %xmm13, %xmm6 # .LM0
926 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
928 pxor %xmm14, %xmm5 # "pnot"
933 movdqa %xmm0, 0x00($out) # write bit-sliced round key
934 movdqa %xmm1, 0x10($out)
935 movdqa %xmm2, 0x20($out)
936 movdqa %xmm3, 0x30($out)
937 movdqa %xmm4, 0x40($out)
938 movdqa %xmm5, 0x50($out)
939 movdqa %xmm6, 0x60($out)
940 movdqa %xmm7, 0x70($out)
942 movdqu ($inp), %xmm6 # load next round key
946 movdqa 0x70($const), %xmm7 # .L63
947 #movdqa %xmm6, ($out) # don't save last round key
949 .size _bsaes_key_convert,.-_bsaes_key_convert
953 if (1 && !$win64) { # the following four functions are an unsupported interface,
954 # used only for benchmarking...
956 .globl bsaes_enc_key_convert
957 .type bsaes_enc_key_convert,\@function,2
959 bsaes_enc_key_convert:
960 mov 240($inp),%r10d # pass rounds
961 mov $inp,%rcx # pass key
962 mov $out,%rax # pass key schedule
963 call _bsaes_key_convert
964 pxor %xmm6,%xmm7 # fix up last round key
965 movdqa %xmm7,(%rax) # save last round key
967 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
969 .globl bsaes_encrypt_128
970 .type bsaes_encrypt_128,\@function,4
974 movdqu 0x00($inp), @XMM[0] # load input
975 movdqu 0x10($inp), @XMM[1]
976 movdqu 0x20($inp), @XMM[2]
977 movdqu 0x30($inp), @XMM[3]
978 movdqu 0x40($inp), @XMM[4]
979 movdqu 0x50($inp), @XMM[5]
980 movdqu 0x60($inp), @XMM[6]
981 movdqu 0x70($inp), @XMM[7]
982 mov $key, %rax # pass the $key
988 movdqu @XMM[0], 0x00($out) # write output
989 movdqu @XMM[1], 0x10($out)
990 movdqu @XMM[4], 0x20($out)
991 movdqu @XMM[6], 0x30($out)
992 movdqu @XMM[3], 0x40($out)
993 movdqu @XMM[7], 0x50($out)
994 movdqu @XMM[2], 0x60($out)
995 movdqu @XMM[5], 0x70($out)
1000 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1002 .globl bsaes_dec_key_convert
1003 .type bsaes_dec_key_convert,\@function,2
1005 bsaes_dec_key_convert:
1006 mov 240($inp),%r10d # pass rounds
1007 mov $inp,%rcx # pass key
1008 mov $out,%rax # pass key schedule
1009 call _bsaes_key_convert
1010 pxor ($out),%xmm7 # fix up round 0 key
1011 movdqa %xmm6,(%rax) # save last round key
1014 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1016 .globl bsaes_decrypt_128
1017 .type bsaes_decrypt_128,\@function,4
1021 movdqu 0x00($inp), @XMM[0] # load input
1022 movdqu 0x10($inp), @XMM[1]
1023 movdqu 0x20($inp), @XMM[2]
1024 movdqu 0x30($inp), @XMM[3]
1025 movdqu 0x40($inp), @XMM[4]
1026 movdqu 0x50($inp), @XMM[5]
1027 movdqu 0x60($inp), @XMM[6]
1028 movdqu 0x70($inp), @XMM[7]
1029 mov $key, %rax # pass the $key
1030 lea 0x80($inp), $inp
1033 call _bsaes_decrypt8
1035 movdqu @XMM[0], 0x00($out) # write output
1036 movdqu @XMM[1], 0x10($out)
1037 movdqu @XMM[6], 0x20($out)
1038 movdqu @XMM[4], 0x30($out)
1039 movdqu @XMM[2], 0x40($out)
1040 movdqu @XMM[7], 0x50($out)
1041 movdqu @XMM[3], 0x60($out)
1042 movdqu @XMM[5], 0x70($out)
1043 lea 0x80($out), $out
1047 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1051 ######################################################################
1055 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1056 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1057 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1059 if (0) { # suppress unreferenced ECB subroutines, spare some space...
1061 .globl bsaes_ecb_encrypt_blocks
1062 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1064 bsaes_ecb_encrypt_blocks:
1071 lea -0x48(%rsp),%rsp
1073 $code.=<<___ if ($win64);
1074 lea -0xa0(%rsp), %rsp
1075 movaps %xmm6, 0x40(%rsp)
1076 movaps %xmm7, 0x50(%rsp)
1077 movaps %xmm8, 0x60(%rsp)
1078 movaps %xmm9, 0x70(%rsp)
1079 movaps %xmm10, 0x80(%rsp)
1080 movaps %xmm11, 0x90(%rsp)
1081 movaps %xmm12, 0xa0(%rsp)
1082 movaps %xmm13, 0xb0(%rsp)
1083 movaps %xmm14, 0xc0(%rsp)
1084 movaps %xmm15, 0xd0(%rsp)
1088 mov %rsp,%rbp # backup %rsp
1089 mov 240($arg4),%eax # rounds
1090 mov $arg1,$inp # backup arguments
1097 mov %eax,%ebx # backup rounds
1098 shl \$7,%rax # 128 bytes per inner round key
1099 sub \$`128-32`,%rax # size of bit-sliced key schedule
1101 mov %rsp,%rax # pass key schedule
1102 mov $key,%rcx # pass key
1103 mov %ebx,%r10d # pass rounds
1104 call _bsaes_key_convert
1105 pxor %xmm6,%xmm7 # fix up last round key
1106 movdqa %xmm7,(%rax) # save last round key
1110 movdqu 0x00($inp), @XMM[0] # load input
1111 movdqu 0x10($inp), @XMM[1]
1112 movdqu 0x20($inp), @XMM[2]
1113 movdqu 0x30($inp), @XMM[3]
1114 movdqu 0x40($inp), @XMM[4]
1115 movdqu 0x50($inp), @XMM[5]
1116 mov %rsp, %rax # pass key schedule
1117 movdqu 0x60($inp), @XMM[6]
1118 mov %ebx,%r10d # pass rounds
1119 movdqu 0x70($inp), @XMM[7]
1120 lea 0x80($inp), $inp
1122 call _bsaes_encrypt8
1124 movdqu @XMM[0], 0x00($out) # write output
1125 movdqu @XMM[1], 0x10($out)
1126 movdqu @XMM[4], 0x20($out)
1127 movdqu @XMM[6], 0x30($out)
1128 movdqu @XMM[3], 0x40($out)
1129 movdqu @XMM[7], 0x50($out)
1130 movdqu @XMM[2], 0x60($out)
1131 movdqu @XMM[5], 0x70($out)
1132 lea 0x80($out), $out
1139 movdqu 0x00($inp), @XMM[0] # load input
1140 mov %rsp, %rax # pass key schedule
1141 mov %ebx,%r10d # pass rounds
1144 movdqu 0x10($inp), @XMM[1]
1146 movdqu 0x20($inp), @XMM[2]
1149 movdqu 0x30($inp), @XMM[3]
1151 movdqu 0x40($inp), @XMM[4]
1154 movdqu 0x50($inp), @XMM[5]
1156 movdqu 0x60($inp), @XMM[6]
1157 call _bsaes_encrypt8
1158 movdqu @XMM[0], 0x00($out) # write output
1159 movdqu @XMM[1], 0x10($out)
1160 movdqu @XMM[4], 0x20($out)
1161 movdqu @XMM[6], 0x30($out)
1162 movdqu @XMM[3], 0x40($out)
1163 movdqu @XMM[7], 0x50($out)
1164 movdqu @XMM[2], 0x60($out)
1168 call _bsaes_encrypt8
1169 movdqu @XMM[0], 0x00($out) # write output
1170 movdqu @XMM[1], 0x10($out)
1171 movdqu @XMM[4], 0x20($out)
1172 movdqu @XMM[6], 0x30($out)
1173 movdqu @XMM[3], 0x40($out)
1174 movdqu @XMM[7], 0x50($out)
1178 call _bsaes_encrypt8
1179 movdqu @XMM[0], 0x00($out) # write output
1180 movdqu @XMM[1], 0x10($out)
1181 movdqu @XMM[4], 0x20($out)
1182 movdqu @XMM[6], 0x30($out)
1183 movdqu @XMM[3], 0x40($out)
1187 call _bsaes_encrypt8
1188 movdqu @XMM[0], 0x00($out) # write output
1189 movdqu @XMM[1], 0x10($out)
1190 movdqu @XMM[4], 0x20($out)
1191 movdqu @XMM[6], 0x30($out)
1195 call _bsaes_encrypt8
1196 movdqu @XMM[0], 0x00($out) # write output
1197 movdqu @XMM[1], 0x10($out)
1198 movdqu @XMM[4], 0x20($out)
1202 call _bsaes_encrypt8
1203 movdqu @XMM[0], 0x00($out) # write output
1204 movdqu @XMM[1], 0x10($out)
1208 call _bsaes_encrypt8
1209 movdqu @XMM[0], 0x00($out) # write output
1225 .Lecb_enc_bzero: # wipe key schedule [if any]
1226 movdqa %xmm0, 0x00(%rax)
1227 movdqa %xmm0, 0x10(%rax)
1228 lea 0x20(%rax), %rax
1232 lea (%rbp),%rsp # restore %rsp
1234 $code.=<<___ if ($win64);
1235 movaps 0x40(%rbp), %xmm6
1236 movaps 0x50(%rbp), %xmm7
1237 movaps 0x60(%rbp), %xmm8
1238 movaps 0x70(%rbp), %xmm9
1239 movaps 0x80(%rbp), %xmm10
1240 movaps 0x90(%rbp), %xmm11
1241 movaps 0xa0(%rbp), %xmm12
1242 movaps 0xb0(%rbp), %xmm13
1243 movaps 0xc0(%rbp), %xmm14
1244 movaps 0xd0(%rbp), %xmm15
1245 lea 0xa0(%rbp), %rsp
1248 mov 0x48(%rsp), %r15
1249 mov 0x50(%rsp), %r14
1250 mov 0x58(%rsp), %r13
1251 mov 0x60(%rsp), %r12
1252 mov 0x68(%rsp), %rbx
1253 mov 0x70(%rsp), %rbp
1254 lea 0x78(%rsp), %rsp
1257 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1259 .globl bsaes_ecb_decrypt_blocks
1260 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1262 bsaes_ecb_decrypt_blocks:
1269 lea -0x48(%rsp),%rsp
1271 $code.=<<___ if ($win64);
1272 lea -0xa0(%rsp), %rsp
1273 movaps %xmm6, 0x40(%rsp)
1274 movaps %xmm7, 0x50(%rsp)
1275 movaps %xmm8, 0x60(%rsp)
1276 movaps %xmm9, 0x70(%rsp)
1277 movaps %xmm10, 0x80(%rsp)
1278 movaps %xmm11, 0x90(%rsp)
1279 movaps %xmm12, 0xa0(%rsp)
1280 movaps %xmm13, 0xb0(%rsp)
1281 movaps %xmm14, 0xc0(%rsp)
1282 movaps %xmm15, 0xd0(%rsp)
1286 mov %rsp,%rbp # backup %rsp
1287 mov 240($arg4),%eax # rounds
1288 mov $arg1,$inp # backup arguments
1295 mov %eax,%ebx # backup rounds
1296 shl \$7,%rax # 128 bytes per inner round key
1297 sub \$`128-32`,%rax # size of bit-sliced key schedule
1299 mov %rsp,%rax # pass key schedule
1300 mov $key,%rcx # pass key
1301 mov %ebx,%r10d # pass rounds
1302 call _bsaes_key_convert
1303 pxor (%rsp),%xmm7 # fix up round 0 key
1304 movdqa %xmm6,(%rax) # save last round key
1309 movdqu 0x00($inp), @XMM[0] # load input
1310 movdqu 0x10($inp), @XMM[1]
1311 movdqu 0x20($inp), @XMM[2]
1312 movdqu 0x30($inp), @XMM[3]
1313 movdqu 0x40($inp), @XMM[4]
1314 movdqu 0x50($inp), @XMM[5]
1315 mov %rsp, %rax # pass key schedule
1316 movdqu 0x60($inp), @XMM[6]
1317 mov %ebx,%r10d # pass rounds
1318 movdqu 0x70($inp), @XMM[7]
1319 lea 0x80($inp), $inp
1321 call _bsaes_decrypt8
1323 movdqu @XMM[0], 0x00($out) # write output
1324 movdqu @XMM[1], 0x10($out)
1325 movdqu @XMM[6], 0x20($out)
1326 movdqu @XMM[4], 0x30($out)
1327 movdqu @XMM[2], 0x40($out)
1328 movdqu @XMM[7], 0x50($out)
1329 movdqu @XMM[3], 0x60($out)
1330 movdqu @XMM[5], 0x70($out)
1331 lea 0x80($out), $out
1338 movdqu 0x00($inp), @XMM[0] # load input
1339 mov %rsp, %rax # pass key schedule
1340 mov %ebx,%r10d # pass rounds
1343 movdqu 0x10($inp), @XMM[1]
1345 movdqu 0x20($inp), @XMM[2]
1348 movdqu 0x30($inp), @XMM[3]
1350 movdqu 0x40($inp), @XMM[4]
1353 movdqu 0x50($inp), @XMM[5]
1355 movdqu 0x60($inp), @XMM[6]
1356 call _bsaes_decrypt8
1357 movdqu @XMM[0], 0x00($out) # write output
1358 movdqu @XMM[1], 0x10($out)
1359 movdqu @XMM[6], 0x20($out)
1360 movdqu @XMM[4], 0x30($out)
1361 movdqu @XMM[2], 0x40($out)
1362 movdqu @XMM[7], 0x50($out)
1363 movdqu @XMM[3], 0x60($out)
1367 call _bsaes_decrypt8
1368 movdqu @XMM[0], 0x00($out) # write output
1369 movdqu @XMM[1], 0x10($out)
1370 movdqu @XMM[6], 0x20($out)
1371 movdqu @XMM[4], 0x30($out)
1372 movdqu @XMM[2], 0x40($out)
1373 movdqu @XMM[7], 0x50($out)
1377 call _bsaes_decrypt8
1378 movdqu @XMM[0], 0x00($out) # write output
1379 movdqu @XMM[1], 0x10($out)
1380 movdqu @XMM[6], 0x20($out)
1381 movdqu @XMM[4], 0x30($out)
1382 movdqu @XMM[2], 0x40($out)
1386 call _bsaes_decrypt8
1387 movdqu @XMM[0], 0x00($out) # write output
1388 movdqu @XMM[1], 0x10($out)
1389 movdqu @XMM[6], 0x20($out)
1390 movdqu @XMM[4], 0x30($out)
1394 call _bsaes_decrypt8
1395 movdqu @XMM[0], 0x00($out) # write output
1396 movdqu @XMM[1], 0x10($out)
1397 movdqu @XMM[6], 0x20($out)
1401 call _bsaes_decrypt8
1402 movdqu @XMM[0], 0x00($out) # write output
1403 movdqu @XMM[1], 0x10($out)
1407 call _bsaes_decrypt8
1408 movdqu @XMM[0], 0x00($out) # write output
1424 .Lecb_dec_bzero: # wipe key schedule [if any]
1425 movdqa %xmm0, 0x00(%rax)
1426 movdqa %xmm0, 0x10(%rax)
1427 lea 0x20(%rax), %rax
1431 lea (%rbp),%rsp # restore %rsp
1433 $code.=<<___ if ($win64);
1434 movaps 0x40(%rbp), %xmm6
1435 movaps 0x50(%rbp), %xmm7
1436 movaps 0x60(%rbp), %xmm8
1437 movaps 0x70(%rbp), %xmm9
1438 movaps 0x80(%rbp), %xmm10
1439 movaps 0x90(%rbp), %xmm11
1440 movaps 0xa0(%rbp), %xmm12
1441 movaps 0xb0(%rbp), %xmm13
1442 movaps 0xc0(%rbp), %xmm14
1443 movaps 0xd0(%rbp), %xmm15
1444 lea 0xa0(%rbp), %rsp
1447 mov 0x48(%rsp), %r15
1448 mov 0x50(%rsp), %r14
1449 mov 0x58(%rsp), %r13
1450 mov 0x60(%rsp), %r12
1451 mov 0x68(%rsp), %rbx
1452 mov 0x70(%rsp), %rbp
1453 lea 0x78(%rsp), %rsp
1456 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1460 .extern AES_cbc_encrypt
1461 .globl bsaes_cbc_encrypt
1462 .type bsaes_cbc_encrypt,\@abi-omnipotent
1466 $code.=<<___ if ($win64);
1467 mov 48(%rsp),$arg6 # pull direction flag
1481 lea -0x48(%rsp), %rsp
1483 $code.=<<___ if ($win64);
1484 mov 0xa0(%rsp),$arg5 # pull ivp
1485 lea -0xa0(%rsp), %rsp
1486 movaps %xmm6, 0x40(%rsp)
1487 movaps %xmm7, 0x50(%rsp)
1488 movaps %xmm8, 0x60(%rsp)
1489 movaps %xmm9, 0x70(%rsp)
1490 movaps %xmm10, 0x80(%rsp)
1491 movaps %xmm11, 0x90(%rsp)
1492 movaps %xmm12, 0xa0(%rsp)
1493 movaps %xmm13, 0xb0(%rsp)
1494 movaps %xmm14, 0xc0(%rsp)
1495 movaps %xmm15, 0xd0(%rsp)
1499 mov %rsp, %rbp # backup %rsp
1500 mov 240($arg4), %eax # rounds
1501 mov $arg1, $inp # backup arguments
1506 shr \$4, $len # bytes to blocks
1508 mov %eax, %edx # rounds
1509 shl \$7, %rax # 128 bytes per inner round key
1510 sub \$`128-32`, %rax # size of bit-sliced key schedule
1513 mov %rsp, %rax # pass key schedule
1514 mov $key, %rcx # pass key
1515 mov %edx, %r10d # pass rounds
1516 call _bsaes_key_convert
1517 pxor (%rsp),%xmm7 # fix up round 0 key
1518 movdqa %xmm6,(%rax) # save last round key
1521 movdqu (%rbx), @XMM[15] # load IV
1524 movdqu 0x00($inp), @XMM[0] # load input
1525 movdqu 0x10($inp), @XMM[1]
1526 movdqu 0x20($inp), @XMM[2]
1527 movdqu 0x30($inp), @XMM[3]
1528 movdqu 0x40($inp), @XMM[4]
1529 movdqu 0x50($inp), @XMM[5]
1530 mov %rsp, %rax # pass key schedule
1531 movdqu 0x60($inp), @XMM[6]
1532 mov %edx,%r10d # pass rounds
1533 movdqu 0x70($inp), @XMM[7]
1534 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1536 call _bsaes_decrypt8
1538 pxor 0x20(%rbp), @XMM[0] # ^= IV
1539 movdqu 0x00($inp), @XMM[8] # re-load input
1540 movdqu 0x10($inp), @XMM[9]
1541 pxor @XMM[8], @XMM[1]
1542 movdqu 0x20($inp), @XMM[10]
1543 pxor @XMM[9], @XMM[6]
1544 movdqu 0x30($inp), @XMM[11]
1545 pxor @XMM[10], @XMM[4]
1546 movdqu 0x40($inp), @XMM[12]
1547 pxor @XMM[11], @XMM[2]
1548 movdqu 0x50($inp), @XMM[13]
1549 pxor @XMM[12], @XMM[7]
1550 movdqu 0x60($inp), @XMM[14]
1551 pxor @XMM[13], @XMM[3]
1552 movdqu 0x70($inp), @XMM[15] # IV
1553 pxor @XMM[14], @XMM[5]
1554 movdqu @XMM[0], 0x00($out) # write output
1555 lea 0x80($inp), $inp
1556 movdqu @XMM[1], 0x10($out)
1557 movdqu @XMM[6], 0x20($out)
1558 movdqu @XMM[4], 0x30($out)
1559 movdqu @XMM[2], 0x40($out)
1560 movdqu @XMM[7], 0x50($out)
1561 movdqu @XMM[3], 0x60($out)
1562 movdqu @XMM[5], 0x70($out)
1563 lea 0x80($out), $out
1570 movdqu 0x00($inp), @XMM[0] # load input
1571 mov %rsp, %rax # pass key schedule
1572 mov %edx, %r10d # pass rounds
1575 movdqu 0x10($inp), @XMM[1]
1577 movdqu 0x20($inp), @XMM[2]
1580 movdqu 0x30($inp), @XMM[3]
1582 movdqu 0x40($inp), @XMM[4]
1585 movdqu 0x50($inp), @XMM[5]
1587 movdqu 0x60($inp), @XMM[6]
1588 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1589 call _bsaes_decrypt8
1590 pxor 0x20(%rbp), @XMM[0] # ^= IV
1591 movdqu 0x00($inp), @XMM[8] # re-load input
1592 movdqu 0x10($inp), @XMM[9]
1593 pxor @XMM[8], @XMM[1]
1594 movdqu 0x20($inp), @XMM[10]
1595 pxor @XMM[9], @XMM[6]
1596 movdqu 0x30($inp), @XMM[11]
1597 pxor @XMM[10], @XMM[4]
1598 movdqu 0x40($inp), @XMM[12]
1599 pxor @XMM[11], @XMM[2]
1600 movdqu 0x50($inp), @XMM[13]
1601 pxor @XMM[12], @XMM[7]
1602 movdqu 0x60($inp), @XMM[15] # IV
1603 pxor @XMM[13], @XMM[3]
1604 movdqu @XMM[0], 0x00($out) # write output
1605 movdqu @XMM[1], 0x10($out)
1606 movdqu @XMM[6], 0x20($out)
1607 movdqu @XMM[4], 0x30($out)
1608 movdqu @XMM[2], 0x40($out)
1609 movdqu @XMM[7], 0x50($out)
1610 movdqu @XMM[3], 0x60($out)
1614 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1615 call _bsaes_decrypt8
1616 pxor 0x20(%rbp), @XMM[0] # ^= IV
1617 movdqu 0x00($inp), @XMM[8] # re-load input
1618 movdqu 0x10($inp), @XMM[9]
1619 pxor @XMM[8], @XMM[1]
1620 movdqu 0x20($inp), @XMM[10]
1621 pxor @XMM[9], @XMM[6]
1622 movdqu 0x30($inp), @XMM[11]
1623 pxor @XMM[10], @XMM[4]
1624 movdqu 0x40($inp), @XMM[12]
1625 pxor @XMM[11], @XMM[2]
1626 movdqu 0x50($inp), @XMM[15] # IV
1627 pxor @XMM[12], @XMM[7]
1628 movdqu @XMM[0], 0x00($out) # write output
1629 movdqu @XMM[1], 0x10($out)
1630 movdqu @XMM[6], 0x20($out)
1631 movdqu @XMM[4], 0x30($out)
1632 movdqu @XMM[2], 0x40($out)
1633 movdqu @XMM[7], 0x50($out)
1637 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1638 call _bsaes_decrypt8
1639 pxor 0x20(%rbp), @XMM[0] # ^= IV
1640 movdqu 0x00($inp), @XMM[8] # re-load input
1641 movdqu 0x10($inp), @XMM[9]
1642 pxor @XMM[8], @XMM[1]
1643 movdqu 0x20($inp), @XMM[10]
1644 pxor @XMM[9], @XMM[6]
1645 movdqu 0x30($inp), @XMM[11]
1646 pxor @XMM[10], @XMM[4]
1647 movdqu 0x40($inp), @XMM[15] # IV
1648 pxor @XMM[11], @XMM[2]
1649 movdqu @XMM[0], 0x00($out) # write output
1650 movdqu @XMM[1], 0x10($out)
1651 movdqu @XMM[6], 0x20($out)
1652 movdqu @XMM[4], 0x30($out)
1653 movdqu @XMM[2], 0x40($out)
1657 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1658 call _bsaes_decrypt8
1659 pxor 0x20(%rbp), @XMM[0] # ^= IV
1660 movdqu 0x00($inp), @XMM[8] # re-load input
1661 movdqu 0x10($inp), @XMM[9]
1662 pxor @XMM[8], @XMM[1]
1663 movdqu 0x20($inp), @XMM[10]
1664 pxor @XMM[9], @XMM[6]
1665 movdqu 0x30($inp), @XMM[15] # IV
1666 pxor @XMM[10], @XMM[4]
1667 movdqu @XMM[0], 0x00($out) # write output
1668 movdqu @XMM[1], 0x10($out)
1669 movdqu @XMM[6], 0x20($out)
1670 movdqu @XMM[4], 0x30($out)
1674 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1675 call _bsaes_decrypt8
1676 pxor 0x20(%rbp), @XMM[0] # ^= IV
1677 movdqu 0x00($inp), @XMM[8] # re-load input
1678 movdqu 0x10($inp), @XMM[9]
1679 pxor @XMM[8], @XMM[1]
1680 movdqu 0x20($inp), @XMM[15] # IV
1681 pxor @XMM[9], @XMM[6]
1682 movdqu @XMM[0], 0x00($out) # write output
1683 movdqu @XMM[1], 0x10($out)
1684 movdqu @XMM[6], 0x20($out)
1688 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1689 call _bsaes_decrypt8
1690 pxor 0x20(%rbp), @XMM[0] # ^= IV
1691 movdqu 0x00($inp), @XMM[8] # re-load input
1692 movdqu 0x10($inp), @XMM[15] # IV
1693 pxor @XMM[8], @XMM[1]
1694 movdqu @XMM[0], 0x00($out) # write output
1695 movdqu @XMM[1], 0x10($out)
1700 lea 0x20(%rbp), $arg2 # buffer output
1702 call AES_decrypt # doesn't touch %xmm
1703 pxor 0x20(%rbp), @XMM[15] # ^= IV
1704 movdqu @XMM[15], ($out) # write output
1705 movdqa @XMM[0], @XMM[15] # IV
1708 movdqu @XMM[15], (%rbx) # return IV
1711 .Lcbc_dec_bzero: # wipe key schedule [if any]
1712 movdqa %xmm0, 0x00(%rax)
1713 movdqa %xmm0, 0x10(%rax)
1714 lea 0x20(%rax), %rax
1718 lea (%rbp),%rsp # restore %rsp
1720 $code.=<<___ if ($win64);
1721 movaps 0x40(%rbp), %xmm6
1722 movaps 0x50(%rbp), %xmm7
1723 movaps 0x60(%rbp), %xmm8
1724 movaps 0x70(%rbp), %xmm9
1725 movaps 0x80(%rbp), %xmm10
1726 movaps 0x90(%rbp), %xmm11
1727 movaps 0xa0(%rbp), %xmm12
1728 movaps 0xb0(%rbp), %xmm13
1729 movaps 0xc0(%rbp), %xmm14
1730 movaps 0xd0(%rbp), %xmm15
1731 lea 0xa0(%rbp), %rsp
1734 mov 0x48(%rsp), %r15
1735 mov 0x50(%rsp), %r14
1736 mov 0x58(%rsp), %r13
1737 mov 0x60(%rsp), %r12
1738 mov 0x68(%rsp), %rbx
1739 mov 0x70(%rsp), %rbp
1740 lea 0x78(%rsp), %rsp
1743 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1745 .globl bsaes_ctr32_encrypt_blocks
1746 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1748 bsaes_ctr32_encrypt_blocks:
1755 lea -0x48(%rsp), %rsp
1757 $code.=<<___ if ($win64);
1758 mov 0xa0(%rsp),$arg5 # pull ivp
1759 lea -0xa0(%rsp), %rsp
1760 movaps %xmm6, 0x40(%rsp)
1761 movaps %xmm7, 0x50(%rsp)
1762 movaps %xmm8, 0x60(%rsp)
1763 movaps %xmm9, 0x70(%rsp)
1764 movaps %xmm10, 0x80(%rsp)
1765 movaps %xmm11, 0x90(%rsp)
1766 movaps %xmm12, 0xa0(%rsp)
1767 movaps %xmm13, 0xb0(%rsp)
1768 movaps %xmm14, 0xc0(%rsp)
1769 movaps %xmm15, 0xd0(%rsp)
1773 mov %rsp, %rbp # backup %rsp
1774 movdqu ($arg5), %xmm0 # load counter
1775 mov 240($arg4), %eax # rounds
1776 mov $arg1, $inp # backup arguments
1780 movdqa %xmm0, 0x20(%rbp) # copy counter
1784 mov %eax, %ebx # rounds
1785 shl \$7, %rax # 128 bytes per inner round key
1786 sub \$`128-32`, %rax # size of bit-sliced key schedule
1789 mov %rsp, %rax # pass key schedule
1790 mov $key, %rcx # pass key
1791 mov %ebx, %r10d # pass rounds
1792 call _bsaes_key_convert
1793 pxor %xmm6,%xmm7 # fix up last round key
1794 movdqa %xmm7,(%rax) # save last round key
1796 movdqa (%rsp), @XMM[9] # load round0 key
1797 lea .LADD1(%rip), %r11
1798 movdqa 0x20(%rbp), @XMM[0] # counter copy
1799 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1800 pshufb @XMM[8], @XMM[9] # byte swap upper part
1801 pshufb @XMM[8], @XMM[0]
1802 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1806 movdqa @XMM[0], 0x20(%rbp) # save counter
1807 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1808 movdqa @XMM[0], @XMM[2]
1809 paddd 0x00(%r11), @XMM[1] # .LADD1
1810 movdqa @XMM[0], @XMM[3]
1811 paddd 0x10(%r11), @XMM[2] # .LADD2
1812 movdqa @XMM[0], @XMM[4]
1813 paddd 0x20(%r11), @XMM[3] # .LADD3
1814 movdqa @XMM[0], @XMM[5]
1815 paddd 0x30(%r11), @XMM[4] # .LADD4
1816 movdqa @XMM[0], @XMM[6]
1817 paddd 0x40(%r11), @XMM[5] # .LADD5
1818 movdqa @XMM[0], @XMM[7]
1819 paddd 0x50(%r11), @XMM[6] # .LADD6
1820 paddd 0x60(%r11), @XMM[7] # .LADD7
1822 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1823 # to flip byte order in 32-bit counter
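# (What this buys: the 32-bit counter sits big-endian in the last four
# bytes of the IV; it was byte-swapped once, before the loop, together
# with the round-0 key, so the eight counter values above could be
# produced with plain paddd.  The .LSWPUPM0SR shuffle used next folds
# the swap back into the usual .LM0SR input permutation, and the
# pre-swapped round-0 key keeps the whitening xor consistent.)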
1824 movdqa (%rsp), @XMM[9] # round 0 key
1825 lea 0x10(%rsp), %rax # pass key schedule
1826 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1827 pxor @XMM[9], @XMM[0] # xor with round0 key
1828 pxor @XMM[9], @XMM[1]
1829 pshufb @XMM[8], @XMM[0]
1830 pxor @XMM[9], @XMM[2]
1831 pshufb @XMM[8], @XMM[1]
1832 pxor @XMM[9], @XMM[3]
1833 pshufb @XMM[8], @XMM[2]
1834 pxor @XMM[9], @XMM[4]
1835 pshufb @XMM[8], @XMM[3]
1836 pxor @XMM[9], @XMM[5]
1837 pshufb @XMM[8], @XMM[4]
1838 pxor @XMM[9], @XMM[6]
1839 pshufb @XMM[8], @XMM[5]
1840 pxor @XMM[9], @XMM[7]
1841 pshufb @XMM[8], @XMM[6]
1842 lea .LBS0(%rip), %r11 # constants table
1843 pshufb @XMM[8], @XMM[7]
1844 mov %ebx,%r10d # pass rounds
1846 call _bsaes_encrypt8_bitslice
1849 jc .Lctr_enc_loop_done
1851 movdqu 0x00($inp), @XMM[8] # load input
1852 movdqu 0x10($inp), @XMM[9]
1853 movdqu 0x20($inp), @XMM[10]
1854 movdqu 0x30($inp), @XMM[11]
1855 movdqu 0x40($inp), @XMM[12]
1856 movdqu 0x50($inp), @XMM[13]
1857 movdqu 0x60($inp), @XMM[14]
1858 movdqu 0x70($inp), @XMM[15]
1860 pxor @XMM[0], @XMM[8]
1861 movdqa 0x20(%rbp), @XMM[0] # load counter
1862 pxor @XMM[9], @XMM[1]
1863 movdqu @XMM[8], 0x00($out) # write output
1864 pxor @XMM[10], @XMM[4]
1865 movdqu @XMM[1], 0x10($out)
1866 pxor @XMM[11], @XMM[6]
1867 movdqu @XMM[4], 0x20($out)
1868 pxor @XMM[12], @XMM[3]
1869 movdqu @XMM[6], 0x30($out)
1870 pxor @XMM[13], @XMM[7]
1871 movdqu @XMM[3], 0x40($out)
1872 pxor @XMM[14], @XMM[2]
1873 movdqu @XMM[7], 0x50($out)
1874 pxor @XMM[15], @XMM[5]
1875 movdqu @XMM[2], 0x60($out)
1876 lea .LADD1(%rip), %r11
1877 movdqu @XMM[5], 0x70($out)
1878 lea 0x80($out), $out
1879 paddd 0x70(%r11), @XMM[0] # .LADD8
1884 .Lctr_enc_loop_done:
1885 movdqu 0x00($inp), @XMM[8] # load input
1886 pxor @XMM[8], @XMM[0]
1887 movdqu @XMM[0], 0x00($out) # write output
1890 movdqu 0x10($inp), @XMM[9]
1891 pxor @XMM[9], @XMM[1]
1892 movdqu @XMM[1], 0x10($out)
1894 movdqu 0x20($inp), @XMM[10]
1895 pxor @XMM[10], @XMM[4]
1896 movdqu @XMM[4], 0x20($out)
1899 movdqu 0x30($inp), @XMM[11]
1900 pxor @XMM[11], @XMM[6]
1901 movdqu @XMM[6], 0x30($out)
1903 movdqu 0x40($inp), @XMM[12]
1904 pxor @XMM[12], @XMM[3]
1905 movdqu @XMM[3], 0x40($out)
1908 movdqu 0x50($inp), @XMM[13]
1909 pxor @XMM[13], @XMM[7]
1910 movdqu @XMM[7], 0x50($out)
1912 movdqu 0x60($inp), @XMM[14]
1913 pxor @XMM[14], @XMM[2]
1914 movdqu @XMM[2], 0x60($out)
1919 lea 0x20(%rbp), $arg1
1920 lea 0x30(%rbp), $arg2
1923 movdqu ($inp), @XMM[1]
1925 mov 0x2c(%rbp), %eax # load 32-bit counter
1927 pxor 0x30(%rbp), @XMM[1]
1928 inc %eax # increment
1929 movdqu @XMM[1], ($out)
1932 mov %eax, 0x2c(%rbp) # save 32-bit counter
1939 .Lctr_enc_bzero: # wipe key schedule [if any]
1940 movdqa %xmm0, 0x00(%rax)
1941 movdqa %xmm0, 0x10(%rax)
1942 lea 0x20(%rax), %rax
1946 lea (%rbp),%rsp # restore %rsp
1948 $code.=<<___ if ($win64);
1949 movaps 0x40(%rbp), %xmm6
1950 movaps 0x50(%rbp), %xmm7
1951 movaps 0x60(%rbp), %xmm8
1952 movaps 0x70(%rbp), %xmm9
1953 movaps 0x80(%rbp), %xmm10
1954 movaps 0x90(%rbp), %xmm11
1955 movaps 0xa0(%rbp), %xmm12
1956 movaps 0xb0(%rbp), %xmm13
1957 movaps 0xc0(%rbp), %xmm14
1958 movaps 0xd0(%rbp), %xmm15
1959 lea 0xa0(%rbp), %rsp
1962 mov 0x48(%rsp), %r15
1963 mov 0x50(%rsp), %r14
1964 mov 0x58(%rsp), %r13
1965 mov 0x60(%rsp), %r12
1966 mov 0x68(%rsp), %rbx
1967 mov 0x70(%rsp), %rbp
1968 lea 0x78(%rsp), %rsp
1971 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1973 ######################################################################
1974 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1975 # const AES_KEY *key1, const AES_KEY *key2,
1976 # const unsigned char iv[16]);
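#
# Every 16-byte block gets a fresh tweak, derived from the previous one by
# multiplication by x in GF(2^128).  A scalar sketch of what the SSE2
# sequence used below computes (byte 0 being least significant):
#
#	carry     = tweak[15] >> 7;
#	tweak     = tweak << 1;			# 128-bit left shift
#	tweak[0] ^= carry ? 0x87 : 0;		# reduce by x^128+x^7+x^2+x+1
#
# The pcmpgtd/pshufd 0x13/pand .Lxts_magic idiom avoids the 128-bit shift:
# paddq doubles both 64-bit halves, and the broadcast sign bits, masked
# with .Lxts_magic, supply the inter-half carry and the 0x87 reduction,
# which are then xor-ed back in.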
1978 my ($twmask,$twres,$twtmp)=@XMM[13..15];
1980 .globl bsaes_xts_encrypt
1981 .type bsaes_xts_encrypt,\@abi-omnipotent
1990 lea -0x48(%rsp), %rsp
1992 $code.=<<___ if ($win64);
1993 mov 0xa0(%rsp),$arg5 # pull key2
1994 mov 0xa8(%rsp),$arg6 # pull ivp
1995 lea -0xa0(%rsp), %rsp
1996 movaps %xmm6, 0x40(%rsp)
1997 movaps %xmm7, 0x50(%rsp)
1998 movaps %xmm8, 0x60(%rsp)
1999 movaps %xmm9, 0x70(%rsp)
2000 movaps %xmm10, 0x80(%rsp)
2001 movaps %xmm11, 0x90(%rsp)
2002 movaps %xmm12, 0xa0(%rsp)
2003 movaps %xmm13, 0xb0(%rsp)
2004 movaps %xmm14, 0xc0(%rsp)
2005 movaps %xmm15, 0xd0(%rsp)
2009 mov %rsp, %rbp # backup %rsp
2010 mov $arg1, $inp # backup arguments
2016 lea 0x20(%rbp), $arg2
2018 call AES_encrypt # generate initial tweak
2020 mov 240($key), %eax # rounds
2021 mov $len, %rbx # backup $len
2023 mov %eax, %edx # rounds
2024 shl \$7, %rax # 128 bytes per inner round key
2025 sub \$`128-32`, %rax # size of bit-sliced key schedule
2028 mov %rsp, %rax # pass key schedule
2029 mov $key, %rcx # pass key
2030 mov %edx, %r10d # pass rounds
2031 call _bsaes_key_convert
2032 pxor %xmm6, %xmm7 # fix up last round key
2033 movdqa %xmm7, (%rax) # save last round key
2036 sub \$0x80, %rsp # place for tweak[8]
2037 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2040 movdqa .Lxts_magic(%rip), $twmask
2041 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2050 for ($i=0;$i<7;$i++) {
2052 pshufd \$0x13, $twtmp, $twres
2054 movdqa @XMM[7], @XMM[$i]
2055 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2056 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2057 pand $twmask, $twres # isolate carry and residue
2058 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2059 pxor $twres, @XMM[7]
2061 $code.=<<___ if ($i>=1);
2062 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2064 $code.=<<___ if ($i>=2);
2065 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2069 movdqu 0x60($inp), @XMM[8+6]
2070 pxor @XMM[8+5], @XMM[5]
2071 movdqu 0x70($inp), @XMM[8+7]
2072 lea 0x80($inp), $inp
2073 movdqa @XMM[7], 0x70(%rsp)
2074 pxor @XMM[8+6], @XMM[6]
2075 lea 0x80(%rsp), %rax # pass key schedule
2076 pxor @XMM[8+7], @XMM[7]
2077 mov %edx, %r10d # pass rounds
2079 call _bsaes_encrypt8
2081 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2082 pxor 0x10(%rsp), @XMM[1]
2083 movdqu @XMM[0], 0x00($out) # write output
2084 pxor 0x20(%rsp), @XMM[4]
2085 movdqu @XMM[1], 0x10($out)
2086 pxor 0x30(%rsp), @XMM[6]
2087 movdqu @XMM[4], 0x20($out)
2088 pxor 0x40(%rsp), @XMM[3]
2089 movdqu @XMM[6], 0x30($out)
2090 pxor 0x50(%rsp), @XMM[7]
2091 movdqu @XMM[3], 0x40($out)
2092 pxor 0x60(%rsp), @XMM[2]
2093 movdqu @XMM[7], 0x50($out)
2094 pxor 0x70(%rsp), @XMM[5]
2095 movdqu @XMM[2], 0x60($out)
2096 movdqu @XMM[5], 0x70($out)
2097 lea 0x80($out), $out
2099 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2101 movdqa .Lxts_magic(%rip), $twmask
2102 pcmpgtd @XMM[7], $twtmp
2103 pshufd \$0x13, $twtmp, $twres
2105 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2106 pand $twmask, $twres # isolate carry and residue
2107 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2108 pxor $twres, @XMM[7]
2117 for ($i=0;$i<7;$i++) {
2119 pshufd \$0x13, $twtmp, $twres
2121 movdqa @XMM[7], @XMM[$i]
2122 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2123 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2124 pand $twmask, $twres # isolate carry and residue
2125 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2126 pxor $twres, @XMM[7]
2128 $code.=<<___ if ($i>=1);
2129 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2130 cmp \$`0x10*$i`,$len
2133 $code.=<<___ if ($i>=2);
2134 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2138 movdqu 0x60($inp), @XMM[8+6]
2139 pxor @XMM[8+5], @XMM[5]
2140 movdqa @XMM[7], 0x70(%rsp)
2141 lea 0x70($inp), $inp
2142 pxor @XMM[8+6], @XMM[6]
2143 lea 0x80(%rsp), %rax # pass key schedule
2144 mov %edx, %r10d # pass rounds
2146 call _bsaes_encrypt8
2148 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2149 pxor 0x10(%rsp), @XMM[1]
2150 movdqu @XMM[0], 0x00($out) # write output
2151 pxor 0x20(%rsp), @XMM[4]
2152 movdqu @XMM[1], 0x10($out)
2153 pxor 0x30(%rsp), @XMM[6]
2154 movdqu @XMM[4], 0x20($out)
2155 pxor 0x40(%rsp), @XMM[3]
2156 movdqu @XMM[6], 0x30($out)
2157 pxor 0x50(%rsp), @XMM[7]
2158 movdqu @XMM[3], 0x40($out)
2159 pxor 0x60(%rsp), @XMM[2]
2160 movdqu @XMM[7], 0x50($out)
2161 movdqu @XMM[2], 0x60($out)
2162 lea 0x70($out), $out
2164 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2168 pxor @XMM[8+4], @XMM[4]
2169 lea 0x60($inp), $inp
2170 pxor @XMM[8+5], @XMM[5]
2171 lea 0x80(%rsp), %rax # pass key schedule
2172 mov %edx, %r10d # pass rounds
2174 call _bsaes_encrypt8
2176 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2177 pxor 0x10(%rsp), @XMM[1]
2178 movdqu @XMM[0], 0x00($out) # write output
2179 pxor 0x20(%rsp), @XMM[4]
2180 movdqu @XMM[1], 0x10($out)
2181 pxor 0x30(%rsp), @XMM[6]
2182 movdqu @XMM[4], 0x20($out)
2183 pxor 0x40(%rsp), @XMM[3]
2184 movdqu @XMM[6], 0x30($out)
2185 pxor 0x50(%rsp), @XMM[7]
2186 movdqu @XMM[3], 0x40($out)
2187 movdqu @XMM[7], 0x50($out)
2188 lea 0x60($out), $out
2190 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2194 pxor @XMM[8+3], @XMM[3]
2195 lea 0x50($inp), $inp
2196 pxor @XMM[8+4], @XMM[4]
2197 lea 0x80(%rsp), %rax # pass key schedule
2198 mov %edx, %r10d # pass rounds
2200 call _bsaes_encrypt8
2202 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2203 pxor 0x10(%rsp), @XMM[1]
2204 movdqu @XMM[0], 0x00($out) # write output
2205 pxor 0x20(%rsp), @XMM[4]
2206 movdqu @XMM[1], 0x10($out)
2207 pxor 0x30(%rsp), @XMM[6]
2208 movdqu @XMM[4], 0x20($out)
2209 pxor 0x40(%rsp), @XMM[3]
2210 movdqu @XMM[6], 0x30($out)
2211 movdqu @XMM[3], 0x40($out)
2212 lea 0x50($out), $out
2214 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2218 pxor @XMM[8+2], @XMM[2]
2219 lea 0x40($inp), $inp
2220 pxor @XMM[8+3], @XMM[3]
2221 lea 0x80(%rsp), %rax # pass key schedule
2222 mov %edx, %r10d # pass rounds
2224 call _bsaes_encrypt8
2226 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2227 pxor 0x10(%rsp), @XMM[1]
2228 movdqu @XMM[0], 0x00($out) # write output
2229 pxor 0x20(%rsp), @XMM[4]
2230 movdqu @XMM[1], 0x10($out)
2231 pxor 0x30(%rsp), @XMM[6]
2232 movdqu @XMM[4], 0x20($out)
2233 movdqu @XMM[6], 0x30($out)
2234 lea 0x40($out), $out
2236 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2240 pxor @XMM[8+1], @XMM[1]
2241 lea 0x30($inp), $inp
2242 pxor @XMM[8+2], @XMM[2]
2243 lea 0x80(%rsp), %rax # pass key schedule
2244 mov %edx, %r10d # pass rounds
2246 call _bsaes_encrypt8
2248 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2249 pxor 0x10(%rsp), @XMM[1]
2250 movdqu @XMM[0], 0x00($out) # write output
2251 pxor 0x20(%rsp), @XMM[4]
2252 movdqu @XMM[1], 0x10($out)
2253 movdqu @XMM[4], 0x20($out)
2254 lea 0x30($out), $out
2256 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2260 pxor @XMM[8+0], @XMM[0]
2261 lea 0x20($inp), $inp
2262 pxor @XMM[8+1], @XMM[1]
2263 lea 0x80(%rsp), %rax # pass key schedule
2264 mov %edx, %r10d # pass rounds
2266 call _bsaes_encrypt8
2268 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2269 pxor 0x10(%rsp), @XMM[1]
2270 movdqu @XMM[0], 0x00($out) # write output
2271 movdqu @XMM[1], 0x10($out)
2272 lea 0x20($out), $out
2274 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2278 pxor @XMM[0], @XMM[8]
2279 lea 0x10($inp), $inp
2280 movdqa @XMM[8], 0x20(%rbp)
2281 lea 0x20(%rbp), $arg1
2282 lea 0x20(%rbp), $arg2
2284 call AES_encrypt # doesn't touch %xmm
2285 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2286 #pxor @XMM[8], @XMM[0]
2287 #lea 0x80(%rsp), %rax # pass key schedule
2288 #mov %edx, %r10d # pass rounds
2289 #call _bsaes_encrypt8
2290 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2291 movdqu @XMM[0], 0x00($out) # write output
2292 lea 0x10($out), $out
2294 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2303 movzb -16(%rdx), %ecx
2311 movdqu -16($out), @XMM[0]
2312 lea 0x20(%rbp), $arg1
2313 pxor @XMM[7], @XMM[0]
2314 lea 0x20(%rbp), $arg2
2315 movdqa @XMM[0], 0x20(%rbp)
2317 call AES_encrypt # doesn't touch %xmm
2318 pxor 0x20(%rbp), @XMM[7]
2319 movdqu @XMM[7], -16($out)
2324 .Lxts_enc_bzero: # wipe key schedule [if any]
2325 movdqa %xmm0, 0x00(%rax)
2326 movdqa %xmm0, 0x10(%rax)
2327 lea 0x20(%rax), %rax
2331 lea (%rbp),%rsp # restore %rsp
2333 $code.=<<___ if ($win64);
2334 movaps 0x40(%rbp), %xmm6
2335 movaps 0x50(%rbp), %xmm7
2336 movaps 0x60(%rbp), %xmm8
2337 movaps 0x70(%rbp), %xmm9
2338 movaps 0x80(%rbp), %xmm10
2339 movaps 0x90(%rbp), %xmm11
2340 movaps 0xa0(%rbp), %xmm12
2341 movaps 0xb0(%rbp), %xmm13
2342 movaps 0xc0(%rbp), %xmm14
2343 movaps 0xd0(%rbp), %xmm15
2344 lea 0xa0(%rbp), %rsp
2347 mov 0x48(%rsp), %r15
2348 mov 0x50(%rsp), %r14
2349 mov 0x58(%rsp), %r13
2350 mov 0x60(%rsp), %r12
2351 mov 0x68(%rsp), %rbx
2352 mov 0x70(%rsp), %rbp
2353 lea 0x78(%rsp), %rsp
2356 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2358 .globl bsaes_xts_decrypt
2359 .type bsaes_xts_decrypt,\@abi-omnipotent
2368 lea -0x48(%rsp), %rsp
2370 $code.=<<___ if ($win64);
2371 mov 0xa0(%rsp),$arg5 # pull key2
2372 mov 0xa8(%rsp),$arg6 # pull ivp
2373 lea -0xa0(%rsp), %rsp
2374 movaps %xmm6, 0x40(%rsp)
2375 movaps %xmm7, 0x50(%rsp)
2376 movaps %xmm8, 0x60(%rsp)
2377 movaps %xmm9, 0x70(%rsp)
2378 movaps %xmm10, 0x80(%rsp)
2379 movaps %xmm11, 0x90(%rsp)
2380 movaps %xmm12, 0xa0(%rsp)
2381 movaps %xmm13, 0xb0(%rsp)
2382 movaps %xmm14, 0xc0(%rsp)
2383 movaps %xmm15, 0xd0(%rsp)
2387 mov %rsp, %rbp # backup %rsp
2388 mov $arg1, $inp # backup arguments
2394 lea 0x20(%rbp), $arg2
2396 call AES_encrypt # generate initial tweak
2398 mov 240($key), %eax # rounds
2399 mov $len, %rbx # backup $len
2401 mov %eax, %edx # rounds
2402 shl \$7, %rax # 128 bytes per inner round key
2403 sub \$`128-32`, %rax # size of bit-sliced key schedule
2406 mov %rsp, %rax # pass key schedule
2407 mov $key, %rcx # pass key
2408 mov %edx, %r10d # pass rounds
2409 call _bsaes_key_convert
2410 pxor (%rsp), %xmm7 # fix up round 0 key
2411 movdqa %xmm6, (%rax) # save last round key
2412 movdqa %xmm7, (%rsp)
2414 xor %eax, %eax # if ($len%16) len-=16;
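# (ciphertext stealing: when the total length is not a multiple of 16 the
# last complete block has to be processed together with the partial tail,
# so one full block is removed from the bulk count computed here)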
2421 sub \$0x80, %rsp # place for tweak[8]
2422 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2425 movdqa .Lxts_magic(%rip), $twmask
2426 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2435 for ($i=0;$i<7;$i++) {
2437 pshufd \$0x13, $twtmp, $twres
2439 movdqa @XMM[7], @XMM[$i]
2440 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2441 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2442 pand $twmask, $twres # isolate carry and residue
2443 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2444 pxor $twres, @XMM[7]
2446 $code.=<<___ if ($i>=1);
2447 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2449 $code.=<<___ if ($i>=2);
2450 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2454 movdqu 0x60($inp), @XMM[8+6]
2455 pxor @XMM[8+5], @XMM[5]
2456 movdqu 0x70($inp), @XMM[8+7]
2457 lea 0x80($inp), $inp
2458 movdqa @XMM[7], 0x70(%rsp)
2459 pxor @XMM[8+6], @XMM[6]
2460 lea 0x80(%rsp), %rax # pass key schedule
2461 pxor @XMM[8+7], @XMM[7]
2462 mov %edx, %r10d # pass rounds
2464 call _bsaes_decrypt8
2466 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2467 pxor 0x10(%rsp), @XMM[1]
2468 movdqu @XMM[0], 0x00($out) # write output
2469 pxor 0x20(%rsp), @XMM[6]
2470 movdqu @XMM[1], 0x10($out)
2471 pxor 0x30(%rsp), @XMM[4]
2472 movdqu @XMM[6], 0x20($out)
2473 pxor 0x40(%rsp), @XMM[2]
2474 movdqu @XMM[4], 0x30($out)
2475 pxor 0x50(%rsp), @XMM[7]
2476 movdqu @XMM[2], 0x40($out)
2477 pxor 0x60(%rsp), @XMM[3]
2478 movdqu @XMM[7], 0x50($out)
2479 pxor 0x70(%rsp), @XMM[5]
2480 movdqu @XMM[3], 0x60($out)
2481 movdqu @XMM[5], 0x70($out)
2482 lea 0x80($out), $out
2484 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2486 movdqa .Lxts_magic(%rip), $twmask
2487 pcmpgtd @XMM[7], $twtmp
2488 pshufd \$0x13, $twtmp, $twres
2490 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2491 pand $twmask, $twres # isolate carry and residue
2492 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2493 pxor $twres, @XMM[7]
2502 for ($i=0;$i<7;$i++) {
2504 pshufd \$0x13, $twtmp, $twres
2506 movdqa @XMM[7], @XMM[$i]
2507 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2508 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2509 pand $twmask, $twres # isolate carry and residue
2510 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2511 pxor $twres, @XMM[7]
2513 $code.=<<___ if ($i>=1);
2514 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2515 cmp \$`0x10*$i`,$len
2518 $code.=<<___ if ($i>=2);
2519 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2523 movdqu 0x60($inp), @XMM[8+6]
2524 pxor @XMM[8+5], @XMM[5]
2525 movdqa @XMM[7], 0x70(%rsp)
2526 lea 0x70($inp), $inp
2527 pxor @XMM[8+6], @XMM[6]
2528 lea 0x80(%rsp), %rax # pass key schedule
2529 mov %edx, %r10d # pass rounds
2531 call _bsaes_decrypt8
2533 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2534 pxor 0x10(%rsp), @XMM[1]
2535 movdqu @XMM[0], 0x00($out) # write output
2536 pxor 0x20(%rsp), @XMM[6]
2537 movdqu @XMM[1], 0x10($out)
2538 pxor 0x30(%rsp), @XMM[4]
2539 movdqu @XMM[6], 0x20($out)
2540 pxor 0x40(%rsp), @XMM[2]
2541 movdqu @XMM[4], 0x30($out)
2542 pxor 0x50(%rsp), @XMM[7]
2543 movdqu @XMM[2], 0x40($out)
2544 pxor 0x60(%rsp), @XMM[3]
2545 movdqu @XMM[7], 0x50($out)
2546 movdqu @XMM[3], 0x60($out)
2547 lea 0x70($out), $out
2549 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2553 pxor @XMM[8+4], @XMM[4]
2554 lea 0x60($inp), $inp
2555 pxor @XMM[8+5], @XMM[5]
2556 lea 0x80(%rsp), %rax # pass key schedule
2557 mov %edx, %r10d # pass rounds
2559 call _bsaes_decrypt8
2561 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2562 pxor 0x10(%rsp), @XMM[1]
2563 movdqu @XMM[0], 0x00($out) # write output
2564 pxor 0x20(%rsp), @XMM[6]
2565 movdqu @XMM[1], 0x10($out)
2566 pxor 0x30(%rsp), @XMM[4]
2567 movdqu @XMM[6], 0x20($out)
2568 pxor 0x40(%rsp), @XMM[2]
2569 movdqu @XMM[4], 0x30($out)
2570 pxor 0x50(%rsp), @XMM[7]
2571 movdqu @XMM[2], 0x40($out)
2572 movdqu @XMM[7], 0x50($out)
2573 lea 0x60($out), $out
2575 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2579 pxor @XMM[8+3], @XMM[3]
2580 lea 0x50($inp), $inp
2581 pxor @XMM[8+4], @XMM[4]
2582 lea 0x80(%rsp), %rax # pass key schedule
2583 mov %edx, %r10d # pass rounds
2585 call _bsaes_decrypt8
2587 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2588 pxor 0x10(%rsp), @XMM[1]
2589 movdqu @XMM[0], 0x00($out) # write output
2590 pxor 0x20(%rsp), @XMM[6]
2591 movdqu @XMM[1], 0x10($out)
2592 pxor 0x30(%rsp), @XMM[4]
2593 movdqu @XMM[6], 0x20($out)
2594 pxor 0x40(%rsp), @XMM[2]
2595 movdqu @XMM[4], 0x30($out)
2596 movdqu @XMM[2], 0x40($out)
2597 lea 0x50($out), $out
2599 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2603 pxor @XMM[8+2], @XMM[2]
2604 lea 0x40($inp), $inp
2605 pxor @XMM[8+3], @XMM[3]
2606 lea 0x80(%rsp), %rax # pass key schedule
2607 mov %edx, %r10d # pass rounds
2609 call _bsaes_decrypt8
2611 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2612 pxor 0x10(%rsp), @XMM[1]
2613 movdqu @XMM[0], 0x00($out) # write output
2614 pxor 0x20(%rsp), @XMM[6]
2615 movdqu @XMM[1], 0x10($out)
2616 pxor 0x30(%rsp), @XMM[4]
2617 movdqu @XMM[6], 0x20($out)
2618 movdqu @XMM[4], 0x30($out)
2619 lea 0x40($out), $out
2621 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2625 pxor @XMM[8+1], @XMM[1]
2626 lea 0x30($inp), $inp
2627 pxor @XMM[8+2], @XMM[2]
2628 lea 0x80(%rsp), %rax # pass key schedule
2629 mov %edx, %r10d # pass rounds
2631 call _bsaes_decrypt8
2633 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2634 pxor 0x10(%rsp), @XMM[1]
2635 movdqu @XMM[0], 0x00($out) # write output
2636 pxor 0x20(%rsp), @XMM[6]
2637 movdqu @XMM[1], 0x10($out)
2638 movdqu @XMM[6], 0x20($out)
2639 lea 0x30($out), $out
2641 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2645 pxor @XMM[8+0], @XMM[0]
2646 lea 0x20($inp), $inp
2647 pxor @XMM[8+1], @XMM[1]
2648 lea 0x80(%rsp), %rax # pass key schedule
2649 mov %edx, %r10d # pass rounds
2651 call _bsaes_decrypt8
2653 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2654 pxor 0x10(%rsp), @XMM[1]
2655 movdqu @XMM[0], 0x00($out) # write output
2656 movdqu @XMM[1], 0x10($out)
2657 lea 0x20($out), $out
2659 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2663 pxor @XMM[0], @XMM[8]
2664 lea 0x10($inp), $inp
2665 movdqa @XMM[8], 0x20(%rbp)
2666 lea 0x20(%rbp), $arg1
2667 lea 0x20(%rbp), $arg2
2669 call AES_decrypt # doesn't touch %xmm
2670 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2671 #pxor @XMM[8], @XMM[0]
2672 #lea 0x80(%rsp), %rax # pass key schedule
2673 #mov %edx, %r10d # pass rounds
2674 #call _bsaes_decrypt8
2675 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2676 movdqu @XMM[0], 0x00($out) # write output
2677 lea 0x10($out), $out
2679 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2686 movdqa .Lxts_magic(%rip), $twmask
2687 pcmpgtd @XMM[7], $twtmp
2688 pshufd \$0x13, $twtmp, $twres
2689 movdqa @XMM[7], @XMM[6]
2690 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2691 pand $twmask, $twres # isolate carry and residue
2692 movdqu ($inp), @XMM[0]
2693 pxor $twres, @XMM[7]
2695 lea 0x20(%rbp), $arg1
2696 pxor @XMM[7], @XMM[0]
2697 lea 0x20(%rbp), $arg2
2698 movdqa @XMM[0], 0x20(%rbp)
2700 call AES_decrypt # doesn't touch %xmm
2701 pxor 0x20(%rbp), @XMM[7]
2703 movdqu @XMM[7], ($out)
2706 movzb 16($inp), %eax
2715 movdqu ($out), @XMM[0]
2716 lea 0x20(%rbp), $arg1
2717 pxor @XMM[6], @XMM[0]
2718 lea 0x20(%rbp), $arg2
2719 movdqa @XMM[0], 0x20(%rbp)
2721 call AES_decrypt # doesn't touch %xmm
2722 pxor 0x20(%rbp), @XMM[6]
2723 movdqu @XMM[6], ($out)
2728 .Lxts_dec_bzero: # wipe key schedule [if any]
2729 movdqa %xmm0, 0x00(%rax)
2730 movdqa %xmm0, 0x10(%rax)
2731 lea 0x20(%rax), %rax
2735 lea (%rbp),%rsp # restore %rsp
2737 $code.=<<___ if ($win64);
2738 movaps 0x40(%rbp), %xmm6
2739 movaps 0x50(%rbp), %xmm7
2740 movaps 0x60(%rbp), %xmm8
2741 movaps 0x70(%rbp), %xmm9
2742 movaps 0x80(%rbp), %xmm10
2743 movaps 0x90(%rbp), %xmm11
2744 movaps 0xa0(%rbp), %xmm12
2745 movaps 0xb0(%rbp), %xmm13
2746 movaps 0xc0(%rbp), %xmm14
2747 movaps 0xd0(%rbp), %xmm15
2748 lea 0xa0(%rbp), %rsp
2751 mov 0x48(%rsp), %r15
2752 mov 0x50(%rsp), %r14
2753 mov 0x58(%rsp), %r13
2754 mov 0x60(%rsp), %r12
2755 mov 0x68(%rsp), %rbx
2756 mov 0x70(%rsp), %rbp
2757 lea 0x78(%rsp), %rsp
2760 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2764 .type _bsaes_const,\@object
2767 .LM0ISR: # InvShiftRows constants
2768 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2770 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2772 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2773 .LBS0: # bit-slice constants
2774 .quad 0x5555555555555555, 0x5555555555555555
2776 .quad 0x3333333333333333, 0x3333333333333333
2778 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2779 .LSR: # shiftrows constants
2780 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2782 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2784 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2786 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2787 .LNOT: # magic constants
2788 .quad 0xffffffffffffffff, 0xffffffffffffffff
2790 .quad 0x6363636363636363, 0x6363636363636363
2791 .LSWPUP: # byte-swap upper dword
2792 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2794 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2795 .LADD1: # counter increment constants
2796 .quad 0x0000000000000000, 0x0000000100000000
2798 .quad 0x0000000000000000, 0x0000000200000000
2800 .quad 0x0000000000000000, 0x0000000300000000
2802 .quad 0x0000000000000000, 0x0000000400000000
2804 .quad 0x0000000000000000, 0x0000000500000000
2806 .quad 0x0000000000000000, 0x0000000600000000
2808 .quad 0x0000000000000000, 0x0000000700000000
2810 .quad 0x0000000000000000, 0x0000000800000000
2813 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2815 .size _bsaes_const,.-_bsaes_const
2818 $code =~ s/\`([^\`]*)\`/eval($1)/gem;