###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.98		+9%
# Atom		17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
#	and xor-ing of the input, as in Emilia's CTR implementation,
#	is performed. However, the CTR calculations account for no
#	more than 1% of the total time, so the comparison is *rather*
#	fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	The slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration, sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While this resulted in a nominal
#	4% improvement on Atom, it hurt Westmere by more than a
#	factor of 2.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles, and its ratio to the CPU cycles spent
# in the 8x block procedure, is:
#
#		conversion	conversion/8x block
#
# The ratio values mean that 128-byte blocks will be processed
# 21-27% slower, 256-byte blocks 12-16% slower, 384-byte blocks
# 8-11% slower, etc. Then keep in mind that input sizes not divisible
# by 128 are *effectively* slower, especially the shortest ones; e.g.
# consecutive 144-byte blocks are processed 44% slower than one would
# expect, 272-byte ones 29% slower, 400-byte ones 22% slower, etc.
# Yet, despite all these "shortcomings" it's still faster than the
# ["hyper-threading-safe" code path in] aes-x86_64.pl on all lengths
# above 64 bytes...
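#
# The arithmetic behind those block-size figures, to a first
# approximation: one key conversion is amortized over n calls to the
# 8x block procedure, so the relative slowdown for an n*128-byte input
# is roughly ratio/n. A plain-Perl sketch (the ratio value is assumed
# purely for illustration, not measured):
#
#	my $ratio = 0.25;	# assumed conversion/8x-block ratio
#	for my $n (1..4) {
#		printf "%4d bytes: +%.0f%%\n", 128*$n, 100*$ratio/$n;
#	}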
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)

my ($key,$rounds,$const)=("%rax","%r10d","%r11");
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);

# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb

# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;

# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# direct optimizations from hardware

#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

# new smaller inversion

# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb

# AES linear components
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
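	# each register above holds one bit plane of eight blocks, so
	# AddRoundKey is just eight 16-byte pxors against the bit-sliced
	# schedule, hence the 0x10 stride between round-key planes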
# modified to emit output in order suitable for feeding back to aesenc[last]
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pshufd	\$0x93, @x[3], @t[3]
	pshufd	\$0x93, @x[4], @t[4]
	pshufd	\$0x93, @x[5], @t[5]
	pshufd	\$0x93, @x[6], @t[6]
	pshufd	\$0x93, @x[7], @t[7]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64
	pshufd	\$0x4E, @x[1], @x[1]
	pshufd	\$0x4E, @x[4], @t[0]
	pshufd	\$0x4E, @x[5], @t[1]
	pshufd	\$0x4E, @x[3], @x[4]
	pshufd	\$0x4E, @x[7], @x[5]
	pshufd	\$0x4E, @x[6], @x[3]
	pshufd	\$0x4E, @x[2], @x[6]
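	# MixColumns is built entirely from these word rotations and
	# xors: pshufd \$0x93 rotates a 128-bit lane by one 32-bit word,
	# \$0x4E swaps its 64-bit halves, and multiplication by {02} is
	# free in bit-sliced form since it only renumbers the bit planes
	# (plane 7 feeding back into planes 0, 1, 3 and 4 for the 0x11b
	# reduction)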
sub aesenc {				# not used
	movdqa	0x30($const),@t[0]	# .LSR
	&shiftrows	(@b,@t[0]);
	&mixcolumns	(@b[0,1,4,6,3,7,2,5],@t);

sub aesenclast {			# not used
	movdqa	0x40($const),@t[0]	# .LSRM0
	&shiftrows	(@b,@t[0]);
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
my ($a,$b,$n,$mask,$t)=@_;

my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;

my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
	movdqa	0x20($const),$t0	# .LBS2
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
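# swapmove is the standard "delta-swap" used for bit-matrix
# transposition: it exchanges the bit groups of two registers selected
# by $mask at distance $n. A scalar model of the SSE sequence (a
# sketch; 64-bit Perl integers standing in for 128-bit xmm registers):
#
#	sub swapmove_model {
#		my ($a,$b,$n,$mask)=@_;
#		my $t = (($b >> $n) ^ $a) & $mask;
#		return ($a ^ $t, $b ^ ($t << $n));
#	}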
.type	_bsaes_encrypt8,\@abi-omnipotent
	lea	.LBS0(%rip), $const	# constants table
	movdqa	($key), @XMM[9]		# round 0 key
	movdqa	0x60($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
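	# .LM0SR is .LM0, the bit-slicing byte order, pre-composed with
	# ShiftRows; merging the two shuffles is what lets this code
	# skip one shiftrows() call, as noted in the change list above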
_bsaes_encrypt8_bitslice:
	&bitslice	(@XMM[0..7, 8..11]);
	&shiftrows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&sbox		(@XMM[0..7, 8..15]);
	&mixcolumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
	movdqa	0x30($const), @XMM[8]	# .LSR
	movdqa	0x40($const), @XMM[8]	# .LSRM0
# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
.size	_bsaes_encrypt8,.-_bsaes_encrypt8
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
.type	_bsaes_enc_key_convert,\@abi-omnipotent
_bsaes_enc_key_convert:
	lea	.LBS1(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	movdqa	-0x10($const), %xmm8	# .LBS0
	movdqa	0x00($const), %xmm9	# .LBS1
	movdqa	0x10($const), %xmm10	# .LBS2
	movdqa	0x40($const), %xmm13	# .LM0
	movdqa	0x60($const), %xmm14	# .LNOT
	movdqu	0x10($inp), %xmm6	# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	&bitslice_key	(map("%xmm$_",(0..7, 8..12)));
	pxor	%xmm14, %xmm5		# "pnot"
	movdqa	%xmm0, 0x00($out)	# write bit-sliced round key
	movdqa	%xmm1, 0x10($out)
	movdqa	%xmm2, 0x20($out)
	movdqa	%xmm3, 0x30($out)
	movdqa	%xmm4, 0x40($out)
	movdqa	%xmm5, 0x50($out)
	movdqa	%xmm6, 0x60($out)
	movdqa	%xmm7, 0x70($out)
	movdqu	($inp), %xmm6		# load next round key
	pxor	0x70($const), %xmm6	# .L63
	movdqa	%xmm6, ($out)		# save last round key
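	# the "pnot" complements above and this .L63 xor fold the S-box
	# affine constant 0x63 into the stored round keys, so that
	# sbox() itself can omit the constant addition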
.size	_bsaes_enc_key_convert,.-_bsaes_enc_key_convert
if (1 && !$win64) {	# the following two functions are an unsupported
			# interface used for benchmarking...
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_enc_key_convert
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass key

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
.size	bsaes_encrypt_128,.-bsaes_encrypt_128
######################################################################

my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64	? ("%rcx","%rdx","%r8","%r9","%r10")
						: ("%rdi","%rsi","%rdx","%rcx","%r8");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
bsaes_ecb_encrypt_blocks:
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
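	# for example, 10 rounds give 10*128-96 = 1184 bytes: a 16-byte
	# round-0 key, nine 128-byte bit-sliced round keys and a 16-byte
	# final round key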
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_enc_key_convert
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)

	movdqu	@XMM[0], 0x00($out)	# write output
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)

	lea	(%rbp),%rsp		# restore %rsp
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
bsaes_ctr32_encrypt_blocks:
	lea	-0x48(%rsp), %rsp
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	mov	%eax, %ebx		# backup rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_enc_key_convert
	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key

	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7
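	# counters are kept byte-swapped between iterations, so the
	# big-endian 32-bit increment that CTR mode requires becomes a
	# plain little-endian paddd; a scalar model of what is being
	# computed (sketch):
	#	my \$c = unpack("N", substr(\$iv,12,4));  # big-endian word
	#	substr(\$iv,12,4) = pack("N", (\$c + \$i) & 0xffffffff);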
	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
.Lctr_enc_loop_done:
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)

	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	movdqu	($inp), @XMM[1]
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	mov	%eax, 0x2c(%rbp)	# save 32-bit counter
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	lea	(%rbp),%rsp		# restore %rsp
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rbp
	lea	0x78(%rsp), %rsp
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
.LBS0:	# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
	.quad	0x3333333333333333, 0x3333333333333333
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:	# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LNOT:	# magic constants
	.quad	0xffffffffffffffff, 0xffffffffffffffff
	.quad	0x6363636363636363, 0x6363636363636363
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:	# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
	.quad	0x0000000000000000, 0x0000000200000000
	.quad	0x0000000000000000, 0x0000000300000000
	.quad	0x0000000000000000, 0x0000000400000000
	.quad	0x0000000000000000, 0x0000000500000000
	.quad	0x0000000000000000, 0x0000000600000000
	.quad	0x0000000000000000, 0x0000000700000000
	.quad	0x0000000000000000, 0x0000000800000000
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
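
# A standalone sanity check for the shuffle constants above (a sketch,
# not part of the build): decoding .LSR's byte indices shows each
# 4-byte group rotated by 1, 2, 3 and 0 positions respectively, i.e.
# ShiftRows in the bit-sliced byte order. Emulating pshufb in plain
# Perl:
#
#	my @sr = (0x01,0x02,0x03,0x00, 0x06,0x07,0x04,0x05,
#		  0x0b,0x08,0x09,0x0a, 0x0c,0x0d,0x0e,0x0f);
#	my @state = (0..15);			# identity byte order
#	print join(" ", map { $state[$_] } @sr), "\n";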