ghash-x86_64.pl: add AVX code path.
author    Andy Polyakov <appro@openssl.org>
Sun, 24 Mar 2013 22:44:35 +0000 (23:44 +0100)
committer Andy Polyakov <appro@openssl.org>
Sun, 24 Mar 2013 22:44:35 +0000 (23:44 +0100)
crypto/modes/asm/ghash-x86_64.pl
crypto/modes/gcm128.c

diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 3c131c4..f4af85d 100644
 # Ivy Bridge   1.79(+8%)
 # Bulldozer    1.52(+25%)
 
+# March 2013
+#
+# ... The 8x aggregate factor AVX code path uses the reduction
+# algorithm suggested by Shay Gueron[1]. Even though contemporary
+# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
+# code performs sub-optimally in comparison to the above-mentioned
+# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
+# Corp. we know that it will perform better on the upcoming Haswell
+# processor. [Exact performance numbers to be added at launch.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
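[Editorial aside, not part of the patch.] The aggregation the comment above alludes to folds eight ciphertext blocks per iteration: instead of the per-block update Xi = (Xi ^ C1)*H, the loop computes Xi = (Xi ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H^1 using the powers of H that gcm_init_avx precomputes, so the expensive field reduction is paid once per eight blocks. The C sketch below is strictly illustrative (the names ghash128, gf128_mul, load_be and ghash_8x are invented for the example); it uses the slow bit-by-bit reference multiply from the GCM specification and reduces inside every multiply, so it shows the algebra rather than the performance trick.

#include <stdint.h>
#include <stddef.h>

typedef struct { uint64_t hi, lo; } ghash128;        /* hi = first 8 bytes of the block */

/* Slow bit-by-bit reference multiply in GF(2^128), GCM bit order. */
static ghash128 gf128_mul(ghash128 X, ghash128 Y)
{
    ghash128 Z = {0, 0}, V = X;
    for (int i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (Y.hi >> (63 - i)) & 1 : (Y.lo >> (127 - i)) & 1;
        if (bit) { Z.hi ^= V.hi; Z.lo ^= V.lo; }
        uint64_t carry = V.lo & 1;                    /* coefficient of x^127 */
        V.lo = (V.lo >> 1) | (V.hi << 63);
        V.hi >>= 1;
        if (carry) V.hi ^= 0xe100000000000000ULL;     /* x^128 = x^7 + x^2 + x + 1 */
    }
    return Z;
}

static ghash128 load_be(const uint8_t *p)             /* big-endian 16-byte load */
{
    ghash128 r = {0, 0};
    for (int i = 0; i < 8; i++)  r.hi = r.hi << 8 | p[i];
    for (int i = 8; i < 16; i++) r.lo = r.lo << 8 | p[i];
    return r;
}

/* 8x-aggregated GHASH update; Hpow[k] holds H^(k+1), len is a multiple of 128. */
static ghash128 ghash_8x(ghash128 Xi, const ghash128 Hpow[8], const uint8_t *inp, size_t len)
{
    while (len >= 128) {
        ghash128 acc = {0, 0};
        for (int j = 0; j < 8; j++) {
            ghash128 C = load_be(inp + 16 * j);
            if (j == 0) { C.hi ^= Xi.hi; C.lo ^= Xi.lo; }  /* fold running hash into C1 */
            ghash128 t = gf128_mul(C, Hpow[7 - j]);        /* C_{j+1} * H^(8-j) */
            acc.hi ^= t.hi; acc.lo ^= t.lo;
        }
        Xi = acc;
        inp += 128;
        len -= 128;
    }
    return Xi;
}

In the AVX loop added further down, the three Karatsuba halves (the Xlo/Xhi/Xmi and Zlo/Zhi/Zmi registers) stay unreduced across all eight multiplies, and the two-phase reduction runs once per iteration.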
@@ -75,6 +87,21 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=11);
+}
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
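[Editorial aside, not part of the patch.] Each assembler probe above captures a version number and grades it with a boolean sum, so $avx ends up 0 (no AVX support), 1 or 2 depending on how new the assembler is; this file only ever tests whether $avx is non-zero. A minimal C rendering of the idiom:

#include <stdio.h>

int main(void)
{
    double ver = 2.20;                        /* e.g. parsed from "GNU assembler version 2.20" */
    int avx = (ver >= 2.19) + (ver >= 2.22);  /* each true comparison adds 1 */
    printf("avx level %d\n", avx);            /* 2.20 -> level 1 */
    return 0;
}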
 
@@ -442,12 +469,22 @@ ___
 }
 \f
 { my ($Htbl,$Xip)=@_4args;
+  my $HK="%xmm6";
 
 $code.=<<___;
 .globl gcm_init_clmul
 .type  gcm_init_clmul,\@abi-omnipotent
 .align 16
 gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x83,0xec,0x18             #sub    $0x18,%rsp
+       .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
        movdqu          ($Xip),$Hkey
        pshufd          \$0b01001110,$Hkey,$Hkey        # dword swap
 
@@ -466,9 +503,11 @@ gcm_init_clmul:
        pxor            $T3,$Hkey               # if(carry) H^=0x1c2_polynomial
 
        # calculate H^2
+       pshufd          \$0b01001110,$Hkey,$HK
        movdqa          $Hkey,$Xi
+       pxor            $Hkey,$HK
 ___
-       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey,$HK);
        &reduction_alg9 ($Xhi,$Xi);
 $code.=<<___;
        pshufd          \$0b01001110,$Hkey,$T1
@@ -481,12 +520,12 @@ $code.=<<___;
        movdqu          $T2,0x20($Htbl)         # save Karatsuba "salt"
 ___
 if ($do4xaggr) {
-       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^3
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey,$HK);   # H^3
        &reduction_alg9 ($Xhi,$Xi);
 $code.=<<___;
        movdqa          $Xi,$T3
 ___
-       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^4
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey,$HK);   # H^4
        &reduction_alg9 ($Xhi,$Xi);
 $code.=<<___;
        pshufd          \$0b01001110,$T3,$T1
@@ -495,10 +534,15 @@ $code.=<<___;
        movdqu          $T3,0x30($Htbl)         # save H^3
        pxor            $Xi,$T2                 # Karatsuba pre-processing
        movdqu          $Xi,0x40($Htbl)         # save H^4
-       palignr         \$8,$T1,$T2             # low part is H.lo^H.hi...
+       palignr         \$8,$T1,$T2             # low part is H^3.lo^H^3.hi...
        movdqu          $T2,0x50($Htbl)         # save Karatsuba "salt"
 ___
 }
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       lea     0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
 $code.=<<___;
        ret
 .size  gcm_init_clmul,.-gcm_init_clmul
@@ -512,6 +556,7 @@ $code.=<<___;
 .type  gcm_gmult_clmul,\@abi-omnipotent
 .align 16
 gcm_gmult_clmul:
+.L_gmult_clmul:
        movdqu          ($Xip),$Xi
        movdqa          .Lbswap_mask(%rip),$T3
        movdqu          ($Htbl),$Hkey
@@ -559,6 +604,7 @@ $code.=<<___;
 .type  gcm_ghash_clmul,\@abi-omnipotent
 .align 32
 gcm_ghash_clmul:
+.L_ghash_clmul:
 ___
 $code.=<<___ if ($win64);
        lea     -0x88(%rsp),%rax
@@ -893,14 +939,591 @@ $code.=<<___ if ($win64);
        movaps  0x80(%rsp),%xmm14
        movaps  0x90(%rsp),%xmm15
        lea     0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
 ___
 $code.=<<___;
        ret
-.LSEH_end_gcm_ghash_clmul:
 .size  gcm_ghash_clmul,.-gcm_ghash_clmul
 ___
 }
+\f
+$code.=<<___;
+.globl gcm_init_avx
+.type  gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x83,0xec,0x18             #sub    $0x18,%rsp
+       .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+       vzeroupper
+
+       vmovdqu         ($Xip),$Hkey
+       vpshufd         \$0b01001110,$Hkey,$Hkey        # dword swap
+
+       # <<1 twist
+       vpshufd         \$0b11111111,$Hkey,$T2  # broadcast uppermost dword
+       vpsrlq          \$63,$Hkey,$T1
+       vpsllq          \$1,$Hkey,$Hkey
+       vpxor           $T3,$T3,$T3             #
+       vpcmpgtd        $T2,$T3,$T3             # broadcast carry bit
+       vpslldq         \$8,$T1,$T1
+       vpor            $T1,$Hkey,$Hkey         # H<<=1
+
+       # magic reduction
+       vpand           .L0x1c2_polynomial(%rip),$T3,$T3
+       vpxor           $T3,$Hkey,$Hkey         # if(carry) H^=0x1c2_polynomial
+
+       vpunpckhqdq     $Hkey,$Hkey,$HK
+       vmovdqa         $Hkey,$Xi
+       vpxor           $Hkey,$HK,$HK
+       mov             \$4,%r10                # up to H^8
+       jmp             .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) {   $HK = $T2;
+$code.=<<___;
+       vpunpckhqdq     $Xi,$Xi,$T1
+       vpunpckhqdq     $Hkey,$Hkey,$T2
+       vpxor           $Xi,$T1,$T1             #
+       vpxor           $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+       vpunpckhqdq     $Xi,$Xi,$T1
+       vpxor           $Xi,$T1,$T1             #
+___
+}
+$code.=<<___;
+       vpclmulqdq      \$0x11,$Hkey,$Xi,$Xhi   #######
+       vpclmulqdq      \$0x00,$Hkey,$Xi,$Xi    #######
+       vpclmulqdq      \$0x00,$HK,$T1,$T1      #######
+       vpxor           $Xi,$Xhi,$T2            #
+       vpxor           $T2,$T1,$T1             #
+
+       vpslldq         \$8,$T1,$T2             #
+       vpsrldq         \$8,$T1,$T1
+       vpxor           $T2,$Xi,$Xi             #
+       vpxor           $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+       vpsllq          \$57,$Xi,$T1            # 1st phase
+       vpsllq          \$62,$Xi,$T2
+       vpxor           $T1,$T2,$T2             #
+       vpsllq          \$63,$Xi,$T1
+       vpxor           $T1,$T2,$T2             #
+       vpslldq         \$8,$T2,$T1             #
+       vpsrldq         \$8,$T2,$T2
+       vpxor           $T1,$Xi,$Xi             #
+       vpxor           $T2,$Xhi,$Xhi
+
+       vpsrlq          \$1,$Xi,$T2             # 2nd phase
+       vpxor           $Xi,$Xhi,$Xhi
+       vpxor           $T2,$Xi,$Xi             #
+       vpsrlq          \$5,$T2,$T2
+       vpxor           $T2,$Xi,$Xi             #
+       vpsrlq          \$1,$Xi,$Xi             #
+       vpxor           $Xhi,$Xi,$Xi            #
+___
+}
+
+$code.=<<___;
+.align 32
+.Linit_loop_avx:
+       vpalignr        \$8,$T1,$T2,$T3         # low part is H.lo^H.hi...
+       vmovdqu         $T3,-0x10($Htbl)        # save Karatsuba "salt"
+___
+       &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK);   # calculate H^3,5,7
+       &reduction_avx  ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+       vmovdqa         $Xi,$T3
+___
+       &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK);   # calculate H^2,4,6,8
+       &reduction_avx  ($Xhi,$Xi);
+$code.=<<___;
+       vpshufd         \$0b01001110,$T3,$T1
+       vpshufd         \$0b01001110,$Xi,$T2
+       vpxor           $T3,$T1,$T1             # Karatsuba pre-processing
+       vmovdqu         $T3,0x00($Htbl)         # save H^1,3,5,7
+       vpxor           $Xi,$T2,$T2             # Karatsuba pre-processing
+       vmovdqu         $Xi,0x10($Htbl)         # save H^2,4,6,8
+       lea             0x30($Htbl),$Htbl
+       sub             \$1,%r10
+       jnz             .Linit_loop_avx
+
+       vpalignr        \$8,$T2,$T1,$T3         # last "salt" is flipped
+       vmovdqu         $T3,-0x10($Htbl)
+
+       vzeroupper
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       lea     0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+       ret
+.size  gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+       jmp     .L_init_clmul
+.size  gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type  gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+       jmp     .L_gmult_clmul
+.size  gcm_gmult_avx,.-gcm_gmult_avx
+___
+\f
+$code.=<<___;
+.globl gcm_ghash_avx
+.type  gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+    $Zlo,$Zhi,$Zmi,
+    $Hkey,$HK,$T1,$T2,
+    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+       lea     -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax),%rsp
+       .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6,-0x20(%rax)
+       .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7,-0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8,0(%rax)
+       .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9,0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10,0x20(%rax)
+       .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11,0x30(%rax)
+       .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12,0x40(%rax)
+       .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13,0x50(%rax)
+       .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14,0x60(%rax)
+       .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+       vzeroupper
+
+       vmovdqu         ($Xip),$Xi              # load $Xi
+       lea             .L0x1c2_polynomial(%rip),%r10
+       lea             0x40($Htbl),$Htbl       # size optimization
+       vmovdqu         .Lbswap_mask(%rip),$bswap
+       vpshufb         $bswap,$Xi,$Xi
+       cmp             \$0x80,$len
+       jb              .Lshort_avx
+       sub             \$0x80,$len
+
+       vmovdqu         0x70($inp),$Ii          # I[7]
+       vmovdqu         0x00-0x40($Htbl),$Hkey  # $Hkey^1
+       vpshufb         $bswap,$Ii,$Ii
+       vmovdqu         0x20-0x40($Htbl),$HK
+
+       vpunpckhqdq     $Ii,$Ii,$T2
+        vmovdqu        0x60($inp),$Ij          # I[6]
+       vpclmulqdq      \$0x00,$Hkey,$Ii,$Xlo
+       vpxor           $Ii,$T2,$T2
+        vpshufb        $bswap,$Ij,$Ij
+       vpclmulqdq      \$0x11,$Hkey,$Ii,$Xhi
+        vmovdqu        0x10-0x40($Htbl),$Hkey  # $Hkey^2
+        vpunpckhqdq    $Ij,$Ij,$T1
+        vmovdqu        0x50($inp),$Ii          # I[5]
+       vpclmulqdq      \$0x00,$HK,$T2,$Xmi
+        vpxor          $Ij,$T1,$T1
+
+        vpshufb        $bswap,$Ii,$Ii
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Zlo
+        vpunpckhqdq    $Ii,$Ii,$T2
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Zhi
+        vmovdqu        0x30-0x40($Htbl),$Hkey  # $Hkey^3
+        vpxor          $Ii,$T2,$T2
+        vmovdqu        0x40($inp),$Ij          # I[4]
+       vpclmulqdq      \$0x10,$HK,$T1,$Zmi
+        vmovdqu        0x50-0x40($Htbl),$HK
+
+        vpshufb        $bswap,$Ij,$Ij
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ii,$Xlo
+       vpxor           $Xhi,$Zhi,$Zhi
+        vpunpckhqdq    $Ij,$Ij,$T1
+       vpclmulqdq      \$0x11,$Hkey,$Ii,$Xhi
+        vmovdqu        0x40-0x40($Htbl),$Hkey  # $Hkey^4
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T2,$Xmi
+        vpxor          $Ij,$T1,$T1
+
+        vmovdqu        0x30($inp),$Ii          # I[3]
+       vpxor           $Zlo,$Xlo,$Xlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Zlo
+       vpxor           $Zhi,$Xhi,$Xhi
+        vpshufb        $bswap,$Ii,$Ii
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Zhi
+        vmovdqu        0x60-0x40($Htbl),$Hkey  # $Hkey^5
+       vpxor           $Zmi,$Xmi,$Xmi
+        vpunpckhqdq    $Ii,$Ii,$T2
+       vpclmulqdq      \$0x10,$HK,$T1,$Zmi
+        vmovdqu        0x80-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+
+        vmovdqu        0x20($inp),$Ij          # I[2]
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ii,$Xlo
+       vpxor           $Xhi,$Zhi,$Zhi
+        vpshufb        $bswap,$Ij,$Ij
+       vpclmulqdq      \$0x11,$Hkey,$Ii,$Xhi
+        vmovdqu        0x70-0x40($Htbl),$Hkey  # $Hkey^6
+       vpxor           $Xmi,$Zmi,$Zmi
+        vpunpckhqdq    $Ij,$Ij,$T1
+       vpclmulqdq      \$0x00,$HK,$T2,$Xmi
+        vpxor          $Ij,$T1,$T1
+
+        vmovdqu        0x10($inp),$Ii          # I[1]
+       vpxor           $Zlo,$Xlo,$Xlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Zlo
+       vpxor           $Zhi,$Xhi,$Xhi
+        vpshufb        $bswap,$Ii,$Ii
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Zhi
+        vmovdqu        0x90-0x40($Htbl),$Hkey  # $Hkey^7
+       vpxor           $Zmi,$Xmi,$Xmi
+        vpunpckhqdq    $Ii,$Ii,$T2
+       vpclmulqdq      \$0x10,$HK,$T1,$Zmi
+        vmovdqu        0xb0-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+
+        vmovdqu        ($inp),$Ij              # I[0]
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ii,$Xlo
+       vpxor           $Xhi,$Zhi,$Zhi
+        vpshufb        $bswap,$Ij,$Ij
+       vpclmulqdq      \$0x11,$Hkey,$Ii,$Xhi
+        vmovdqu        0xa0-0x40($Htbl),$Hkey  # $Hkey^8
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x10,$HK,$T2,$Xmi
+
+       lea             0x80($inp),$inp
+       cmp             \$0x80,$len
+       jb              .Ltail_avx
+
+       vpxor           $Xi,$Ij,$Ij             # accumulate $Xi
+       sub             \$0x80,$len
+       jmp             .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+       vpunpckhqdq     $Ij,$Ij,$T1
+        vmovdqu        0x70($inp),$Ii          # I[7]
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpxor           $Ij,$T1,$T1
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xi
+        vpshufb        $bswap,$Ii,$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xo
+        vmovdqu        0x00-0x40($Htbl),$Hkey  # $Hkey^1
+        vpunpckhqdq    $Ii,$Ii,$T2
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Tred
+        vmovdqu        0x20-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+
+         vmovdqu       0x60($inp),$Ij          # I[6]
+        vpclmulqdq     \$0x00,$Hkey,$Ii,$Xlo
+       vpxor           $Zlo,$Xi,$Xi            # collect result
+         vpshufb       $bswap,$Ij,$Ij
+        vpclmulqdq     \$0x11,$Hkey,$Ii,$Xhi
+       vxorps          $Zhi,$Xo,$Xo
+         vmovdqu       0x10-0x40($Htbl),$Hkey  # $Hkey^2
+        vpunpckhqdq    $Ij,$Ij,$T1
+        vpclmulqdq     \$0x00,$HK,  $T2,$Xmi
+       vpxor           $Zmi,$Tred,$Tred
+        vxorps         $Ij,$T1,$T1
+
+         vmovdqu       0x50($inp),$Ii          # I[5]
+       vpxor           $Xi,$Tred,$Tred         # aggregated Karatsuba post-processing
+        vpclmulqdq     \$0x00,$Hkey,$Ij,$Zlo
+       vpxor           $Xo,$Tred,$Tred
+       vpslldq         \$8,$Tred,$T2
+        vpxor          $Xlo,$Zlo,$Zlo
+        vpclmulqdq     \$0x11,$Hkey,$Ij,$Zhi
+       vpsrldq         \$8,$Tred,$Tred
+       vpxor           $T2, $Xi, $Xi
+         vmovdqu       0x30-0x40($Htbl),$Hkey  # $Hkey^3
+         vpshufb       $bswap,$Ii,$Ii
+       vxorps          $Tred,$Xo, $Xo
+        vpxor          $Xhi,$Zhi,$Zhi
+        vpunpckhqdq    $Ii,$Ii,$T2
+        vpclmulqdq     \$0x10,$HK,  $T1,$Zmi
+         vmovdqu       0x50-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+        vpxor          $Xmi,$Zmi,$Zmi
+
+         vmovdqu       0x40($inp),$Ij          # I[4]
+       vpalignr        \$8,$Xi,$Xi,$Tred       # 1st phase
+        vpclmulqdq     \$0x00,$Hkey,$Ii,$Xlo
+         vpshufb       $bswap,$Ij,$Ij
+        vpxor          $Zlo,$Xlo,$Xlo
+        vpclmulqdq     \$0x11,$Hkey,$Ii,$Xhi
+         vmovdqu       0x40-0x40($Htbl),$Hkey  # $Hkey^4
+        vpunpckhqdq    $Ij,$Ij,$T1
+        vpxor          $Zhi,$Xhi,$Xhi
+        vpclmulqdq     \$0x00,$HK,  $T2,$Xmi
+        vxorps         $Ij,$T1,$T1
+        vpxor          $Zmi,$Xmi,$Xmi
+
+         vmovdqu       0x30($inp),$Ii          # I[3]
+       vpclmulqdq      \$0x10,(%r10),$Xi,$Xi
+        vpclmulqdq     \$0x00,$Hkey,$Ij,$Zlo
+         vpshufb       $bswap,$Ii,$Ii
+        vpxor          $Xlo,$Zlo,$Zlo
+        vpclmulqdq     \$0x11,$Hkey,$Ij,$Zhi
+         vmovdqu       0x60-0x40($Htbl),$Hkey  # $Hkey^5
+        vpunpckhqdq    $Ii,$Ii,$T2
+        vpxor          $Xhi,$Zhi,$Zhi
+        vpclmulqdq     \$0x10,$HK,  $T1,$Zmi
+         vmovdqu       0x80-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+        vpxor          $Xmi,$Zmi,$Zmi
+
+         vmovdqu       0x20($inp),$Ij          # I[2]
+        vpclmulqdq     \$0x00,$Hkey,$Ii,$Xlo
+         vpshufb       $bswap,$Ij,$Ij
+        vpxor          $Zlo,$Xlo,$Xlo
+        vpclmulqdq     \$0x11,$Hkey,$Ii,$Xhi
+         vmovdqu       0x70-0x40($Htbl),$Hkey  # $Hkey^6
+        vpunpckhqdq    $Ij,$Ij,$T1
+        vpxor          $Zhi,$Xhi,$Xhi
+        vpclmulqdq     \$0x00,$HK,  $T2,$Xmi
+        vpxor          $Ij,$T1,$T1
+        vpxor          $Zmi,$Xmi,$Xmi
+       vxorps          $Tred,$Xi,$Xi
+
+         vmovdqu       0x10($inp),$Ii          # I[1]
+       vpalignr        \$8,$Xi,$Xi,$Tred       # 2nd phase
+        vpclmulqdq     \$0x00,$Hkey,$Ij,$Zlo
+         vpshufb       $bswap,$Ii,$Ii
+        vpxor          $Xlo,$Zlo,$Zlo
+        vpclmulqdq     \$0x11,$Hkey,$Ij,$Zhi
+         vmovdqu       0x90-0x40($Htbl),$Hkey  # $Hkey^7
+       vpclmulqdq      \$0x10,(%r10),$Xi,$Xi
+       vxorps          $Xo,$Tred,$Tred
+        vpunpckhqdq    $Ii,$Ii,$T2
+        vpxor          $Xhi,$Zhi,$Zhi
+        vpclmulqdq     \$0x10,$HK,  $T1,$Zmi
+         vmovdqu       0xb0-0x40($Htbl),$HK
+        vpxor          $Ii,$T2,$T2
+        vpxor          $Xmi,$Zmi,$Zmi
+
+         vmovdqu       ($inp),$Ij              # I[0]
+        vpclmulqdq     \$0x00,$Hkey,$Ii,$Xlo
+         vpshufb       $bswap,$Ij,$Ij
+        vpclmulqdq     \$0x11,$Hkey,$Ii,$Xhi
+         vmovdqu       0xa0-0x40($Htbl),$Hkey  # $Hkey^8
+       vpxor           $Tred,$Ij,$Ij
+        vpclmulqdq     \$0x10,$HK,  $T2,$Xmi
+       vpxor           $Xi,$Ij,$Ij             # accumulate $Xi
+
+       lea             0x80($inp),$inp
+       sub             \$0x80,$len
+       jnc             .Loop8x_avx
+
+       add             \$0x80,$len
+       jmp             .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+       vmovdqu         -0x10($inp,$len),$Ii    # very last word
+       lea             ($inp,$len),$inp
+       vmovdqu         0x00-0x40($Htbl),$Hkey  # $Hkey^1
+       vmovdqu         0x20-0x40($Htbl),$HK
+       vpshufb         $bswap,$Ii,$Ij
+
+       vmovdqa         $Xlo,$Zlo               # subtle way to zero $Zlo,
+       vmovdqa         $Xhi,$Zhi               # $Zhi and
+       vmovdqa         $Xmi,$Zmi               # $Zmi
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x20($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x10-0x40($Htbl),$Hkey  # $Hkey^2
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vpsrldq         \$8,$HK,$HK
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x30($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x30-0x40($Htbl),$Hkey  # $Hkey^3
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vmovdqu         0x50-0x40($Htbl),$HK
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x40($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x40-0x40($Htbl),$Hkey  # $Hkey^4
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vpsrldq         \$8,$HK,$HK
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x50($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x60-0x40($Htbl),$Hkey  # $Hkey^5
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vmovdqu         0x80-0x40($Htbl),$HK
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x60($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x70-0x40($Htbl),$Hkey  # $Hkey^6
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vpsrldq         \$8,$HK,$HK
+       sub             \$0x10,$len
+       jz              .Ltail_avx
+
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+        vmovdqu        -0x70($inp),$Ii
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vmovdqu         0x90-0x40($Htbl),$Hkey  # $Hkey^7
+        vpshufb        $bswap,$Ii,$Ij
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+       vmovq           0xb8-0x40($Htbl),$HK
+       sub             \$0x10,$len
+       jmp             .Ltail_avx
 
+.align 32
+.Ltail_avx:
+       vpxor           $Xi,$Ij,$Ij             # accumulate $Xi
+.Ltail_no_xor_avx:
+       vpunpckhqdq     $Ij,$Ij,$T1
+       vpxor           $Xlo,$Zlo,$Zlo
+       vpclmulqdq      \$0x00,$Hkey,$Ij,$Xlo
+       vpxor           $Ij,$T1,$T1
+       vpxor           $Xhi,$Zhi,$Zhi
+       vpclmulqdq      \$0x11,$Hkey,$Ij,$Xhi
+       vpxor           $Xmi,$Zmi,$Zmi
+       vpclmulqdq      \$0x00,$HK,$T1,$Xmi
+
+       vmovdqu         (%r10),$Tred
+
+       vpxor           $Xlo,$Zlo,$Xi
+       vpxor           $Xhi,$Zhi,$Xo
+       vpxor           $Xmi,$Zmi,$Zmi
+
+       vpxor           $Xi, $Zmi,$Zmi          # aggregated Karatsuba post-processing
+       vpxor           $Xo, $Zmi,$Zmi
+       vpslldq         \$8, $Zmi,$T2
+       vpsrldq         \$8, $Zmi,$Zmi
+       vpxor           $T2, $Xi, $Xi
+       vpxor           $Zmi,$Xo, $Xo
+
+       vpclmulqdq      \$0x10,$Tred,$Xi,$T2    # 1st phase
+       vpalignr        \$8,$Xi,$Xi,$Xi
+       vpxor           $T2,$Xi,$Xi
+
+       vpclmulqdq      \$0x10,$Tred,$Xi,$T2    # 2nd phase
+       vpalignr        \$8,$Xi,$Xi,$Xi
+       vpxor           $Xo,$Xi,$Xi
+       vpxor           $T2,$Xi,$Xi
+
+       cmp             \$0,$len
+       jne             .Lshort_avx
+
+       vpshufb         $bswap,$Xi,$Xi
+       vmovdqu         $Xi,($Xip)
+       vzeroupper
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       movaps  0x40(%rsp),%xmm10
+       movaps  0x50(%rsp),%xmm11
+       movaps  0x60(%rsp),%xmm12
+       movaps  0x70(%rsp),%xmm13
+       movaps  0x80(%rsp),%xmm14
+       movaps  0x90(%rsp),%xmm15
+       lea     0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+       ret
+.size  gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+       jmp     .L_ghash_clmul
+.size  gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+\f
 $code.=<<___;
 .align 64
 .Lbswap_mask:
@@ -1058,10 +1681,24 @@ se_handler:
        .rva    .LSEH_end_gcm_ghash_4bit
        .rva    .LSEH_info_gcm_ghash_4bit
 
+       .rva    .LSEH_begin_gcm_init_clmul
+       .rva    .LSEH_end_gcm_init_clmul
+       .rva    .LSEH_info_gcm_init_clmul
+
        .rva    .LSEH_begin_gcm_ghash_clmul
        .rva    .LSEH_end_gcm_ghash_clmul
        .rva    .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___   if ($avx);
+       .rva    .LSEH_begin_gcm_init_avx
+       .rva    .LSEH_end_gcm_init_avx
+       .rva    .LSEH_info_gcm_init_clmul
 
+       .rva    .LSEH_begin_gcm_ghash_avx
+       .rva    .LSEH_end_gcm_ghash_avx
+       .rva    .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
 .section       .xdata
 .align 8
 .LSEH_info_gcm_gmult_4bit:
@@ -1072,6 +1709,10 @@ se_handler:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lghash_prologue,.Lghash_epilogue       # HandlerData
+.LSEH_info_gcm_init_clmul:
+       .byte   0x01,0x08,0x03,0x00
+       .byte   0x08,0x68,0x00,0x00     #movaps 0x00(rsp),xmm6
+       .byte   0x04,0x22,0x00,0x00     #sub    rsp,0x18
 .LSEH_info_gcm_ghash_clmul:
        .byte   0x01,0x33,0x16,0x00
        .byte   0x33,0xf8,0x09,0x00     #movaps 0x90(rsp),xmm15
@@ -1084,7 +1725,7 @@ se_handler:
        .byte   0x10,0x88,0x02,0x00     #movaps 0x20(rsp),xmm8
        .byte   0x0c,0x78,0x01,0x00     #movaps 0x10(rsp),xmm7
        .byte   0x08,0x68,0x00,0x00     #movaps 0x00(rsp),xmm6
-       .byte   0x04,0x01,0x15,0x00     #sub    0xa8,rsp
+       .byte   0x04,0x01,0x15,0x00     #sub    rsp,0xa8
 ___
 }
 \f
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index a6e2af1..ae5fab1 100644
@@ -658,6 +658,16 @@ void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 
+#if defined(__i386) || defined(__i386__)
+# define gcm_init_avx  gcm_init_clmul
+# define gcm_gmult_avx gcm_gmult_clmul
+# define gcm_ghash_avx gcm_ghash_clmul
+#else
+void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
 #  if  defined(__i386) || defined(__i386__) || defined(_M_IX86)
 #   define GHASH_ASM_X86
 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
@@ -726,9 +736,15 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 #  if  !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
-               gcm_init_clmul(ctx->Htable,ctx->H.u);
-               ctx->gmult = gcm_gmult_clmul;
-               ctx->ghash = gcm_ghash_clmul;
+               if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
+                       gcm_init_avx(ctx->Htable,ctx->H.u);
+                       ctx->gmult = gcm_gmult_avx;
+                       ctx->ghash = gcm_ghash_avx;
+               } else {
+                       gcm_init_clmul(ctx->Htable,ctx->H.u);
+                       ctx->gmult = gcm_gmult_clmul;
+                       ctx->ghash = gcm_ghash_clmul;
+               }
                return;
        }
 #  endif
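[Editorial aside, not part of the patch.] The new capability test packs two CPUID leaf-1 ECX feature flags into one comparison: after the right shift by 22, bit 0 of the result corresponds to ECX bit 22 (MOVBE) and bit 6 to ECX bit 28 (AVX), and 0x41 is exactly those two bits. Requiring MOVBE alongside AVX effectively keeps the AVX path off Sandy/Ivy Bridge (which lack MOVBE), matching the performance note in the Perl file. A standalone rendering of the same predicate, with illustrative constant names that are not OpenSSL's:

#include <stdint.h>

#define ECX_MOVBE_BIT 22   /* CPUID leaf 1, ECX bit 22 */
#define ECX_AVX_BIT   28   /* CPUID leaf 1, ECX bit 28 */

/* Equivalent to ((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41. */
static int have_avx_and_movbe(uint32_t cpuid1_ecx)
{
    int movbe = (cpuid1_ecx >> ECX_MOVBE_BIT) & 1;
    int avx   = (cpuid1_ecx >> ECX_AVX_BIT)   & 1;
    return movbe && avx;
}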
@@ -1718,6 +1734,31 @@ static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
 
+/* Test Case 20 */
+#define K20 K1
+#define A20 A1
+static const u8 IV20[64]={0xff,0xff,0xff,0xff},        /* this results in 0xff in counter LSB */
+               P20[288],
+               C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
+                       0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
+                       0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
+                       0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
+                       0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
+                       0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
+                       0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
+                       0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
+                       0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
+                       0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
+                       0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
+                       0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
+                       0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
+                       0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
+                       0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
+                       0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
+                       0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
+                       0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
+               T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
+
 #define TEST_CASE(n)   do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
@@ -1763,6 +1804,7 @@ int main()
        TEST_CASE(17);
        TEST_CASE(18);
        TEST_CASE(19);
+       TEST_CASE(20);
 
 #ifdef OPENSSL_CPUID_OBJ
        {