Fix race for X509 store found by thread sanitizer

[openssl.git] / crypto / aes / asm / aesni-sha256-x86_64.pl
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl

index 2d6424fecde3298b149e749178ec65d4dc6164e0..5c8bb0fbccc6009b4efe58b64c33916d62da4dfb 100644 (file)
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -1,7 +1,7 @@
  #! /usr/bin/env perl
-# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
  # this file except in compliance with the License.  You can obtain a copy
  # in the file LICENSE in the source distribution or at
  # https://www.openssl.org/source/license.html
@@ -28,22 +28,26 @@
  # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  # subroutine:
  #
-#               AES-128/-192/-256+SHA256       this(**)gain
-# Sandy Bridge     5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
-# Ivy Bridge       5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
-# Haswell          4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
-# Skylake          2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
-# Bulldozer        5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
+#               AES-128/-192/-256+SHA256   this(**)    gain
+# Sandy Bridge     5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
+# Ivy Bridge       5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
+# Haswell          4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
+# Skylake          2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
+# Bulldozer        5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
+# Ryzen(***)       2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
+# Goldmont(***)    3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
  #
  # (*)  there are XOP, AVX1 and AVX2 code paths, meaning that
  #      Westmere is omitted from loop, this is because gain was not
  #      estimated high enough to justify the effort;
  # (**) these are EVP-free results, results obtained with 'speed
  #      -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
+# (***) these are SHAEXT results;
  
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  
  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  
@@ -67,14 +71,15 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
         $avx = ($1>=10) + ($1>=12);
  }
  
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
         $avx = ($2>=3.0) + ($2>3.0);
  }
  
  $shaext=$avx;  ### set to zero if compiling for 1.0.1
  $avx=1         if (!$shaext && $avx);
  
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
  *STDOUT=*OUT;
  
  $func="aesni_cbc_sha256_enc";
@@ -109,7 +114,7 @@ $_key="16*$SZ+3*8(%rsp)";
  $_ivp="16*$SZ+4*8(%rsp)";
  $_ctx="16*$SZ+5*8(%rsp)";
  $_in0="16*$SZ+6*8(%rsp)";
-$_rsp="16*$SZ+7*8(%rsp)";
+$_rsp="`16*$SZ+7*8`(%rsp)";
  $framesz=16*$SZ+8*8;
  
  $code=<<___;
@@ -120,6 +125,7 @@ $code=<<___;
  .type  $func,\@abi-omnipotent
  .align 16
  $func:
+.cfi_startproc
  ___
                                                 if ($avx) {
  $code.=<<___;
@@ -159,8 +165,10 @@ $code.=<<___;
         ud2
  .Lprobe:
         ret
+.cfi_endproc
  .size  $func,.-$func
  
+.section .rodata align=64
  .align 64
  .type  $TABLE,\@object
  $TABLE:
@@ -203,6 +211,7 @@ $TABLE:
         .long   0,0,0,0,   0,0,0,0
         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  .align 64
+.previous
  ___
  
  ######################################################################
@@ -339,15 +348,23 @@ $code.=<<___;
  .type  ${func}_xop,\@function,6
  .align 64
  ${func}_xop:
+.cfi_startproc
  .Lxop_shortcut:
         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
         mov     %rsp,%rax               # copy %rsp
+.cfi_def_cfa_register  %rax
         push    %rbx
+.cfi_push      %rbx
         push    %rbp
+.cfi_push      %rbp
         push    %r12
+.cfi_push      %r12
         push    %r13
+.cfi_push      %r13
         push    %r14
+.cfi_push      %r14
         push    %r15
+.cfi_push      %r15
         sub     \$`$framesz+$win64*16*10`,%rsp
         and     \$-64,%rsp              # align stack frame
  
@@ -364,6 +381,7 @@ ${func}_xop:
         mov     $ctx,$_ctx
         mov     $in0,$_in0
         mov     %rax,$_rsp
+.cfi_cfa_expression    $_rsp,deref,+8
  ___
  $code.=<<___ if ($win64);
         movaps  %xmm6,`$framesz+16*0`(%rsp)
@@ -601,6 +619,7 @@ $code.=<<___;
  
         mov     $_ivp,$ivp
         mov     $_rsp,%rsi
+.cfi_def_cfa   %rsi,8
         vmovdqu $iv,($ivp)              # output IV
         vzeroall
  ___
@@ -618,14 +637,22 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         mov     -48(%rsi),%r15
+.cfi_restore   %r15
         mov     -40(%rsi),%r14
+.cfi_restore   %r14
         mov     -32(%rsi),%r13
+.cfi_restore   %r13
         mov     -24(%rsi),%r12
+.cfi_restore   %r12
         mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
         mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
         lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
  .Lepilogue_xop:
         ret
+.cfi_endproc
  .size  ${func}_xop,.-${func}_xop
  ___
  ######################################################################
@@ -637,15 +664,23 @@ $code.=<<___;
  .type  ${func}_avx,\@function,6
  .align 64
  ${func}_avx:
+.cfi_startproc
  .Lavx_shortcut:
         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
         mov     %rsp,%rax               # copy %rsp
+.cfi_def_cfa_register  %rax
         push    %rbx
+.cfi_push      %rbx
         push    %rbp
+.cfi_push      %rbp
         push    %r12
+.cfi_push      %r12
         push    %r13
+.cfi_push      %r13
         push    %r14
+.cfi_push      %r14
         push    %r15
+.cfi_push      %r15
         sub     \$`$framesz+$win64*16*10`,%rsp
         and     \$-64,%rsp              # align stack frame
  
@@ -662,6 +697,7 @@ ${func}_avx:
         mov     $ctx,$_ctx
         mov     $in0,$_in0
         mov     %rax,$_rsp
+.cfi_cfa_expression    $_rsp,deref,+8
  ___
  $code.=<<___ if ($win64);
         movaps  %xmm6,`$framesz+16*0`(%rsp)
@@ -852,6 +888,7 @@ $code.=<<___;
  
         mov     $_ivp,$ivp
         mov     $_rsp,%rsi
+.cfi_def_cfa   %rsi,8
         vmovdqu $iv,($ivp)              # output IV
         vzeroall
  ___
@@ -869,14 +906,22 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         mov     -48(%rsi),%r15
+.cfi_restore   %r15
         mov     -40(%rsi),%r14
+.cfi_restore   %r14
         mov     -32(%rsi),%r13
+.cfi_restore   %r13
         mov     -24(%rsi),%r12
+.cfi_restore   %r12
         mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
         mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
         lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
  .Lepilogue_avx:
         ret
+.cfi_endproc
  .size  ${func}_avx,.-${func}_avx
  ___
  
@@ -933,15 +978,23 @@ $code.=<<___;
  .type  ${func}_avx2,\@function,6
  .align 64
  ${func}_avx2:
+.cfi_startproc
  .Lavx2_shortcut:
         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
         mov     %rsp,%rax               # copy %rsp
+.cfi_def_cfa_register  %rax
         push    %rbx
+.cfi_push      %rbx
         push    %rbp
+.cfi_push      %rbp
         push    %r12
+.cfi_push      %r12
         push    %r13
+.cfi_push      %r13
         push    %r14
+.cfi_push      %r14
         push    %r15
+.cfi_push      %r15
         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
         and     \$-256*$SZ,%rsp         # align stack frame
         add     \$`2*$SZ*($rounds-8)`,%rsp
@@ -959,6 +1012,7 @@ ${func}_avx2:
         mov     $ctx,$_ctx
         mov     $in0,$_in0
         mov     %rax,$_rsp
+.cfi_cfa_expression    $_rsp,deref,+8
  ___
  $code.=<<___ if ($win64);
         movaps  %xmm6,`$framesz+16*0`(%rsp)
@@ -1036,7 +1090,23 @@ $code.=<<___;
         vmovdqa $t0,0x00(%rsp)
         xor     $a1,$a1
         vmovdqa $t1,0x20(%rsp)
+___
+$code.=<<___ if (!$win64);
+# temporarily use %rsi as frame pointer
+        mov     $_rsp,%rsi
+.cfi_def_cfa    %rsi,8
+___
+$code.=<<___;
         lea     -$PUSH8(%rsp),%rsp
+___
+$code.=<<___ if (!$win64);
+# the frame info is at $_rsp, but the stack is moving...
+# so a second frame pointer is saved at -8(%rsp)
+# that is in the red zone
+        mov     %rsi,-8(%rsp)
+.cfi_cfa_expression     %rsp-8,deref,+8
+___
+$code.=<<___;
         mov     $B,$a3
         vmovdqa $t2,0x00(%rsp)
         xor     $C,$a3                  # magic
@@ -1058,7 +1128,17 @@ my @X = @_;
  my @insns = (&$body,&$body,&$body,&$body);     # 96 instructions
  my $base = "+2*$PUSH8(%rsp)";
  
-       &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
+       if (($j%2)==0) {
+       &lea    ("%rsp","-$PUSH8(%rsp)");
+$code.=<<___ if (!$win64);
+.cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
+# copy secondary frame pointer to new location again at -8(%rsp)
+        pushq   $PUSH8-8(%rsp)
+.cfi_cfa_expression     %rsp,deref,+8
+        lea     8(%rsp),%rsp
+.cfi_cfa_expression     %rsp-8,deref,+8
+___
+       }
         foreach (Xupdate_256_AVX()) {           # 29 instructions
             eval;
             eval(shift(@insns));
@@ -1184,36 +1264,47 @@ $code.=<<___;
  
         jbe     .Loop_avx2
         lea     (%rsp),$Tbl
+# temporarily use $Tbl as index to $_rsp
+# this avoids the need to save a secondary frame pointer at -8(%rsp)
+.cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
  
  .Ldone_avx2:
-       lea     ($Tbl),%rsp
-       mov     $_ivp,$ivp
-       mov     $_rsp,%rsi
+       mov     16*$SZ+4*8($Tbl),$ivp
+       mov     16*$SZ+7*8($Tbl),%rsi
+.cfi_def_cfa   %rsi,8
         vmovdqu $iv,($ivp)              # output IV
         vzeroall
  ___
  $code.=<<___ if ($win64);
-       movaps  `$framesz+16*0`(%rsp),%xmm6
-       movaps  `$framesz+16*1`(%rsp),%xmm7
-       movaps  `$framesz+16*2`(%rsp),%xmm8
-       movaps  `$framesz+16*3`(%rsp),%xmm9
-       movaps  `$framesz+16*4`(%rsp),%xmm10
-       movaps  `$framesz+16*5`(%rsp),%xmm11
-       movaps  `$framesz+16*6`(%rsp),%xmm12
-       movaps  `$framesz+16*7`(%rsp),%xmm13
-       movaps  `$framesz+16*8`(%rsp),%xmm14
-       movaps  `$framesz+16*9`(%rsp),%xmm15
+       movaps  `$framesz+16*0`($Tbl),%xmm6
+       movaps  `$framesz+16*1`($Tbl),%xmm7
+       movaps  `$framesz+16*2`($Tbl),%xmm8
+       movaps  `$framesz+16*3`($Tbl),%xmm9
+       movaps  `$framesz+16*4`($Tbl),%xmm10
+       movaps  `$framesz+16*5`($Tbl),%xmm11
+       movaps  `$framesz+16*6`($Tbl),%xmm12
+       movaps  `$framesz+16*7`($Tbl),%xmm13
+       movaps  `$framesz+16*8`($Tbl),%xmm14
+       movaps  `$framesz+16*9`($Tbl),%xmm15
  ___
  $code.=<<___;
         mov     -48(%rsi),%r15
+.cfi_restore   %r15
         mov     -40(%rsi),%r14
+.cfi_restore   %r14
         mov     -32(%rsi),%r13
+.cfi_restore   %r13
         mov     -24(%rsi),%r12
+.cfi_restore   %r12
         mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
         mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
         lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
  .Lepilogue_avx2:
         ret
+.cfi_endproc
  .size  ${func}_avx2,.-${func}_avx2
  ___
  }}
@@ -1282,6 +1373,7 @@ $code.=<<___;
  .type  ${func}_shaext,\@function,6
  .align 32
  ${func}_shaext:
+.cfi_startproc
         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
  ___
  $code.=<<___ if ($win64);
@@ -1307,6 +1399,7 @@ $code.=<<___;
         mov             240($key),$rounds
         sub             $in0,$out
         movups          ($key),$rndkey0         # $key[0]
+       movups          ($ivp),$iv              # load IV
         movups          16($key),$rndkey[0]     # forward reference
         lea             112($key),$key          # size optimization
  
@@ -1497,6 +1590,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  ${func}_shaext,.-${func}_shaext
  ___
  }
@@ -1709,4 +1803,4 @@ sub rex {
  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
  print $code;
-close STDOUT;
+close STDOUT or die "error closing STDOUT: $!";