sha512-x86_64.pl: +15% better performance on Westmere and incidentally Atom.
author: Andy Polyakov <appro@openssl.org>
Sat, 17 Sep 2011 11:30:28 +0000 (11:30 +0000)
committer: Andy Polyakov <appro@openssl.org>
Sat, 17 Sep 2011 11:30:28 +0000 (11:30 +0000)
Other Intel processors +5%, Opteron -2%.

crypto/sha/asm/sha512-x86_64.pl

index e6643f8cf613d2addb4591f60c0962bacd6c28db..f611a2d898e2ff265f0cb41b8d51927e0de7b6e0 100755 (executable)
@@ -95,50 +95,44 @@ sub ROUND_00_15()
 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-       mov     $e,$a0
-       mov     $e,$a1
+       ror     \$`$Sigma1[2]-$Sigma1[1]`,$a0
        mov     $f,$a2
+       mov     $T1,`$SZ*($i&0xf)`(%rsp)
 
-       ror     \$$Sigma1[0],$a0
-       ror     \$$Sigma1[1],$a1
+       ror     \$`$Sigma0[2]-$Sigma0[1]`,$a1
+       xor     $e,$a0
        xor     $g,$a2                  # f^g
 
-       xor     $a1,$a0
-       ror     \$`$Sigma1[2]-$Sigma1[1]`,$a1
+       ror     \$`$Sigma1[1]-$Sigma1[0]`,$a0
+       add     $h,$T1                  # T1+=h
+       xor     $a,$a1
+
+       add     ($Tbl,$round,$SZ),$T1   # T1+=K[round]
        and     $e,$a2                  # (f^g)&e
-       mov     $T1,`$SZ*($i&0xf)`(%rsp)
+       mov     $b,$h
 
-       xor     $a1,$a0                 # Sigma1(e)
+       ror     \$`$Sigma0[1]-$Sigma0[0]`,$a1
+       xor     $e,$a0
        xor     $g,$a2                  # Ch(e,f,g)=((f^g)&e)^g
-       add     $h,$T1                  # T1+=h
-
-       mov     $a,$h
-       add     $a0,$T1                 # T1+=Sigma1(e)
 
+       xor     $c,$h                   # b^c
+       xor     $a,$a1
        add     $a2,$T1                 # T1+=Ch(e,f,g)
-       mov     $a,$a0
-       mov     $a,$a1
+       mov     $b,$a2
 
-       ror     \$$Sigma0[0],$h
-       ror     \$$Sigma0[1],$a0
-       mov     $a,$a2
-       add     ($Tbl,$round,$SZ),$T1   # T1+=K[round]
+       ror     \$$Sigma1[0],$a0        # Sigma1(e)
+       and     $a,$h                   # h=(b^c)&a
+       and     $c,$a2                  # b&c
 
-       xor     $a0,$h
-       ror     \$`$Sigma0[2]-$Sigma0[1]`,$a0
-       or      $c,$a1                  # a|c
+       ror     \$$Sigma0[0],$a1        # Sigma0(a)
+       add     $a0,$T1                 # T1+=Sigma1(e)
+       add     $a2,$h                  # h+=b&c (completes +=Maj(a,b,c)
 
-       xor     $a0,$h                  # h=Sigma0(a)
-       and     $c,$a2                  # a&c
        add     $T1,$d                  # d+=T1
-
-       and     $b,$a1                  # (a|c)&b
        add     $T1,$h                  # h+=T1
-
-       or      $a2,$a1                 # Maj(a,b,c)=((a|c)&b)|(a&c)
        lea     1($round),$round        # round++
+       add     $a1,$h                  # h+=Sigma0(a)
 
-       add     $a1,$h                  # h+=Maj(a,b,c)
 ___
 }
 
@@ -147,32 +141,30 @@ sub ROUND_16_XX()
 
 $code.=<<___;
        mov     `$SZ*(($i+1)&0xf)`(%rsp),$a0
-       mov     `$SZ*(($i+14)&0xf)`(%rsp),$T1
-
-       mov     $a0,$a2
+       mov     `$SZ*(($i+14)&0xf)`(%rsp),$a1
+       mov     $a0,$T1
+       mov     $a1,$a2
 
+       ror     \$`$sigma0[1]-$sigma0[0]`,$T1
+       xor     $a0,$T1
        shr     \$$sigma0[2],$a0
-       ror     \$$sigma0[0],$a2
-
-       xor     $a2,$a0
-       ror     \$`$sigma0[1]-$sigma0[0]`,$a2
 
-       xor     $a2,$a0                 # sigma0(X[(i+1)&0xf])
-       mov     $T1,$a1
+       ror     \$$sigma0[0],$T1
+       xor     $T1,$a0                 # sigma0(X[(i+1)&0xf])
+       mov     `$SZ*(($i+9)&0xf)`(%rsp),$T1
 
-       shr     \$$sigma1[2],$T1
-       ror     \$$sigma1[0],$a1
-
-       xor     $a1,$T1
-       ror     \$`$sigma1[1]-$sigma1[0]`,$a1
-
-       xor     $a1,$T1                 # sigma1(X[(i+14)&0xf])
+       ror     \$`$sigma1[1]-$sigma1[0]`,$a2
+       xor     $a1,$a2
+       shr     \$$sigma1[2],$a1
 
+       ror     \$$sigma1[0],$a2
        add     $a0,$T1
-
-       add     `$SZ*(($i+9)&0xf)`(%rsp),$T1
+       xor     $a2,$a1                 # sigma1(X[(i+14)&0xf])
 
        add     `$SZ*($i&0xf)`(%rsp),$T1
+       mov     $e,$a0
+       add     $a1,$T1
+       mov     $a,$a1
 ___
        &ROUND_00_15(@_);
 }
@@ -219,6 +211,8 @@ $func:
 ___
        for($i=0;$i<16;$i++) {
                $code.="        mov     $SZ*$i($inp),$T1\n";
+               $code.="        mov     @ROT[4],$a0\n";
+               $code.="        mov     @ROT[0],$a1\n";
                $code.="        bswap   $T1\n";
                &ROUND_00_15($i,@ROT);
                unshift(@ROT,pop(@ROT));