sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.
author Andy Polyakov <appro@openssl.org>
Sat, 2 Jun 2018 13:25:50 +0000 (15:25 +0200)
committer Andy Polyakov <appro@openssl.org>
Sun, 3 Jun 2018 19:20:40 +0000 (21:20 +0200)
The biggest part of the improvement, ~7%, resulted from omitting the
constants' table index increment in each round, and a minor part from
rescheduling instructions. Apparently POWER9 (and POWER8) manage to
dispatch instructions more efficiently if they are laid down as if they
have no latency...

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)

crypto/sha/asm/sha512p8-ppc.pl

index 7a8d4358f0a80bd7fc54d0c8cfc405fa8b4e7d8f..e3f522cb7c5f414578798e60567b1cc0d9f68950 100755 (executable)
@@ -37,8 +37,8 @@
 # build of sha512-ppc.pl, presented for reference.
 #
 #              POWER8          POWER9
-# SHA256       9.9 [15.8]      12.2 [12.5]
-# SHA512       6.3 [10.3]      7.7 [7.9]
+# SHA256       9.7 [15.8]      11.2 [12.5]
+# SHA512       6.1 [10.3]      7.0 [7.9]
 
 $flavour=shift;
 $output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
 }
 
 $func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
 $lrsave="r8";
 $offload="r11";
 $vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+    $x00=0 if ($flavour =~ /osx/);
 
 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
 
 sub ROUND {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 my $j=($i+1)%16;
+my $k=($i+2)%8;
 
 $code.=<<___           if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
        lvx_u           @X[$i+1],0,$inp         ; load X[i] in advance
@@ -112,26 +114,30 @@ ___
 $code.=<<___           if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
        vperm           @X[$i],@X[$i],@X[$i],$lemask
 ___
+$code.=<<___           if ($i>=15);
+       vshasigma${sz}  $Sigma,@X[($j+1)%16],0,0
+       vaddu${sz}m     @X[$j],@X[$j],$Sigma
+       vshasigma${sz}  $Sigma,@X[($j+14)%16],0,15
+       vaddu${sz}m     @X[$j],@X[$j],$Sigma
+       vaddu${sz}m     @X[$j],@X[$j],@X[($j+9)%16]
+___
 $code.=<<___;
-       `"vshasigma${sz}        $s0,@X[($j+1)%16],0,0"          if ($i>=15)`
-       vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
-       vshasigma${sz}  $S1,$e,1,15             ; Sigma1(e)
        vaddu${sz}m     $h,$h,@X[$i%16]         ; h+=X[i]
-       vshasigma${sz}  $S0,$a,1,0              ; Sigma0(a)
-       `"vshasigma${sz}        $s1,@X[($j+14)%16],0,15"        if ($i>=15)`
+       vsel            $Func,$g,$f,$e          ; Ch(e,f,g)
+       vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $h,$h,$Func             ; h+=Ch(e,f,g)
+       vshasigma${sz}  $Sigma,$e,1,15          ; Sigma1(e)
+       vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma1(e)
        vxor            $Func,$a,$b
-       `"vaddu${sz}m           @X[$j],@X[$j],@X[($j+9)%16]"    if ($i>=15)`
-       vaddu${sz}m     $h,$h,$S1               ; h+=Sigma1(e)
        vsel            $Func,$b,$c,$Func       ; Maj(a,b,c)
-       vaddu${sz}m     $g,$g,$Ki               ; future h+=K[i]
        vaddu${sz}m     $d,$d,$h                ; d+=h
-       vaddu${sz}m     $S0,$S0,$Func           ; Sigma0(a)+Maj(a,b,c)
-       `"vaddu${sz}m           @X[$j],@X[$j],$s0"              if ($i>=15)`
-       lvx             $Ki,$idx,$Tbl           ; load next K[i]
-       addi            $idx,$idx,16
-       vaddu${sz}m     $h,$h,$S0               ; h+=Sigma0(a)+Maj(a,b,c)
-       `"vaddu${sz}m           @X[$j],@X[$j],$s1"              if ($i>=15)`
+       vshasigma${sz}  $Sigma,$a,1,0           ; Sigma0(a)
+       vaddu${sz}m     $Sigma,$Sigma,$Func     ; Sigma0(a)+Maj(a,b,c)
+       vaddu${sz}m     $h,$h,$Sigma            ; h+=Sigma0(a)+Maj(a,b,c)
+       lvx             $Ki,@I[$k],$idx         ; load next K[i]
+___
+$code.=<<___           if ($k == 7);
+       addi            $idx,$idx,0x80
 ___
 }
 
@@ -142,21 +148,13 @@ $code=<<___;
 .globl $func
 .align 6
 $func:
-       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       $STU            $sp,-$FRAME($sp)
        mflr            $lrsave
-       li              r10,`$FRAME+8*16+15`
-       li              r11,`$FRAME+8*16+31`
-       stvx            v20,r10,$sp             # ABI says so
+       li              r10,`$LOCALS+15`
+       li              r11,`$LOCALS+31`
+       stvx            v24,r10,$sp             # ABI says so
        addi            r10,r10,32
        mfspr           $vrsave,256
-       stvx            v21,r11,$sp
-       addi            r11,r11,32
-       stvx            v22,r10,$sp
-       addi            r10,r10,32
-       stvx            v23,r11,$sp
-       addi            r11,r11,32
-       stvx            v24,r10,$sp
-       addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
-       li              r11,-1
-       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              r11,-4096+255           # 0xfffff0ff
+       stw             $vrsave,`$FRAME-6*$SIZE_T-4`($sp)       # save vrsave, in-frame, just below the r26 slot
        li              $x10,0x10
-       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $PUSH           r26,`$FRAME-6*$SIZE_T`($sp)
        li              $x20,0x20
-       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $PUSH           r27,`$FRAME-5*$SIZE_T`($sp)
        li              $x30,0x30
-       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $PUSH           r28,`$FRAME-4*$SIZE_T`($sp)
        li              $x40,0x40
-       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $PUSH           r29,`$FRAME-3*$SIZE_T`($sp)
        li              $x50,0x50
-       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $PUSH           r30,`$FRAME-2*$SIZE_T`($sp)
        li              $x60,0x60
-       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       $PUSH           r31,`$FRAME-1*$SIZE_T`($sp)
        li              $x70,0x70
-       $PUSH           $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+       $PUSH           $lrsave,`$FRAME+$LRSAVE`($sp)
        mtspr           256,r11
 
        bl              LPICmeup
-       addi            $offload,$sp,$FRAME+15
+       addi            $offload,$sp,`8*$SIZE_T+15`
 ___
 $code.=<<___           if ($LENDIAN);
        li              $idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
 .align 5
 Loop:
        lvx             $Ki,$x00,$Tbl
-       li              $idx,16
        lvx_u           @X[0],0,$inp
        addi            $inp,$inp,16
+       mr              $idx,$Tbl               # copy $Tbl
        stvx            $A,$x00,$offload        # offload $A-$H
        stvx            $B,$x10,$offload
        stvx            $C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
        stvx            $G,$x60,$offload
        stvx            $H,$x70,$offload
        vaddu${sz}m     $H,$H,$Ki               # h+K[i]
-       lvx             $Ki,$idx,$Tbl
-       addi            $idx,$idx,16
+       lvx             $Ki,$x10,$Tbl
 ___
 for ($i=0;$i<16;$i++)  { &ROUND($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
        bne             Loop
 ___
 $code.=<<___           if ($SZ==4);
-       lvx             @X[0],$idx,$Tbl
-       addi            $idx,$idx,16
+       lvx             @X[0],$x20,$idx
        vperm           $A,$A,$B,$Ki            # pack the answer
-       lvx             @X[1],$idx,$Tbl
+       lvx             @X[1],$x30,$idx
        vperm           $E,$E,$F,$Ki
        vperm           $A,$A,$C,@X[0]
        vperm           $E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___              if ($SZ==8);
        stvx_u          $G,$x30,$ctx
 ___
 $code.=<<___;
-       li              r10,`$FRAME+8*16+15`
+       li              r10,`$LOCALS+15`
        mtlr            $lrsave
-       li              r11,`$FRAME+8*16+31`
+       li              r11,`$LOCALS+31`
        mtspr           256,$vrsave
-       lvx             v20,r10,$sp             # ABI says so
-       addi            r10,r10,32
-       lvx             v21,r11,$sp
-       addi            r11,r11,32
-       lvx             v22,r10,$sp
-       addi            r10,r10,32
-       lvx             v23,r11,$sp
-       addi            r11,r11,32
-       lvx             v24,r10,$sp
+       lvx             v24,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
-       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       $POP            r26,`$FRAME-6*$SIZE_T`($sp)
+       $POP            r27,`$FRAME-5*$SIZE_T`($sp)
+       $POP            r28,`$FRAME-4*$SIZE_T`($sp)
+       $POP            r29,`$FRAME-3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME-2*$SIZE_T`($sp)
+       $POP            r31,`$FRAME-1*$SIZE_T`($sp)
+       addi            $sp,$sp,$FRAME
        blr
        .long           0
        .byte           0,12,4,1,0x80,6,3,0