#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise it's a straightforward
# implementation with the X vector kept in the register bank. The
# module is big-endian [which is not a big deal, as there are no
# little-endian targets left around].

#                       sha256          |       sha512
#                       -m64    -m32    |       -m64    -m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0      +50%    +38%    |       +40%    +410%(*)
# Power6,xlc-7          +150%   +90%    |       +100%   +430%(*)
#
# (*)   64-bit code in 32-bit application context, which actually is
#       on the TODO list. It should be noted that for safe deployment
#       in a 32-bit *multi-threaded* context asynchronous signals
#       should be blocked upon entry to the SHA512 block routine. This
#       is because the 32-bit signaling procedure invalidates the
#       upper halves of GPRs. The context switch procedure preserves
#       them, but signaling does not:-(

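# A sketch of that caller-side discipline, expressed here with POSIX
# signal masking (the real caller is C; the call below is only a
# placeholder for however the block routine is reached):
#
#       use POSIX qw(:signal_h);
#       my $all = POSIX::SigSet->new; $all->fillset();
#       sigprocmask(SIG_BLOCK, $all, my $saved = POSIX::SigSet->new);
#       sha512_block_data_order($ctx, $inp, $num);  # placeholder call
#       sigprocmask(SIG_SETMASK, $saved);
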
# The second version is truly multi-thread safe. The trouble with the
# original version was that it used the thread-local storage pointer
# register. It scrupulously preserved it, but the problem would arise
# the moment an asynchronous signal was delivered and the signal
# handler dereferenced the TLS pointer. While that is never the case
# in the openssl application or test suite, we have to respect this
# scenario and not use the TLS pointer register. The alternative would
# be to require the caller to block signals prior to calling this
# routine. For the record, in 32-bit context R2 serves as the TLS
# pointer, while in 64-bit context it's R13.

$flavour=shift;
$output =shift;

if ($flavour =~ /64/) {
        $SIZE_T=8;
        $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $UCMP="cmpld";
        $SHL="sldi";
        $POP="ld";
        $PUSH="std";
} elsif ($flavour =~ /32/) {
        $SIZE_T=4;
        $LRSAVE=$SIZE_T;
        $STU="stwu";
        $UCMP="cmplw";
        $SHL="slwi";
        $POP="lwz";
        $PUSH="stw";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN=0;
if ($flavour =~ /le$/) {
        die "little-endian is 64-bit only: $flavour" if ($SIZE_T==4);
        $LITTLE_ENDIAN=1;
}

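# Typical invocation (illustrative; the build system supplies the real
# arguments): perl sha512-ppc.pl linux64 sha512-ppc.s, where the
# flavour picks 32- vs 64-bit conventions, with an optional "le" suffix.
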
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

if ($output =~ /512/) {
        $func="sha512_block_data_order";
        $SZ=8;
        @Sigma0=(28,34,39);
        @Sigma1=(14,18,41);
        @sigma0=(1,  8, 7);
        @sigma1=(19,61, 6);
        $rounds=80;
        $LD="ld";
        $ST="std";
        $ROR="rotrdi";
        $SHR="srdi";
} else {
        $func="sha256_block_data_order";
        $SZ=4;
        @Sigma0=( 2,13,22);
        @Sigma1=( 6,11,25);
        @sigma0=( 7,18, 3);
        @sigma1=(17,19,10);
        $rounds=64;
        $LD="lwz";
        $ST="stw";
        $ROR="rotrwi";
        $SHR="srwi";
}

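# A pure-Perl cross-check of the rotation/shift constants above, for
# the 32-bit case ($SZ==4) only; ref_sigma0 is an illustrative name of
# ours, never called during generation, kept as executable
# documentation:
sub ref_sigma0 {
        my $x = shift;                  # 32-bit message word
        my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff; };
        return ($rotr->($x,$sigma0[0]) ^ $rotr->($x,$sigma0[1])
                ^ ($x>>$sigma0[2])) & 0xffffffff;
}
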
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;

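# Frame layout, offsets from $sp after the $STU adjustment below:
# [0,6*$SIZE_T) is the ABI linkage area, [$LOCALS,$LOCALS+16*$SZ) an
# aligned copy buffer for input blocks that cross a page boundary, and
# the top 26 $SIZE_T-sized slots (a few of them unused) hold saved
# r14-r31 plus the ctx/inp/num spills.
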
$sp ="r1";
$toc="r2";
$ctx="r3";      # zapped by $a0
$inp="r4";      # zapped by $a1
$num="r5";      # zapped by $t0

$T  ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";

$A  ="r8";
$B  ="r9";
$C  ="r10";
$D  ="r11";
$E  ="r12";
$F  =$t1;       $t1 = "r0";     # stay away from "r13";
$G  ="r14";
$H  ="r15";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

$inp="r31" if($SZ==4 || $SIZE_T==8);    # reassigned $inp! aliases with @X[15]

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
        $ROR    $a0,$e,$Sigma1[0]
        $ROR    $a1,$e,$Sigma1[1]
        and     $t0,$f,$e
        xor     $a0,$a0,$a1
        add     $h,$h,$t1
        andc    $t1,$g,$e
        $ROR    $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
        or      $t0,$t0,$t1             ; Ch(e,f,g)
        add     $h,$h,@X[$i%16]
        xor     $a0,$a0,$a1             ; Sigma1(e)
        add     $h,$h,$t0
        add     $h,$h,$a0

        $ROR    $a0,$a,$Sigma0[0]
        $ROR    $a1,$a,$Sigma0[1]
        and     $t0,$a,$b
        and     $t1,$a,$c
        xor     $a0,$a0,$a1
        $ROR    $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
        xor     $t0,$t0,$t1
        and     $t1,$b,$c
        xor     $a0,$a0,$a1             ; Sigma0(a)
        add     $d,$d,$h
        xor     $t0,$t0,$t1             ; Maj(a,b,c)
___
$code.=<<___ if ($i<15);
        $LD     $t1,`($i+1)*$SZ`($Tbl)
___
$code.=<<___;
        add     $h,$h,$a0
        add     $h,$h,$t0

___
}

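# The bitwise identities the round above relies on, as plain Perl
# (and/andc/or for Ch, three ands and two xors for Maj); ref_ch_maj is
# an illustrative helper of ours and is never called:
sub ref_ch_maj {
        my ($e,$f,$g,$a,$b,$c) = @_;
        my $ch  = ($f & $e) | ($g & ~$e);               # Ch(e,f,g) = (e&f)^(~e&g)
        my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);    # Maj(a,b,c)
        return ($ch,$maj);
}
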
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
        $ROR    $a0,@X[($i+1)%16],$sigma0[0]
        $ROR    $a1,@X[($i+1)%16],$sigma0[1]
        $ROR    $t0,@X[($i+14)%16],$sigma1[0]
        $ROR    $t1,@X[($i+14)%16],$sigma1[1]
        xor     $a0,$a0,$a1
        $SHR    $a1,@X[($i+1)%16],$sigma0[2]
        xor     $t0,$t0,$t1
        $SHR    $t1,@X[($i+14)%16],$sigma1[2]
        add     @X[$i],@X[$i],@X[($i+9)%16]
        xor     $a0,$a0,$a1             ; sigma0(X[(i+1)&0x0f])
        xor     $t0,$t0,$t1             ; sigma1(X[(i+14)&0x0f])
        $LD     $t1,`$i*$SZ`($Tbl)
        add     @X[$i],@X[$i],$a0
        add     @X[$i],@X[$i],$t0
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}

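# ROUND_16_xx implements the standard SHA-2 message schedule over a
# 16-entry rolling window: with j = i mod 16 it computes
#       X[j] += sigma1(X[(j+14) mod 16]) + X[(j+9) mod 16]
#             + sigma0(X[(j+1) mod 16])
# which is just W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# with the window indices reduced mod 16.
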
$code=<<___;
.machine        "any"
.text

.globl  $func
.align  6
$func:
        $STU    $sp,-$FRAME($sp)
        mflr    r0
        $SHL    $num,$num,`log(16*$SZ)/log(2)`

        $PUSH   $ctx,`$FRAME-$SIZE_T*22`($sp)

        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
        $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
        $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
        $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
        $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
        $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
        $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
        $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
        $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
        $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
        $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
        $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
        $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
        $PUSH   r0,`$FRAME+$LRSAVE`($sp)
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
        $LD     $A,`0*$SZ`($ctx)
        mr      $inp,r4                         ; incarnate $inp
        $LD     $B,`1*$SZ`($ctx)
        $LD     $C,`2*$SZ`($ctx)
        $LD     $D,`3*$SZ`($ctx)
        $LD     $E,`4*$SZ`($ctx)
        $LD     $F,`5*$SZ`($ctx)
        $LD     $G,`6*$SZ`($ctx)
        $LD     $H,`7*$SZ`($ctx)
___
} else {
  for ($i=16;$i<32;$i++) {
    $code.=<<___;
        lwz     r$i,`4*($i-16)`($ctx)
___
  }
}

$code.=<<___;
        bl      LPICmeup
LPICedup:
        andi.   r0,$inp,3
        bne     Lunaligned
Laligned:
        add     $num,$inp,$num
        $PUSH   $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        b       Ldone

; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access that crosses a page boundary. The "better
; safe than sorry" principle makes me treat that case specially. I
; don't look for the particular offending word, though, but rather for
; the input block that crosses the boundary. Once found, that block is
; copied to an aligned buffer and hashed separately...
.align  4
Lunaligned:
        subfic  $t1,$inp,4096
        andi.   $t1,$t1,`4096-16*$SZ`   ; distance to closest page boundary
        beq     Lcross_page
        $UCMP   $num,$t1
        ble-    Laligned                ; didn't cross the page boundary
        subfc   $num,$t1,$num
        add     $t1,$inp,$t1
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real remaining num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; intermediate end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        ; $inp equals the intermediate end pointer here
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real remaining num
Lcross_page:
        li      $t1,`16*$SZ/4`
        mtctr   $t1
___
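# In Perl terms the Lunaligned arithmetic above computes
#       $dist = (4096 - $inp) & (4096 - 16*$SZ);
# i.e. the distance to the next 4KB page boundary rounded down to a
# whole number of input blocks; zero means the very next block would
# straddle the boundary.
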
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
        addi    r20,$sp,$LOCALS                 ; aligned spot below the frame
Lmemcpy:
        lbz     r16,0($inp)
        lbz     r17,1($inp)
        lbz     r18,2($inp)
        lbz     r19,3($inp)
        addi    $inp,$inp,4
        stb     r16,0(r20)
        stb     r17,1(r20)
        stb     r18,2(r20)
        stb     r19,3(r20)
        addi    r20,r20,4
        bdnz    Lmemcpy
___
} else {
$code.=<<___;
        addi    r12,$sp,$LOCALS                 ; aligned spot below the frame
Lmemcpy:
        lbz     r8,0($inp)
        lbz     r9,1($inp)
        lbz     r10,2($inp)
        lbz     r11,3($inp)
        addi    $inp,$inp,4
        stb     r8,0(r12)
        stb     r9,1(r12)
        stb     r10,2(r12)
        stb     r11,3(r12)
        addi    r12,r12,4
        bdnz    Lmemcpy
___
}

$code.=<<___;
        $PUSH   $inp,`$FRAME-$SIZE_T*26`($sp)   ; save real inp
        addi    $t1,$sp,`$LOCALS+16*$SZ`        ; fictitious end pointer
        addi    $inp,$sp,$LOCALS                ; fictitious inp pointer
        $PUSH   $num,`$FRAME-$SIZE_T*25`($sp)   ; save real num
        $PUSH   $t1,`$FRAME-$SIZE_T*24`($sp)    ; end pointer
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        bl      Lsha2_block_private
        $POP    $inp,`$FRAME-$SIZE_T*26`($sp)   ; restore real inp
        $POP    $num,`$FRAME-$SIZE_T*25`($sp)   ; restore real num
        addic.  $num,$num,`-16*$SZ`             ; num--
        bne-    Lunaligned

Ldone:
        $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
        $POP    r15,`$FRAME-$SIZE_T*17`($sp)
        $POP    r16,`$FRAME-$SIZE_T*16`($sp)
        $POP    r17,`$FRAME-$SIZE_T*15`($sp)
        $POP    r18,`$FRAME-$SIZE_T*14`($sp)
        $POP    r19,`$FRAME-$SIZE_T*13`($sp)
        $POP    r20,`$FRAME-$SIZE_T*12`($sp)
        $POP    r21,`$FRAME-$SIZE_T*11`($sp)
        $POP    r22,`$FRAME-$SIZE_T*10`($sp)
        $POP    r23,`$FRAME-$SIZE_T*9`($sp)
        $POP    r24,`$FRAME-$SIZE_T*8`($sp)
        $POP    r25,`$FRAME-$SIZE_T*7`($sp)
        $POP    r26,`$FRAME-$SIZE_T*6`($sp)
        $POP    r27,`$FRAME-$SIZE_T*5`($sp)
        $POP    r28,`$FRAME-$SIZE_T*4`($sp)
        $POP    r29,`$FRAME-$SIZE_T*3`($sp)
        $POP    r30,`$FRAME-$SIZE_T*2`($sp)
        $POP    r31,`$FRAME-$SIZE_T*1`($sp)
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
        .long   0
        .byte   0,12,4,1,0x80,18,3,0
        .long   0
___

if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
.align  4
Lsha2_block_private:
        $LD     $t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN);
        lwz     @X[$i],`$i*$SZ`($inp)
___
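# The rotlwi/rlwimi triplet below is the classic 3-instruction 32-bit
# byte swap: rotating left by 8 moves every byte one position, then the
# two rlwimi inserts patch bytes 0 and 2 from the value rotated left by
# 24, yielding bswap32 without an extra scratch register.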
$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN);
        lwz     $a0,`$i*$SZ`($inp)
        rotlwi  @X[$i],$a0,8
        rlwimi  @X[$i],$a0,24,0,7
        rlwimi  @X[$i],$a0,24,16,23
___
# 64-bit loads are split into two 32-bit ones, as the CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN);
        lwz     $t0,`$i*$SZ`($inp)
        lwz     @X[$i],`$i*$SZ+4`($inp)
        insrdi  @X[$i],$t0,32,0
___
$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN);
        lwz     $a0,`$i*$SZ`($inp)
         lwz    $a1,`$i*$SZ+4`($inp)
        rotlwi  $t0,$a0,8
         rotlwi @X[$i],$a1,8
        rlwimi  $t0,$a0,24,0,7
         rlwimi @X[$i],$a1,24,0,7
        rlwimi  $t0,$a0,24,16,23
         rlwimi @X[$i],$a1,24,16,23
        insrdi  @X[$i],$t0,32,0
___
        &ROUND_00_15($i,@V);
        unshift(@V,pop(@V));
}
$code.=<<___;
        li      $t0,`$rounds/16-1`
        mtctr   $t0
.align  4
Lrounds:
        addi    $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
        &ROUND_16_xx($i,@V);
        unshift(@V,pop(@V));
}
$code.=<<___;
        bdnz-   Lrounds

        $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
        $POP    $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        $POP    $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        subi    $Tbl,$Tbl,`($rounds-16)*$SZ`    ; rewind Tbl

        $LD     r16,`0*$SZ`($ctx)
        $LD     r17,`1*$SZ`($ctx)
        $LD     r18,`2*$SZ`($ctx)
        $LD     r19,`3*$SZ`($ctx)
        $LD     r20,`4*$SZ`($ctx)
        $LD     r21,`5*$SZ`($ctx)
        $LD     r22,`6*$SZ`($ctx)
        addi    $inp,$inp,`16*$SZ`              ; advance inp
        $LD     r23,`7*$SZ`($ctx)
        add     $A,$A,r16
        add     $B,$B,r17
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)
        add     $C,$C,r18
        $ST     $A,`0*$SZ`($ctx)
        add     $D,$D,r19
        $ST     $B,`1*$SZ`($ctx)
        add     $E,$E,r20
        $ST     $C,`2*$SZ`($ctx)
        add     $F,$F,r21
        $ST     $D,`3*$SZ`($ctx)
        add     $G,$G,r22
        $ST     $E,`4*$SZ`($ctx)
        add     $H,$H,r23
        $ST     $F,`5*$SZ`($ctx)
        $ST     $G,`6*$SZ`($ctx)
        $UCMP   $inp,$num
        $ST     $H,`7*$SZ`($ctx)
        bne     Lsha2_block_private
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.size   $func,.-$func
___
} else {
########################################################################
# SHA512 for PPC32, the X vector is off-loaded to the stack...
#
#                       |       sha512
#                       |       -m32
# ----------------------+-----------------------
# PPC74x0,gcc-4.0.1     |       +48%
# POWER6,gcc-4.4.6      |       +124%(*)
# POWER7,gcc-4.4.6      |       +79%(*)
# e300,gcc-4.1.0        |       +167%
#
# (*)   ~1/3 of the -m64 result [and ~20% better than -m32 code
#       generated by xlc-12.1]

my $XOFF=$LOCALS;

my @V=map("r$_",(16..31));      # A..H

my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
my ($x0,$x1)=("r3","r4");       # zaps $ctx and $inp

sub ROUND_00_15_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
        $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
        lwz     $t2,`$SZ*($i%16)+4`($Tbl)
         xor    $a0,$flo,$glo
        lwz     $t3,`$SZ*($i%16)+0`($Tbl)
         xor    $a1,$fhi,$ghi
        addc    $hlo,$hlo,$t0                   ; h+=x[i]
        stw     $t0,`$XOFF+0+$SZ*($i%16)`($sp)  ; save x[i]

        srwi    $s0,$elo,$Sigma1[0]
        srwi    $s1,$ehi,$Sigma1[0]
         and    $a0,$a0,$elo
        adde    $hhi,$hhi,$t1
         and    $a1,$a1,$ehi
        stw     $t1,`$XOFF+4+$SZ*($i%16)`($sp)
        srwi    $t0,$elo,$Sigma1[1]
        srwi    $t1,$ehi,$Sigma1[1]
         addc   $hlo,$hlo,$t2                   ; h+=K512[i]
        insrwi  $s0,$ehi,$Sigma1[0],0
        insrwi  $s1,$elo,$Sigma1[0],0
         xor    $a0,$a0,$glo                    ; Ch(e,f,g)
         adde   $hhi,$hhi,$t3
         xor    $a1,$a1,$ghi
        insrwi  $t0,$ehi,$Sigma1[1],0
        insrwi  $t1,$elo,$Sigma1[1],0
         addc   $hlo,$hlo,$a0                   ; h+=Ch(e,f,g)
        srwi    $t2,$ehi,$Sigma1[2]-32
        srwi    $t3,$elo,$Sigma1[2]-32
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
        insrwi  $t2,$elo,$Sigma1[2]-32,0
        insrwi  $t3,$ehi,$Sigma1[2]-32,0
         xor    $a0,$alo,$blo                   ; a^b, b^c in next round
         adde   $hhi,$hhi,$a1
         xor    $a1,$ahi,$bhi
        xor     $s0,$s0,$t2                     ; Sigma1(e)
        xor     $s1,$s1,$t3

        srwi    $t0,$alo,$Sigma0[0]
         and    $a2,$a2,$a0
         addc   $hlo,$hlo,$s0                   ; h+=Sigma1(e)
         and    $a3,$a3,$a1
        srwi    $t1,$ahi,$Sigma0[0]
        srwi    $s0,$ahi,$Sigma0[1]-32
         adde   $hhi,$hhi,$s1
        srwi    $s1,$alo,$Sigma0[1]-32
        insrwi  $t0,$ahi,$Sigma0[0],0
        insrwi  $t1,$alo,$Sigma0[0],0
         xor    $a2,$a2,$blo                    ; Maj(a,b,c)
         addc   $dlo,$dlo,$hlo                  ; d+=h
         xor    $a3,$a3,$bhi
        insrwi  $s0,$alo,$Sigma0[1]-32,0
        insrwi  $s1,$ahi,$Sigma0[1]-32,0
         adde   $dhi,$dhi,$hhi
        srwi    $t2,$ahi,$Sigma0[2]-32
        srwi    $t3,$alo,$Sigma0[2]-32
        xor     $s0,$s0,$t0
         addc   $hlo,$hlo,$a2                   ; h+=Maj(a,b,c)
        xor     $s1,$s1,$t1
        insrwi  $t2,$alo,$Sigma0[2]-32,0
        insrwi  $t3,$ahi,$Sigma0[2]-32,0
         adde   $hhi,$hhi,$a3
___
$code.=<<___ if ($i>=15);
        lwz     $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
        lwz     $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
___
$code.=<<___ if ($i<15);
        lwz     $t1,`$SZ*($i+1)+0`($inp)
        lwz     $t0,`$SZ*($i+1)+4`($inp)
___
$code.=<<___;
        xor     $s0,$s0,$t2                     ; Sigma0(a)
        xor     $s1,$s1,$t3
        addc    $hlo,$hlo,$s0                   ; h+=Sigma0(a)
        adde    $hhi,$hhi,$s1
___
$code.=<<___ if ($i==15);
        lwz     $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
        lwz     $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
___
}
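# A note on the 64-bit emulation above: additions are addc/adde carry
# chains, and a 64-bit rotate-right by n<32 of a hi:lo pair is done per
# half as "srwi dst,src,n" followed by "insrwi dst,other,n,0", i.e. the
# n bits shifted out of one half re-enter at the top of the other. For
# n>=32 the two halves simply swap roles and n-32 is used instead.
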
sub ROUND_16_xx_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
        $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;

$code.=<<___;
        srwi    $s0,$t0,$sigma0[0]
        srwi    $s1,$t1,$sigma0[0]
        srwi    $t2,$t0,$sigma0[1]
        srwi    $t3,$t1,$sigma0[1]
        insrwi  $s0,$t1,$sigma0[0],0
        insrwi  $s1,$t0,$sigma0[0],0
        srwi    $a0,$t0,$sigma0[2]
        insrwi  $t2,$t1,$sigma0[1],0
        insrwi  $t3,$t0,$sigma0[1],0
        insrwi  $a0,$t1,$sigma0[2],0
        xor     $s0,$s0,$t2
         lwz    $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
        srwi    $a1,$t1,$sigma0[2]
        xor     $s1,$s1,$t3
         lwz    $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
        xor     $a0,$a0,$s0
         srwi   $s0,$t2,$sigma1[0]
        xor     $a1,$a1,$s1
         srwi   $s1,$t3,$sigma1[0]
        addc    $x0,$x0,$a0                     ; x[i]+=sigma0(x[i+1])
         srwi   $a0,$t3,$sigma1[1]-32
        insrwi  $s0,$t3,$sigma1[0],0
        insrwi  $s1,$t2,$sigma1[0],0
        adde    $x1,$x1,$a1
         srwi   $a1,$t2,$sigma1[1]-32

        insrwi  $a0,$t2,$sigma1[1]-32,0
        srwi    $t2,$t2,$sigma1[2]
        insrwi  $a1,$t3,$sigma1[1]-32,0
        insrwi  $t2,$t3,$sigma1[2],0
        xor     $s0,$s0,$a0
         lwz    $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
        srwi    $t3,$t3,$sigma1[2]
        xor     $s1,$s1,$a1
         lwz    $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
        xor     $s0,$s0,$t2
         addc   $x0,$x0,$a0                     ; x[i]+=x[i+9]
        xor     $s1,$s1,$t3
         adde   $x1,$x1,$a1
        addc    $x0,$x0,$s0                     ; x[i]+=sigma1(x[i+14])
        adde    $x1,$x1,$s1
___
        ($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
        &ROUND_00_15_ppc32(@_);
}

$code.=<<___;
.align  4
Lsha2_block_private:
        lwz     $t1,0($inp)
        xor     $a2,@V[3],@V[5]         ; B^C, magic seed
        lwz     $t0,4($inp)
        xor     $a3,@V[2],@V[4]
___
for($i=0;$i<16;$i++) {
        &ROUND_00_15_ppc32($i,@V);
        unshift(@V,pop(@V));    unshift(@V,pop(@V));
        ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
        li      $a0,`$rounds/16-1`
        mtctr   $a0
.align  4
Lrounds:
        addi    $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
        &ROUND_16_xx_ppc32($i,@V);
        unshift(@V,pop(@V));    unshift(@V,pop(@V));
        ($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
        bdnz-   Lrounds

        $POP    $ctx,`$FRAME-$SIZE_T*22`($sp)
        $POP    $inp,`$FRAME-$SIZE_T*23`($sp)   ; inp pointer
        $POP    $num,`$FRAME-$SIZE_T*24`($sp)   ; end pointer
        subi    $Tbl,$Tbl,`($rounds-16)*$SZ`    ; rewind Tbl

        lwz     $t0,0($ctx)
        lwz     $t1,4($ctx)
        lwz     $t2,8($ctx)
        lwz     $t3,12($ctx)
        lwz     $a0,16($ctx)
        lwz     $a1,20($ctx)
        lwz     $a2,24($ctx)
        addc    @V[1],@V[1],$t1
        lwz     $a3,28($ctx)
        adde    @V[0],@V[0],$t0
        lwz     $t0,32($ctx)
        addc    @V[3],@V[3],$t3
        lwz     $t1,36($ctx)
        adde    @V[2],@V[2],$t2
        lwz     $t2,40($ctx)
        addc    @V[5],@V[5],$a1
        lwz     $t3,44($ctx)
        adde    @V[4],@V[4],$a0
        lwz     $a0,48($ctx)
        addc    @V[7],@V[7],$a3
        lwz     $a1,52($ctx)
        adde    @V[6],@V[6],$a2
        lwz     $a2,56($ctx)
        addc    @V[9],@V[9],$t1
        lwz     $a3,60($ctx)
        adde    @V[8],@V[8],$t0
        stw     @V[0],0($ctx)
        stw     @V[1],4($ctx)
        addc    @V[11],@V[11],$t3
        stw     @V[2],8($ctx)
        stw     @V[3],12($ctx)
        adde    @V[10],@V[10],$t2
        stw     @V[4],16($ctx)
        stw     @V[5],20($ctx)
        addc    @V[13],@V[13],$a1
        stw     @V[6],24($ctx)
        stw     @V[7],28($ctx)
        adde    @V[12],@V[12],$a0
        stw     @V[8],32($ctx)
        stw     @V[9],36($ctx)
        addc    @V[15],@V[15],$a3
        stw     @V[10],40($ctx)
        stw     @V[11],44($ctx)
        adde    @V[14],@V[14],$a2
        stw     @V[12],48($ctx)
        stw     @V[13],52($ctx)
        stw     @V[14],56($ctx)
        stw     @V[15],60($ctx)

        addi    $inp,$inp,`16*$SZ`              ; advance inp
        $PUSH   $inp,`$FRAME-$SIZE_T*23`($sp)
        $UCMP   $inp,$num
        bne     Lsha2_block_private
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
.size   $func,.-$func
___
}

# Ugly hack here, because PPC assembler syntax seems to vary too much
# from platform to platform...
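# LPICmeup is position-independent table addressing: "bcl 20,31" is an
# always-taken branch-and-link to the next instruction (the idiom is
# recognized by processors and doesn't disturb the return-address
# predictor), so the following mflr picks up the current address, and
# the addi skips the remaining `64-8` bytes of code and padding to land
# on the first K table entry.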
$code.=<<___;
.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl    ; vvvvvv "distance" between . and 1st data entry
        addi    $Tbl,$Tbl,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
___
$code.=<<___ if ($SZ==8);
        .quad   0x428a2f98d728ae22,0x7137449123ef65cd
        .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
        .quad   0x3956c25bf348b538,0x59f111f1b605d019
        .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
        .quad   0xd807aa98a3030242,0x12835b0145706fbe
        .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
        .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
        .quad   0x9bdc06a725c71235,0xc19bf174cf692694
        .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
        .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
        .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
        .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
        .quad   0x983e5152ee66dfab,0xa831c66d2db43210
        .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
        .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
        .quad   0x06ca6351e003826f,0x142929670a0e6e70
        .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
        .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
        .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
        .quad   0x81c2c92e47edaee6,0x92722c851482353b
        .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
        .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
        .quad   0xd192e819d6ef5218,0xd69906245565a910
        .quad   0xf40e35855771202a,0x106aa07032bbd1b8
        .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
        .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
        .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
        .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
        .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
        .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
        .quad   0x90befffa23631e28,0xa4506cebde82bde9
        .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
        .quad   0xca273eceea26619c,0xd186b8c721c0c207
        .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
        .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
        .quad   0x113f9804bef90dae,0x1b710b35131c471b
        .quad   0x28db77f523047d84,0x32caab7b40c72493
        .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
        .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
        .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___ if ($SZ==4);
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;