SHA1 assembler show off: minor performance updates and new modules for forgotten CPUs.

author     Andy Polyakov <appro@openssl.org>
           Sun, 15 Nov 2009 17:26:11 +0000 (17:26 +0000)
committer  Andy Polyakov <appro@openssl.org>
           Sun, 15 Nov 2009 17:26:11 +0000 (17:26 +0000)

crypto/sha/asm/sha1-alpha.pl [new file with mode: 0644]
crypto/sha/asm/sha1-armv4-large.pl
crypto/sha/asm/sha1-ia64.pl
crypto/sha/asm/sha1-mips.pl [new file with mode: 0644]
crypto/sha/asm/sha1-parisc.pl [new file with mode: 0644]

diff --git a/crypto/sha/asm/sha1-alpha.pl b/crypto/sha/asm/sha1-alpha.pl
new file mode 100644 (file)
index 0000000..dd9b43b
--- /dev/null
@@ -0,0 +1,314 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by the vendor
+# compiler, and 75% better than GCC [3.4]. The implementation features
+# a vectorized byte swap, but not a vectorized Xupdate.
+
+@X=(   "\$0",  "\$1",  "\$2",  "\$3",  "\$4",  "\$5",  "\$6",  "\$7",
+       "\$8",  "\$9",  "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
+$ctx="a0";     # $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4";       # 20
+$C="a5";
+$D="t8";
+$E="t9";       @V=($A,$B,$C,$D,$E);
+$t0="t10";     # 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT";       # 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+       ldq_u   @X[0],0+0($inp)
+       ldq_u   @X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+       ldq_u   @X[$i+2],($i+2)*4+0($inp)
+       ldq_u   @X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+       extql   @X[$i],$inp,@X[$i]
+       extqh   @X[$i+1],$inp,@X[$i+1]
+
+       or      @X[$i+1],@X[$i],@X[$i]  # pair of 32-bit values are fetched
+
+       srl     @X[$i],24,$t0           # vectorized byte swap
+       srl     @X[$i],8,$t2
+
+       sll     @X[$i],8,$t3
+       sll     @X[$i],24,@X[$i]
+       zapnot  $t0,0x11,$t0
+       zapnot  $t2,0x22,$t2
+
+       zapnot  @X[$i],0x88,@X[$i]
+       or      $t0,$t2,$t0
+       zapnot  $t3,0x44,$t3
+       sll     $a,5,$t1
+
+       or      @X[$i],$t0,@X[$i]
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       zapnot  $a,0xf,$a
+
+       or      @X[$i],$t3,@X[$i]
+       srl     $a,27,$t0
+       bic     $d,$b,$t3
+       sll     $b,30,$b
+
+       extll   @X[$i],4,@X[$i+1]       # extract upper half
+       or      $t2,$t3,$t2
+       addl    @X[$i],$e,$e
+
+       addl    $t1,$e,$e
+       srl     $b,32,$t3
+       zapnot  @X[$i],0xf,@X[$i]
+
+       addl    $t0,$e,$e
+       addl    $t2,$e,$e
+       or      $t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       zapnot  $a,0xf,$a
+
+       srl     $a,27,$t0
+       addl    @X[$i%16],$e,$e
+       bic     $d,$b,$t3
+       sll     $b,30,$b
+
+       or      $t2,$t3,$t2
+       addl    $t1,$e,$e
+       srl     $b,32,$t3
+       zapnot  @X[$i],0xf,@X[$i]
+
+       addl    $t0,$e,$e
+       addl    $t2,$e,$e
+       or      $t3,$b,$b
+___
+$code.=<<___ if ($i>=15);      # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       zapnot  $a,0xf,$a
+       addl    @X[$i%16],$e,$e
+       bic     $d,$b,$t3
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       srl     $a,27,$t0
+       addl    $t1,$e,$e
+       or      $t2,$t3,$t2
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$b
+       addl    $t0,$e,$e
+       srl     @X[$j%16],31,$t1
+
+       addl    $t2,$e,$e
+       srl     $b,32,$t3
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+       or      $t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);       # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$t3
+       addl    $t1,$e,$e
+       xor     $b,$c,$t2
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       srl     $b,2,$b
+       addl    @X[$i%16],$e,$e
+       xor     $d,$t2,$t2
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       srl     @X[$j%16],31,$t1
+       addl    $t2,$e,$e
+       srl     $a,27,$t0
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       addl    $t0,$e,$e
+       or      $t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79);      # with context fetch
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       ldl     @X[0],0($ctx)
+
+       sll     $b,30,$t3
+       addl    $t1,$e,$e
+       xor     $b,$c,$t2
+       ldl     @X[1],4($ctx)
+
+       srl     $b,2,$b
+       addl    @X[$i%16],$e,$e
+       xor     $d,$t2,$t2
+       ldl     @X[2],8($ctx)
+
+       srl     $a,27,$t0
+       addl    $t2,$e,$e
+       ldl     @X[3],12($ctx)
+
+       or      $t3,$b,$b
+       addl    $t0,$e,$e
+       ldl     @X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;  # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       srl     $a,27,$t0
+       and     $b,$c,$t2
+       and     $b,$d,$t3
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$b
+       addl    $t1,$e,$e
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       srl     @X[$j%16],31,$t1
+       addl    $t0,$e,$e
+       or      $t2,$t3,$t2
+       and     $c,$d,$t3
+
+       or      $t2,$t3,$t2
+       srl     $b,32,$t3
+       addl    @X[$i%16],$e,$e
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       addl    $t2,$e,$e
+       or      $t1,@X[$j%16],@X[$j%16]
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set   noat
+.set   noreorder
+.globl sha1_block_data_order
+.align 5
+.ent   sha1_block_data_order
+sha1_block_data_order:
+       lda     sp,-64(sp)
+       stq     ra,0(sp)
+       stq     s0,8(sp)
+       stq     s1,16(sp)
+       stq     s2,24(sp)
+       stq     s3,32(sp)
+       stq     s4,40(sp)
+       stq     s5,48(sp)
+       stq     fp,56(sp)
+       .mask   0x0400fe00,-64
+       .frame  sp,64,ra
+       .prologue 0
+
+       ldl     $A,0($ctx)
+       ldl     $B,4($ctx)
+       sll     $num,6,$num
+       ldl     $C,8($ctx)
+       ldl     $D,12($ctx)
+       ldl     $E,16($ctx)
+       addq    $inp,$num,$num
+
+.Lloop:
+       .set    noreorder
+       ldah    $K,23170(zero)
+       zapnot  $B,0xf,$B
+       lda     $K,31129($K)    # K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,28378(zero)
+       lda     $K,-5215($K)    # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,-28900(zero)
+       lda     $K,-17188($K)   # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,-13725(zero)
+       lda     $K,-15914($K)   # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       addl    @X[0],$A,$A
+       addl    @X[1],$B,$B
+       addl    @X[2],$C,$C
+       addl    @X[3],$D,$D
+       addl    @X[4],$E,$E
+       stl     $A,0($ctx)
+       stl     $B,4($ctx)
+       addq    $inp,64,$inp
+       stl     $C,8($ctx)
+       stl     $D,12($ctx)
+       stl     $E,16($ctx)
+       cmpult  $inp,$num,$t1
+       bne     $t1,.Lloop
+
+       .set    noreorder
+       ldq     ra,0(sp)
+       ldq     s0,8(sp)
+       ldq     s1,16(sp)
+       ldq     s2,24(sp)
+       ldq     s3,32(sp)
+       ldq     s4,40(sp)
+       ldq     s5,48(sp)
+       ldq     fp,56(sp)
+       lda     sp,64(sp)
+       ret     (ra)
+.end   sha1_block_data_order
+___
+print $code;
+close STDOUT;
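
The round bodies above interleave a "forward Xupdate" with each step: the SHA-1 message schedule W[t] = ROTATE(W[t-3]^W[t-8]^W[t-14]^W[t-16],1), kept in a 16-word ring so that with j = t mod 16 the taps become j, (j+2)%16, (j+8)%16 and (j+13)%16, exactly the offsets in the xors above. A minimal scalar sketch in Perl, for reference only (the sub name is illustrative; the module spreads these operations across adjacent rounds instead of computing them in one place):

	# W[t] = ROTATE(W[t-3]^W[t-8]^W[t-14]^W[t-16],1), in a 16-word ring:
	# with j = t % 16 the taps are j, (j+2)%16, (j+8)%16 and (j+13)%16.
	sub Xupdate {
	    my ($X, $j) = @_;               # $X: reference to 16-element ring
	    my $x = $X->[$j % 16]       ^ $X->[($j + 2) % 16]
	          ^ $X->[($j + 8) % 16] ^ $X->[($j + 13) % 16];
	    $X->[$j % 16] = (($x << 1) | ($x >> 31)) & 0xffffffff; # ROTATE(x,1)
	    return $X->[$j % 16];
	}
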
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index c5c4480f94d267c6db389874045f64877cb48410..f99606a32ea50e22197fdd4b7643519b1d29339d 100644 (file)
@@ -86,8 +86,8 @@ $code.=<<___;
        ldr     $t3,[$Xi,#2*4]
        add     $e,$K,$e,ror#2                  @ E+=K_xx_xx
        eor     $t0,$t0,$t1
+       eor     $t2,$t2,$t3
        eor     $t0,$t0,$t2
-       eor     $t0,$t0,$t3
        add     $e,$e,$a,ror#27                 @ E+=ROR(A,27)
 ___
 $code.=<<___ if (!defined($flag));
@@ -131,6 +131,15 @@ ___
 
 sub BODY_40_59 {
 my ($a,$b,$c,$d,$e)=@_;
+if (1) {
+       &Xupdate(@_);
+$code.=<<___;
+       and     $t2,$c,$d
+       and     $t1,$b,$t1,ror#2
+       add     $e,$e,$t2,ror#2
+       add     $e,$e,$t1                       @ E+=F_40_59(B,C,D)
+___
+} else {
        &Xupdate(@_,1);
 $code.=<<___;
        and     $t1,$b,$c,ror#2
@@ -140,6 +149,7 @@ $code.=<<___;
        add     $e,$e,$t1                       @ E+=F_40_59(B,C,D)
 ___
 }
+}
 
 $code=<<___;
 .text
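
The restructured F_40_59 bodies in this commit (explicit in the MIPS, PA-RISC and IA-64 modules below, and apparently what the $t1/$t2 shuffle above implements on ARM) rely on the identity Maj(b,c,d) = (b&c)|(b&d)|(c&d) = (c&d) + (b&(c^d)): the two summands never share a set bit, so two adds into E replace the three-AND/two-OR form. A quick exhaustive check, for reference:

	# Maj(b,c,d) = (b&c)|(b&d)|(c&d) == (c&d) + (b&(c^d));
	# the summands are bitwise disjoint, so '+' cannot carry.
	for my $b (0, 1) { for my $c (0, 1) { for my $d (0, 1) {
	    my $maj = ($b & $c) | ($b & $d) | ($c & $d);
	    my $alt = ($c & $d) + ($b & ($c ^ $d));
	    die "mismatch at b=$b c=$c d=$d" if $maj != $alt;
	}}}
	print "F_40_59 identity holds\n";
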
diff --git a/crypto/sha/asm/sha1-ia64.pl b/crypto/sha/asm/sha1-ia64.pl
index 51c4f47ecbdc215d6457c8d3c1ac3d16e9db9733..db28f0805a11c568e468d2cb7d8a46280c70237c 100644 (file)
@@ -15,7 +15,7 @@
 # is >50% better than HP C and >2x better than gcc.
 
 $code=<<___;
-.ident  \"sha1-ia64.s, version 1.2\"
+.ident  \"sha1-ia64.s, version 1.3\"
 .ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
 .explicit
 
@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
     $ADDP="addp4";
     for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
 } else { $ADDP="add"; }
-for (@ARGV) {  $big_endian=1 if (/\-DB_ENDIAN/);
-               $big_endian=0 if (/\-DL_ENDIAN/);   }
-if (!defined($big_endian))
-           {   $big_endian=(unpack('L',pack('N',1))==1);   }
 
 #$human=1;
 if ($human) {  # useful for visual code auditing...
-       ($A,$B,$C,$D,$E,$T)   = ("A","B","C","D","E","T");
+       ($A,$B,$C,$D,$E)   = ("A","B","C","D","E");
        ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
        ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
            (   "K_00_19","K_20_39","K_40_59","K_60_79" );
@@ -41,47 +37,50 @@ if ($human) {       # useful for visual code auditing...
                "X8", "X9","X10","X11","X12","X13","X14","X15"  );
 }
 else {
-       ($A,$B,$C,$D,$E,$T)   = ("loc0","loc1","loc2","loc3","loc4","loc5");
-       ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
+       ($A,$B,$C,$D,$E)   =    ("loc0","loc1","loc2","loc3","loc4");
+       ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
        ($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
-           (   "r14", "r15", "loc11", "loc12"  );
+           (   "r14", "r15", "loc10", "loc11"  );
        @X= (   "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
                "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"  );
 }
 
 sub BODY_00_15 {
 local  *code=shift;
-local  ($i,$a,$b,$c,$d,$e,$f)=@_;
+my     ($i,$a,$b,$c,$d,$e)=@_;
+my     $j=$i+1;
+my     $Xn=@X[$j%16];
 
 $code.=<<___ if ($i==0);
-{ .mmi;        ld1     $X[$i&0xf]=[inp],2          // MSB
+{ .mmi;        ld1     $X[$i]=[inp],2              // MSB
        ld1     tmp2=[tmp3],2           };;
 { .mmi;        ld1     tmp0=[inp],2
        ld1     tmp4=[tmp3],2               // LSB
-       dep     $X[$i&0xf]=$X[$i&0xf],tmp2,8,8  };;
+       dep     $X[$i]=$X[$i],tmp2,8,8  };;
 ___
 if ($i<15) {
        $code.=<<___;
-{ .mmi;        ld1     $X[($i+1)&0xf]=[inp],2      // +1
+{ .mmi;        ld1     $Xn=[inp],2                 // forward Xload
+       nop.m   0x0
        dep     tmp1=tmp0,tmp4,8,8      };;
-{ .mmi;        ld1     tmp2=[tmp3],2               // +1
+{ .mmi;        ld1     tmp2=[tmp3],2               // forward Xload
        and     tmp4=$c,$b
-       dep     $X[$i&0xf]=$X[$i&0xf],tmp1,16,16        } //;;
-{ .mmi;        andcm   tmp1=$d,$b
-       add     tmp0=$e,$K_00_19
+       dep     $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;        add     $e=$e,$K_00_19              // e+=K_00_19
+       andcm   tmp1=$d,$b
        dep.z   tmp5=$a,5,27            };; // a<<5
-{ .mmi;        or      tmp4=tmp4,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
-       add     $f=tmp0,$X[$i&0xf]          // f=xi+e+K_00_19
+{ .mmi;        add     $e=$e,$X[$i]                // e+=Xload
+       or      tmp4=tmp4,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
        extr.u  tmp1=$a,27,5            };; // a>>27
-{ .mmi;        ld1     tmp0=[inp],2                // +1
-       add     $f=$f,tmp4                  // f+=F_00_19(b,c,d)
+{ .mmi;        ld1     tmp0=[inp],2                // forward Xload
+       add     $e=$e,tmp4                  // e+=F_00_19(b,c,d)
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
-{ .mmi;        ld1     tmp4=[tmp3],2               // +1
+{ .mmi;        ld1     tmp4=[tmp3],2               // forward Xload
        or      tmp5=tmp1,tmp5              // ROTATE(a,5)
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp5                  // f+=ROTATE(a,5)
-       dep     $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8  // +1
-       mux2    $X[$i&0xf]=$X[$i&0xf],0x44      } //;;
+{ .mii;        add     $e=$e,tmp5                  // e+=ROTATE(a,5)
+       dep     $Xn=$Xn,tmp2,8,8            // forward Xload
+       mux2    $X[$i]=$X[$i],0x44      } //;;
 
 ___
        }
@@ -89,24 +88,24 @@ else        {
        $code.=<<___;
 { .mii;        and     tmp3=$c,$b
        dep     tmp1=tmp0,tmp4,8,8;;
-       dep     $X[$i&0xf]=$X[$i&0xf],tmp1,16,16        } //;;
-{ .mmi;        andcm   tmp1=$d,$b
-       add     tmp0=$e,$K_00_19
+       dep     $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;        add     $e=$e,$K_00_19              // e+=K_00_19
+       andcm   tmp1=$d,$b
        dep.z   tmp5=$a,5,27            };; // a<<5
-{ .mmi;        or      tmp4=tmp3,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
-       add     $f=tmp0,$X[$i&0xf]          // f=xi+e+K_00_19
+{ .mmi;        add     $e=$e,$X[$i]                // e+=Xupdate
+       or      tmp4=tmp3,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
        extr.u  tmp1=$a,27,5            }   // a>>27
-{ .mmi;        xor     tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]  // +1
-       xor     tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi;        xor     $Xn=$Xn,$X[($j+2)%16]       // forward Xupdate
+       xor     tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
        nop.i   0                       };;
-{ .mmi;        add     $f=$f,tmp4                  // f+=F_00_19(b,c,d)
-       xor     tmp2=tmp2,tmp3              // +1
+{ .mmi;        add     $e=$e,tmp4                  // e+=F_00_19(b,c,d)
+       xor     $Xn=$Xn,tmp3                // forward Xupdate
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
 { .mmi; or     tmp1=tmp1,tmp5              // ROTATE(a,5)
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp1                  // f+=ROTATE(a,5)
-       shrp    $e=tmp2,tmp2,31             // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-       mux2    $X[$i&0xf]=$X[$i&0xf],0x44  };;
+{ .mii;        add     $e=$e,tmp1                  // e+=ROTATE(a,5)
+       shrp    $Xn=$Xn,$Xn,31              // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+       mux2    $X[$i]=$X[$i],0x44      };;
 
 ___
        }
@@ -114,27 +113,28 @@ ___
 
 sub BODY_16_19 {
 local  *code=shift;
-local  ($i,$a,$b,$c,$d,$e,$f)=@_;
+my     ($i,$a,$b,$c,$d,$e)=@_;
+my     $j=$i+1;
+my     $Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;        mov     $X[$i&0xf]=$f               // Xupdate
-       and     tmp0=$c,$b
+{ .mib;        add     $e=$e,$K_00_19              // e+=K_00_19
        dep.z   tmp5=$a,5,27            }   // a<<5
-{ .mmi;        andcm   tmp1=$d,$b
-       add     tmp4=$e,$K_00_19        };;
-{ .mmi;        or      tmp0=tmp0,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
-       add     $f=$f,tmp4                  // f+=e+K_00_19
+{ .mib;        andcm   tmp1=$d,$b
+       and     tmp0=$c,$b              };;
+{ .mmi;        add     $e=$e,$X[$i%16]             // e+=Xupdate
+       or      tmp0=tmp0,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
        extr.u  tmp1=$a,27,5            }   // a>>27
-{ .mmi;        xor     tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]  // +1
-       xor     tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi;        xor     $Xn=$Xn,$X[($j+2)%16]       // forward Xupdate
+       xor     tmp3=$X[($j+8)%16],$X[($j+13)%16]       // forward Xupdate
        nop.i   0                       };;
-{ .mmi;        add     $f=$f,tmp0                  // f+=F_00_19(b,c,d)
-       xor     tmp2=tmp2,tmp3              // +1
+{ .mmi;        add     $e=$e,tmp0                  // e+=F_00_19(b,c,d)
+       xor     $Xn=$Xn,tmp3                // forward Xupdate
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
 { .mmi;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp1                  // f+=ROTATE(a,5)
-       shrp    $e=tmp2,tmp2,31             // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;        add     $e=$e,tmp1                  // e+=ROTATE(a,5)
+       shrp    $Xn=$Xn,$Xn,31              // ROTATE(x[0]^x[2]^x[8]^x[13],1)
        nop.i   0                       };;
 
 ___
@@ -142,49 +142,47 @@ ___
 
 sub BODY_20_39 {
 local  *code=shift;
-local  ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
+my     ($i,$a,$b,$c,$d,$e,$Konst)=@_;
        $Konst = $K_20_39 if (!defined($Konst));
+my     $j=$i+1;
+my     $Xn=@X[$j%16];
 
 if ($i<79) {
 $code.=<<___;
-{ .mib;        mov     $X[$i&0xf]=$f               // Xupdate
+{ .mib;        add     $e=$e,$Konst                // e+=K_XX_XX
        dep.z   tmp5=$a,5,27            }   // a<<5
 { .mib;        xor     tmp0=$c,$b
-       add     tmp4=$e,$Konst          };;
-{ .mmi;        xor     tmp0=tmp0,$d                // F_20_39(b,c,d)=b^c^d
-       add     $f=$f,tmp4                  // f+=e+K_20_39
+       xor     $Xn=$Xn,$X[($j+2)%16]   };; // forward Xupdate
+{ .mib;        add     $e=$e,$X[$i%16]             // e+=Xupdate
        extr.u  tmp1=$a,27,5            }   // a>>27
-{ .mmi;        xor     tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]  // +1
-       xor     tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
-       nop.i   0                       };;
-{ .mmi;        add     $f=$f,tmp0                  // f+=F_20_39(b,c,d)
-       xor     tmp2=tmp2,tmp3              // +1
+{ .mib;        xor     tmp0=tmp0,$d                // F_20_39(b,c,d)=b^c^d
+       xor     $Xn=$Xn,$X[($j+8)%16]   };; // forward Xupdate
+{ .mmi;        add     $e=$e,tmp0                  // e+=F_20_39(b,c,d)
+       xor     $Xn=$Xn,$X[($j+13)%16]      // forward Xupdate
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
 { .mmi;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp1                  // f+=ROTATE(a,5)
-       shrp    $e=tmp2,tmp2,31             // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;        add     $e=$e,tmp1                  // e+=ROTATE(a,5)
+       shrp    $Xn=$Xn,$Xn,31              // ROTATE(x[0]^x[2]^x[8]^x[13],1)
        nop.i   0                       };;
 
 ___
 }
 else {
 $code.=<<___;
-{ .mib;        mov     $X[$i&0xf]=$f               // Xupdate
+{ .mib;        add     $e=$e,$Konst                // e+=K_60_79
        dep.z   tmp5=$a,5,27            }   // a<<5
 { .mib;        xor     tmp0=$c,$b
-       add     tmp4=$e,$Konst          };;
-{ .mib;        xor     tmp0=tmp0,$d                // F_20_39(b,c,d)=b^c^d
-       extr.u  tmp1=$a,27,5            }   // a>>27
-{ .mib;        add     $f=$f,tmp4                  // f+=e+K_20_39
        add     $h1=$h1,$a              };; // wrap up
-{ .mmi;        add     $f=$f,tmp0                  // f+=F_20_39(b,c,d)
-       shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30) ;;?
-{ .mmi;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
+{ .mib;        add     $e=$e,$X[$i%16]             // e+=Xupdate
+       extr.u  tmp1=$a,27,5            }   // a>>27
+{ .mib;        xor     tmp0=tmp0,$d                // F_20_39(b,c,d)=b^c^d
        add     $h3=$h3,$c              };; // wrap up
-{ .mib;        add     tmp3=1,inp                  // used in unaligned codepath
-       add     $f=$f,tmp1              }   // f+=ROTATE(a,5)
-{ .mib;        add     $h2=$h2,$b                  // wrap up
+{ .mmi;        add     $e=$e,tmp0                  // e+=F_20_39(b,c,d)
+       or      tmp1=tmp1,tmp5              // ROTATE(a,5)
+       shrp    $b=tmp6,tmp6,2          };; // b=ROTATE(b,30) ;;?
+{ .mmi;        add     $e=$e,tmp1                  // e+=ROTATE(a,5)
+       add     tmp3=1,inp                  // used in unaligned codepath
        add     $h4=$h4,$d              };; // wrap up
 
 ___
@@ -193,29 +191,29 @@ ___
 
 sub BODY_40_59 {
 local  *code=shift;
-local  ($i,$a,$b,$c,$d,$e,$f)=@_;
+my     ($i,$a,$b,$c,$d,$e)=@_;
+my     $j=$i+1;
+my     $Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;        mov     $X[$i&0xf]=$f               // Xupdate
-       and     tmp0=$c,$b
+{ .mib;        add     $e=$e,$K_40_59              // e+=K_40_59
        dep.z   tmp5=$a,5,27            }   // a<<5
-{ .mmi;        and     tmp1=$d,$b
-       add     tmp4=$e,$K_40_59        };;
-{ .mmi;        or      tmp0=tmp0,tmp1              // (b&c)|(b&d)
-       add     $f=$f,tmp4                  // f+=e+K_40_59
+{ .mib;        and     tmp1=$c,$d
+       xor     tmp0=$c,$d              };;
+{ .mmi;        add     $e=$e,$X[$i%16]             // e+=Xupdate
+       add     tmp5=tmp5,tmp1              // a<<5+(c&d)
        extr.u  tmp1=$a,27,5            }   // a>>27
-{ .mmi;        and     tmp4=$c,$d
-       xor     tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]  // +1
-       xor     tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
-       };;
-{ .mmi;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
-       xor     tmp2=tmp2,tmp3              // +1
+{ .mmi;        and     tmp0=tmp0,$b
+       xor     $Xn=$Xn,$X[($j+2)%16]       // forward Xupdate
+       xor     tmp3=$X[($j+8)%16],$X[($j+13)%16] };;   // forward Xupdate
+{ .mmi;        add     $e=$e,tmp0                  // e+=b&(c^d)
+       add     tmp5=tmp5,tmp1              // ROTATE(a,5)+(c&d)
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
-{ .mmi;        or      tmp0=tmp0,tmp4              // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
+{ .mmi;        xor     $Xn=$Xn,tmp3
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp0                  // f+=F_40_59(b,c,d)
-       shrp    $e=tmp2,tmp2,31;;           // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-       add     $f=$f,tmp1              };; // f+=ROTATE(a,5)
+{ .mii;        add     $e=$e,tmp5                  // e+=ROTATE(a,5)+(c&d)
+       shrp    $Xn=$Xn,$Xn,31              // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+       nop.i   0x0                     };;
 
 ___
 }
@@ -237,7 +235,7 @@ inp=r33;    // in1
 .align 32
 sha1_block_data_order:
        .prologue
-{ .mmi;        alloc   tmp1=ar.pfs,3,15,0,0
+{ .mmi;        alloc   tmp1=ar.pfs,3,14,0,0
        $ADDP   tmp0=4,ctx
        .save   ar.lc,r3
        mov     r3=ar.lc                }
@@ -245,8 +243,8 @@ sha1_block_data_order:
        $ADDP   inp=0,inp
        mov     r2=pr                   };;
 tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
+tmp5=loc12;
+tmp6=loc13;
        .body
 { .mlx;        ld4     $h0=[ctx],8
        movl    $K_00_19=0x5a827999     }
@@ -273,7 +271,7 @@ tmp6=loc14;
 
 ___
 
-{ my $i,@V=($A,$B,$C,$D,$E,$T);
+{ my $i,@V=($A,$B,$C,$D,$E);
 
        for($i=0;$i<16;$i++)    { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
        for(;$i<20;$i++)        { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +279,12 @@ ___
        for(;$i<60;$i++)        { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
        for(;$i<80;$i++)        { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
 
-       (($V[5] eq $D) and ($V[0] eq $E)) or die;       # double-check
+       (($V[0] eq $A) and ($V[4] eq $E)) or die;       # double-check
 }
 
 $code.=<<___;
-{ .mmb;        add     $h0=$h0,$E
-       nop.m   0
+{ .mmb;        add     $h0=$h0,$A
+       add     $h2=$h2,$C
        br.ctop.dptk.many       .Ldtop  };;
 .Ldend:
 { .mmi;        add     tmp0=4,ctx
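
A note on the register shuffle above: with $T gone, each round updates $e in place instead of writing a fresh $f, so @V now cycles five names instead of six. That is also why the double-check changes: 80 applications of unshift(@V,pop(@V)) bring a 5-element list back to its start (80 % 5 == 0), while the old 6-element list ended up shifted by two, hence the old ($V[5] eq $D) and ($V[0] eq $E). A sketch of the new invariant:

	# unshift(@V, pop(@V)) rotates the five register names right by one;
	# after 80 rounds @V is back where it started, as the die asserts.
	my @V = qw(A B C D E);
	unshift(@V, pop(@V)) for 1 .. 80;
	die "rotation broken" unless $V[0] eq 'A' && $V[4] eq 'E';
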
diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
new file mode 100644 (file)
index 0000000..a2f7aad
--- /dev/null
@@ -0,0 +1,281 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for MIPS.
+
+# Performance improvement is 30% on unaligned input. The "secret" is
+# to deploy the lwl/lwr pair to load unaligned input. One could have
+# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
+# compatible subroutine. There is room for minor optimization on
+# little-endian platforms...
+#
+# The code is somewhat IRIX-centric, i.e. it is likely to require minor
+# adaptations for other OSes...
+
+for (@ARGV) {   $big_endian=1 if (/\-DB_ENDIAN/);
+                $big_endian=0 if (/\-DL_ENDIAN/);   }
+if (!defined($big_endian))
+            {   $big_endian=(unpack('L',pack('N',1))==1);   }
+
+# offsets of the Most and Least Significant Bytes
+$MSB=$big_endian?0:3;
+$LSB=3&~$MSB;
+
+@X=(   "\$8",  "\$9",  "\$10", "\$11", "\$12", "\$13", "\$14", "\$15",
+       "\$16", "\$17", "\$18", "\$19", "\$20", "\$21", "\$22", "\$23");
+$ctx="\$4";    # a0
+$inp="\$5";    # a1
+$num="\$6";    # a2
+$A="\$1";
+$B="\$2";
+$C="\$3";
+$D="\$7";
+$E="\$24";     @V=($A,$B,$C,$D,$E);
+$t0="\$25";    # jp,t9
+$t1="\$28";    # gp
+$t2="\$30";    # fp,s8
+$K="\$31";     # ra
+
+$FRAMESIZE=16;
+
+sub BODY_00_14 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___   if (!$big_endian);
+       srl     $t0,@X[$i],24   # byte swap($i)
+       srl     $t1,@X[$i],8
+       andi    $t2,@X[$i],0xFF00
+       sll     @X[$i],@X[$i],24
+       andi    $t1,0xFF00
+       sll     $t2,$t2,8
+       or      @X[$i],$t0
+       or      @X[$i],$t1
+       or      @X[$i],$t2
+___
+$code.=<<___;
+        lwl    @X[$j],$j*4+$MSB($inp)
+       sll     $t0,$a,5        # $i
+       addu    $e,$K
+        lwr    @X[$j],$j*4+$LSB($inp)
+       srl     $t1,$a,27
+       addu    $e,$t0
+       xor     $t0,$c,$d
+       addu    $e,$t1
+       sll     $t2,$b,30
+       and     $t0,$b
+       srl     $b,$b,2
+       xor     $t0,$d
+       addu    $e,@X[$i]
+       or      $b,$t2
+       addu    $e,$t0
+___
+}
+
+sub BODY_15_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___   if (!$big_endian && $i==15);
+       srl     $t0,@X[$i],24   # byte swap($i)
+       srl     $t1,@X[$i],8
+       andi    $t2,@X[$i],0xFF00
+       sll     @X[$i],@X[$i],24
+       andi    $t1,0xFF00
+       sll     $t2,$t2,8
+       or      @X[$i],$t0
+       or      @X[$i],$t1
+       or      @X[$i],$t2
+___
+$code.=<<___;
+        xor    @X[$j%16],@X[($j+2)%16]
+       sll     $t0,$a,5        # $i
+       addu    $e,$K
+       srl     $t1,$a,27
+       addu    $e,$t0
+        xor    @X[$j%16],@X[($j+8)%16]
+       xor     $t0,$c,$d
+       addu    $e,$t1
+        xor    @X[$j%16],@X[($j+13)%16]
+       sll     $t2,$b,30
+       and     $t0,$b
+        srl    $t1,@X[$j%16],31
+        addu   @X[$j%16],@X[$j%16]
+       srl     $b,$b,2
+       xor     $t0,$d
+        or     @X[$j%16],$t1
+       addu    $e,@X[$i%16]
+       or      $b,$t2
+       addu    $e,$t0
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+        xor    @X[$j%16],@X[($j+2)%16]
+       sll     $t0,$a,5        # $i
+       addu    $e,$K
+       srl     $t1,$a,27
+       addu    $e,$t0
+        xor    @X[$j%16],@X[($j+8)%16]
+       xor     $t0,$c,$d
+       addu    $e,$t1
+        xor    @X[$j%16],@X[($j+13)%16]
+       sll     $t2,$b,30
+       xor     $t0,$b
+        srl    $t1,@X[$j%16],31
+        addu   @X[$j%16],@X[$j%16]
+       srl     $b,$b,2
+       addu    $e,@X[$i%16]
+        or     @X[$j%16],$t1
+       or      $b,$t2
+       addu    $e,$t0
+___
+$code.=<<___ if ($i==79);
+        lw     @X[0],0($ctx)
+       sll     $t0,$a,5        # $i
+       addu    $e,$K
+        lw     @X[1],4($ctx)
+       srl     $t1,$a,27
+       addu    $e,$t0
+        lw     @X[2],8($ctx)
+       xor     $t0,$c,$d
+       addu    $e,$t1
+        lw     @X[3],12($ctx)
+       sll     $t2,$b,30
+       xor     $t0,$b
+        lw     @X[4],16($ctx)
+       srl     $b,$b,2
+       addu    $e,@X[$i%16]
+       or      $b,$t2
+       addu    $e,$t0
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+        xor    @X[$j%16],@X[($j+2)%16]
+       sll     $t0,$a,5        # $i
+       addu    $e,$K
+       srl     $t1,$a,27
+       addu    $e,$t0
+        xor    @X[$j%16],@X[($j+8)%16]
+       and     $t0,$c,$d
+       addu    $e,$t1
+        xor    @X[$j%16],@X[($j+13)%16]
+       sll     $t2,$b,30
+       addu    $e,$t0
+        srl    $t1,@X[$j%16],31
+       xor     $t0,$c,$d
+        addu   @X[$j%16],@X[$j%16]
+       and     $t0,$b
+       srl     $b,$b,2
+        or     @X[$j%16],$t1
+       addu    $e,@X[$i%16]
+       or      $b,$t2
+       addu    $e,$t0
+___
+}
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set   noat
+.set   noreorder
+.align 5
+.globl sha1_block_data_order
+.ent   sha1_block_data_order
+sha1_block_data_order:
+       .frame  sp,$FRAMESIZE*SZREG,zero
+       .mask   0xd0ff0000,-$FRAMESIZE*SZREG
+       .set    noreorder
+       PTR_SUB sp,$FRAMESIZE*SZREG
+       REG_S   \$31,($FRAMESIZE-1)*SZREG(sp)
+       REG_S   \$30,($FRAMESIZE-2)*SZREG(sp)
+       REG_S   \$28,($FRAMESIZE-3)*SZREG(sp)
+       REG_S   \$23,($FRAMESIZE-4)*SZREG(sp)
+       REG_S   \$22,($FRAMESIZE-5)*SZREG(sp)
+       REG_S   \$21,($FRAMESIZE-6)*SZREG(sp)
+       REG_S   \$20,($FRAMESIZE-7)*SZREG(sp)
+       REG_S   \$19,($FRAMESIZE-8)*SZREG(sp)
+       REG_S   \$18,($FRAMESIZE-9)*SZREG(sp)
+       REG_S   \$17,($FRAMESIZE-10)*SZREG(sp)
+       REG_S   \$16,($FRAMESIZE-11)*SZREG(sp)
+
+       lw      $A,0($ctx)
+       lw      $B,4($ctx)
+       lw      $C,8($ctx)
+       lw      $D,12($ctx)
+       b       .Loop
+       lw      $E,16($ctx)
+.align 4
+.Loop:
+       .set    reorder
+       lwl     @X[0],$MSB($inp)
+       lui     $K,0x5a82
+       lwr     @X[0],$LSB($inp)
+       ori     $K,0x7999       # K_00_19
+___
+for ($i=0;$i<15;$i++)  { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++)      { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       lui     $K,0x6ed9
+       ori     $K,0xeba1       # K_20_39
+___
+for (;$i<40;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       lui     $K,0x8f1b
+       ori     $K,0xbcdc       # K_40_59
+___
+for (;$i<60;$i++)      { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       lui     $K,0xca62
+       ori     $K,0xc1d6       # K_60_79
+___
+for (;$i<80;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       addu    $A,$X[0]
+       addu    $B,$X[1]
+       sw      $A,0($ctx)
+       addu    $C,$X[2]
+       addu    $D,$X[3]
+       sw      $B,4($ctx)
+       addu    $E,$X[4]
+       PTR_SUB $num,1
+       sw      $C,8($ctx)
+       sw      $D,12($ctx)
+       sw      $E,16($ctx)
+       .set    noreorder
+       bnez    $num,.Loop
+       PTR_ADD $inp,64
+
+       .set    noreorder
+       REG_L   \$31,($FRAMESIZE-1)*SZREG(sp)
+       REG_L   \$30,($FRAMESIZE-2)*SZREG(sp)
+       REG_L   \$28,($FRAMESIZE-3)*SZREG(sp)
+       REG_L   \$23,($FRAMESIZE-4)*SZREG(sp)
+       REG_L   \$22,($FRAMESIZE-5)*SZREG(sp)
+       REG_L   \$21,($FRAMESIZE-6)*SZREG(sp)
+       REG_L   \$20,($FRAMESIZE-7)*SZREG(sp)
+       REG_L   \$19,($FRAMESIZE-8)*SZREG(sp)
+       REG_L   \$18,($FRAMESIZE-9)*SZREG(sp)
+       REG_L   \$17,($FRAMESIZE-10)*SZREG(sp)
+       REG_L   \$16,($FRAMESIZE-11)*SZREG(sp)
+       jr      ra
+       PTR_ADD sp,$FRAMESIZE*SZREG
+.end   sha1_block_data_order
+___
+print $code;
+close STDOUT;
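
For reference, the byte-swap sequences emitted for little-endian targets above are a shift-and-mask bswap32; no dedicated swap instruction is assumed. An equivalent Perl sketch, with the corresponding instructions noted (the sub name is illustrative):

	# Shift-and-mask byte swap, as emitted by BODY_00_14/BODY_15_19:
	sub bswap32 {
	    my $v = shift;
	    return (($v >> 24) & 0x000000ff)   # srl  $t0,@X[$i],24
	         | (($v >>  8) & 0x0000ff00)   # srl  $t1,@X[$i],8; andi $t1,0xFF00
	         | (($v & 0x0000ff00) <<  8)   # andi $t2,@X[$i],0xFF00; sll $t2,8
	         | (($v << 24) & 0xff000000);  # sll  @X[$i],24, then or the parts
	}
	printf "%08x\n", bswap32(0x11223344);  # prints 44332211
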
diff --git a/crypto/sha/asm/sha1-parisc.pl b/crypto/sha/asm/sha1-parisc.pl
new file mode 100644 (file)
index 0000000..6d7bf49
--- /dev/null
@@ -0,0 +1,259 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for PA-RISC.
+
+# June 2009.
+#
+# On PA-7100LC performance is >30% better than gcc 3.2 generated code
+# for aligned input and >50% better for unaligned. Compared to the
+# vendor compiler on PA-8600 it's almost 60% faster in the 64-bit
+# build and just a few percent faster in the 32-bit one (this is for
+# aligned input; data for unaligned input is not available).
+#
+# Special thanks to polarhome.com for providing an HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+       $LEVEL          ="2.0W";
+       $SIZE_T         =8;
+       $FRAME_MARKER   =80;
+       $SAVED_RP       =16;
+       $PUSH           ="std";
+       $PUSHMA         ="std,ma";
+       $POP            ="ldd";
+       $POPMB          ="ldd,mb";
+} else {
+       $LEVEL          ="1.0";
+       $SIZE_T         =4;
+       $FRAME_MARKER   =48;
+       $SAVED_RP       =20;
+       $PUSH           ="stw";
+       $PUSHMA         ="stwm";
+       $POP            ="ldw";
+       $POPMB          ="ldwm";
+}
+
+$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
+                               #                 [+ argument transfer]
+$ctx="%r26";           # arg0
+$inp="%r25";           # arg1
+$num="%r24";           # arg2
+
+$t0="%r28";
+$t1="%r29";
+$K="%r31";
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
+
+@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<15);
+       addl    $K,$e,$e        ; $i
+       shd     $a,$a,27,$t1
+       addl    @X[$i],$e,$e
+       and     $c,$b,$t0
+       addl    $t1,$e,$e
+       andcm   $d,$b,$t1
+       shd     $b,$b,2,$b
+       or      $t1,$t0,$t0
+       addl    $t0,$e,$e
+___
+$code.=<<___ if ($i>=15);      # with forward Xupdate
+       addl    $K,$e,$e        ; $i
+       shd     $a,$a,27,$t1
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+       addl    @X[$i%16],$e,$e
+       and     $c,$b,$t0
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+       addl    $t1,$e,$e
+       andcm   $d,$b,$t1
+       shd     $b,$b,2,$b
+       or      $t1,$t0,$t0
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+       add     $t0,$e,$e
+       shd     @X[$j%16],@X[$j%16],31,@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]       ; $i
+       addl    $K,$e,$e
+       shd     $a,$a,27,$t1
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+       addl    @X[$i%16],$e,$e
+       xor     $b,$c,$t0
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+       addl    $t1,$e,$e
+       shd     $b,$b,2,$b
+       xor     $d,$t0,$t0
+       shd     @X[$j%16],@X[$j%16],31,@X[$j%16]
+       addl    $t0,$e,$e
+___
+$code.=<<___ if ($i==79);      # with context load
+       ldw     0($ctx),@X[0]   ; $i
+       addl    $K,$e,$e
+       shd     $a,$a,27,$t1
+       ldw     4($ctx),@X[1]
+       addl    @X[$i%16],$e,$e
+       xor     $b,$c,$t0
+       ldw     8($ctx),@X[2]
+       addl    $t1,$e,$e
+       shd     $b,$b,2,$b
+       xor     $d,$t0,$t0
+       ldw     12($ctx),@X[3]
+       addl    $t0,$e,$e
+       ldw     16($ctx),@X[4]
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+       shd     $a,$a,27,$t1    ; $i
+       addl    $K,$e,$e
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+       xor     $d,$c,$t0
+       addl    @X[$i%16],$e,$e
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+       and     $b,$t0,$t0
+       addl    $t1,$e,$e
+       shd     $b,$b,2,$b
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+       addl    $t0,$e,$e
+       and     $d,$c,$t1
+       shd     @X[$j%16],@X[$j%16],31,@X[$j%16]
+       addl    $t1,$e,$e
+___
+}
+
+$code=<<___;
+       .LEVEL  $LEVEL
+       .SPACE  \$TEXT\$
+       .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+       .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+sha1_block_data_order
+       .PROC
+       .CALLINFO       FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
+       .ENTRY
+       $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
+       $PUSHMA %r3,$FRAME(%sp)
+       $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
+       $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
+       $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
+       $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
+       $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
+       $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
+       $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
+       $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
+       $PUSH   %r12,`-$FRAME+9*$SIZE_T`(%sp)
+       $PUSH   %r13,`-$FRAME+10*$SIZE_T`(%sp)
+       $PUSH   %r14,`-$FRAME+11*$SIZE_T`(%sp)
+       $PUSH   %r15,`-$FRAME+12*$SIZE_T`(%sp)
+       $PUSH   %r16,`-$FRAME+13*$SIZE_T`(%sp)
+
+       ldw     0($ctx),$A
+       ldw     4($ctx),$B
+       ldw     8($ctx),$C
+       ldw     12($ctx),$D
+       ldw     16($ctx),$E
+
+       extru   $inp,31,2,$t0           ; t0=inp&3;
+       sh3addl $t0,%r0,$t0             ; t0*=8;
+       subi    32,$t0,$t0              ; t0=32-t0;
+       mtctl   $t0,%cr11               ; %sar=t0;
+
+L\$oop
+       ldi     3,$t0
+       andcm   $inp,$t0,$t0            ; 64-bit neutral
+___
+       for ($i=0;$i<15;$i++) {         # load input block
+       $code.="\tldw   `4*$i`($t0),@X[$i]\n";          }
+$code.=<<___;
+       cmpb,*= $inp,$t0,L\$aligned
+       ldw     60($t0),@X[15]
+       ldw     64($t0),@X[16]
+___
+       for ($i=0;$i<16;$i++) {         # align input
+       $code.="\tvshd  @X[$i],@X[$i+1],@X[$i]\n";      }
+$code.=<<___;
+L\$aligned
+       ldil    L'0x5a827000,$K         ; K_00_19
+       ldo     0x999($K),$K
+___
+for ($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       ldil    L'0x6ed9e000,$K         ; K_20_39
+       ldo     0xba1($K),$K
+___
+
+for (;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       ldil    L'0x8f1bb000,$K         ; K_40_59
+       ldo     0xcdc($K),$K
+___
+
+for (;$i<60;$i++)       { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       ldil    L'0xca62c000,$K         ; K_60_79
+       ldo     0x1d6($K),$K
+___
+for (;$i<80;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       addl    @X[0],$A,$A
+       addl    @X[1],$B,$B
+       addl    @X[2],$C,$C
+       addl    @X[3],$D,$D
+       addl    @X[4],$E,$E
+       stw     $A,0($ctx)
+       stw     $B,4($ctx)
+       stw     $C,8($ctx)
+       stw     $D,12($ctx)
+       stw     $E,16($ctx)
+       addib,*<> -1,$num,L\$oop
+       ldo     64($inp),$inp
+
+       $POP    `-$FRAME-$SAVED_RP`(%sp),%r2    ; standard epilogue
+       $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
+       $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
+       $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
+       $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
+       $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
+       $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
+       $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
+       $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
+       $POP    `-$FRAME+9*$SIZE_T`(%sp),%r12
+       $POP    `-$FRAME+10*$SIZE_T`(%sp),%r13
+       $POP    `-$FRAME+11*$SIZE_T`(%sp),%r14
+       $POP    `-$FRAME+12*$SIZE_T`(%sp),%r15
+       $POP    `-$FRAME+13*$SIZE_T`(%sp),%r16
+       bv      (%r2)
+       .EXIT
+       $POPMB  -$FRAME(%sp),%r3
+       .PROCEND
+       .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+print $code;
+close STDOUT;
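
One detail shared by these modules is how the 32-bit round constants are built from immediate pairs: MIPS lui/ori is a plain 16/16 split, PA-RISC ldil/ldo splits 21/11 (ldil L'0x6ed9e000 / ldo 0xba1 above), while Alpha's lda sign-extends its 16-bit immediate, so the ldah half is biased up by one whenever the low half has bit 15 set; that is where the "ldah 28378 / lda -5215" pair for K_20_39 in sha1-alpha.pl comes from. A worked check in Perl:

	my $K = 0x6ed9eba1;                          # K_20_39
	my ($hi, $lo) = ($K >> 16, $K & 0xffff);     # MIPS: lui 0x6ed9 / ori 0xeba1
	($hi, $lo) = ($hi + 1, $lo - 0x10000) if $lo >= 0x8000; # lda sign-extends
	printf "ldah %d, lda %d\n", $hi, $lo;        # ldah 28378, lda -5215
	printf "%x\n", ($hi << 16) + $lo;            # 6ed9eba1
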