crypto/sha/asm/sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # sha1_block procedure for x86_64.
  11 #
  12 # It was brought to my attention that on EM64T compiler-generated code
  13 # was far behind 32-bit assembler implementation. This is unlike on
  14 # Opteron where compiler-generated code was only 15% behind 32-bit
  15 # assembler, which originally made it hard to motivate the effort.
  16 # There was a suggestion to mechanically translate 32-bit code, but I
  17 # dismissed it, reasoning that x86_64 offers enough register bank
  18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  19 # implementation:-) However! While 64-bit code does perform better
  20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  21 # x86_64 does offer larger *addressable* bank, but out-of-order core
  22 # reaches for even more registers through dynamic aliasing, and EM64T
  23 # core must have managed to run-time optimize even 32-bit code just as
  24 # good as 64-bit one. Performance improvement is summarized in the
  25 # following table:
  26 #
  27 #               gcc 3.4         32-bit asm      cycles/byte
  28 # Opteron       +45%            +20%            6.8
  29 # Xeon P4       +65%            +0%             9.9
  30 # Core2         +60%            +10%            8.8
  31
  # Command line: the single (optional) argument is interpolated into the
  # command line of the perlasm translator started below.
  $output=shift;

  # Directory part of this script's own path, trailing separator included.
  # Fall back to an empty prefix when $0 carries no directory component
  # instead of reading $1 after a failed match.
  $dir=($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # Look for x86_64-xlate.pl next to this script first, then under
  # ../../perlasm/ relative to it.
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # From here on everything printed to STDOUT is piped into the translator.
  # The open is checked so that a failure to start the pipe aborts the run
  # instead of going unnoticed.
  open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";
  40
  # Register allocation for the generated code.
  #
  # The three arguments arrive in %rdi (1st), %rsi (2nd) and %rdx (3rd), but
  # they are reassigned to %r8/%r9/%r10 in order to produce more compact code:
  # the state variables below live in %edx/%esi/%edi, i.e. in the low halves
  # of the incoming argument registers, so PROLOGUE has to move the arguments
  # out of the way before loading the state.
  ($ctx,$inp,$num)=("%r8","%r9","%r10");

  # Current message word and two scratch registers.
  ($xi,$t0,$t1)=("%eax","%ebx","%ecx");

  # Five working variables plus one spare register that receives the
  # result of each round.
  ($A,$B,$C,$D,$E,$T)=("%edx","%esi","%edi","%ebp","%r11d","%r12d");

  # Register roles as handed to the round generators; the list is rotated by
  # the generator loop after every round.
  @V=($A,$B,$C,$D,$E,$T);
  61
  # PROLOGUE(func) - append the entry sequence of routine func to $code.
  #
  # The emitted code:
  #   * declares func as a global three-argument function, 16-byte aligned;
  #   * saves %rbx, %rbp and %r12 and remembers the resulting %rsp in %rax;
  #   * copies the arguments from %rdi/%rsi/%rdx into $ctx/$inp/$num;
  #   * reserves 8+16*4 bytes of stack, aligns %rsp down to 64 bytes and
  #     stores the remembered %rsp just above the 16*4-byte scratch area;
  #   * loads the five 32-bit words at offsets 0..16 of $ctx into $A..$E.
  sub PROLOGUE {
      my ($func)=@_;

      # Backtick-delimited arithmetic is left symbolic here; it is evaluated
      # in a single pass over $code at the end of the script.
      my $entry=<<___;
.globl  $func
.type   $func,\@function,3
.align  16
$func:
        push    %rbx
        push    %rbp
        push    %r12
        mov     %rsp,%rax
        mov     %rdi,$ctx       # reassigned argument
        sub     \$`8+16*4`,%rsp
        mov     %rsi,$inp       # reassigned argument
        and     \$-64,%rsp
        mov     %rdx,$num       # reassigned argument
        mov     %rax,`16*4`(%rsp)

        mov     0($ctx),$A
        mov     4($ctx),$B
        mov     8($ctx),$C
        mov     12($ctx),$D
        mov     16($ctx),$E
___
      $code.=$entry;
  }
  87
  # EPILOGUE(func) - append the exit sequence of routine func to $code.
  #
  # The emitted code reloads %rsp from the slot just above the 16*4-byte
  # scratch area, pops %r12, %rbp and %rbx, returns, and closes the symbol
  # with a .size directive.
  sub EPILOGUE {
      my ($func)=@_;

      my $exit=<<___;
        mov     `16*4`(%rsp),%rsp
        pop     %r12
        pop     %rbp
        pop     %rbx
        ret
.size   $func,.-$func
___
      $code.=$exit;
  }
  99
 # BODY_00_19(i, a, b, c, d, e, f [, host]) - append round i (0..19) to $code.
 #
 # a..e name the registers holding the working variables on entry; f names
 # the register that receives the round result.  The emitted code computes
 #
 #     f = 0x5a827999 + X[i] + e + rol(a,5) + (((c ^ d) & b) ^ d)
 #     b = rol(b,30)
 #
 # with X[i] expected in $xi, and leaves X[i+1] in $xi for the next round.
 # e is reused as scratch for rol(a,5) once the lea has consumed it.
 #
 # The bswap of each freshly loaded word sits inside a backtick expression
 # and is intended to be skipped when host is defined; the generator loop
 # in this file never passes a host argument.
 sub BODY_00_19 {
     my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
     my $j=$i+1;                 # index of the message word prepared for the next round

     # The very first round has no predecessor, so it loads X[0] itself.
     if ($i==0) {
         $code.=<<___;
        mov     `4*$i`($inp),$xi
        `"bswap $xi"    if(!defined($host))`
        mov     $xi,`4*$i`(%rsp)
___
     }

     if ($i<15) {
         # X[j] still comes straight from the input block; it is also stored
         # into the 16-word scratch area on the stack.
         $code.=<<___;
        lea     0x5a827999($xi,$e),$f
        mov     $c,$t0
        mov     `4*$j`($inp),$xi
        mov     $a,$e
        xor     $d,$t0
        `"bswap $xi"    if(!defined($host))`
        rol     \$5,$e
        and     $b,$t0
        mov     $xi,`4*$j`(%rsp)
        add     $e,$f
        xor     $d,$t0
        rol     \$30,$b
        add     $t0,$f
___
     } else {
         # From here on X[j] is derived from the scratch area (indices mod 16):
         # X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1), written back in place.
         $code.=<<___;
        lea     0x5a827999($xi,$e),$f
        mov     `4*($j%16)`(%rsp),$xi
        mov     $c,$t0
        mov     $a,$e
        xor     `4*(($j+2)%16)`(%rsp),$xi
        xor     $d,$t0
        rol     \$5,$e
        xor     `4*(($j+8)%16)`(%rsp),$xi
        and     $b,$t0
        add     $e,$f
        xor     `4*(($j+13)%16)`(%rsp),$xi
        xor     $d,$t0
        rol     \$30,$b
        add     $t0,$f
        rol     \$1,$xi
        mov     $xi,`4*($j%16)`(%rsp)
___
     }
 }
 142
 # BODY_20_39(i, a, b, c, d, e, f) - append round i to $code; the generator
 # loop uses it for rounds 20..39 and again for rounds 60..79.
 #
 # The emitted code computes
 #
 #     f = K + X[i] + e + rol(a,5) + (b ^ c ^ d)
 #     b = rol(b,30)
 #
 # where K is 0x6ed9eba1 for i below 40 and 0xca62c1d6 otherwise.  K is held
 # as a Perl number, so it appears in the output as a decimal displacement.
 sub BODY_20_39 {
     my ($i,$a,$b,$c,$d,$e,$f)=@_;
     my $j=$i+1;                 # index of the message word prepared for the next round
     my $K=($i<40)?0x6ed9eba1:0xca62c1d6;

     if ($i<79) {
         # Regular round: interleave the round arithmetic with the schedule
         # X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1) (indices mod 16).
         $code.=<<___;
        lea     $K($xi,$e),$f
        mov     `4*($j%16)`(%rsp),$xi
        mov     $c,$t0
        mov     $a,$e
        xor     `4*(($j+2)%16)`(%rsp),$xi
        xor     $b,$t0
        rol     \$5,$e
        xor     `4*(($j+8)%16)`(%rsp),$xi
        xor     $d,$t0
        add     $e,$f
        xor     `4*(($j+13)%16)`(%rsp),$xi
        rol     \$30,$b
        add     $t0,$f
        rol     \$1,$xi
___
         # The new word is written back to the scratch area only up to
         # round 75; after that it is just kept in $xi for the next round.
         if ($i<76) {
             $code.=<<___;
        mov     $xi,`4*($j%16)`(%rsp)
___
         }
     } elsif ($i==79) {
         # Last round: there is no further message word to prepare.
         $code.=<<___;
        lea     $K($xi,$e),$f
        mov     $c,$t0
        mov     $a,$e
        xor     $b,$t0
        rol     \$5,$e
        xor     $d,$t0
        add     $e,$f
        rol     \$30,$b
        add     $t0,$f
___
     }
 }
 178
 # BODY_40_59(i, a, b, c, d, e, f) - append round i (40..59) to $code.
 #
 # The emitted code computes
 #
 #     f = 0x8f1bbcdc + X[i] + e + rol(a,5) + ((b & c) | ((b | c) & d))
 #     b = rol(b,30)
 #
 # using both scratch registers $t0 and $t1, and at the same time prepares
 # X[i+1] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1) (j = i+1, indices
 # mod 16), which is left in $xi and written back to the scratch area.
 sub BODY_40_59 {
     my ($i,$a,$b,$c,$d,$e,$f)=@_;
     my $j=$i+1;                 # index of the message word prepared for the next round

     my $round=<<___;
        lea     0x8f1bbcdc($xi,$e),$f
        mov     `4*($j%16)`(%rsp),$xi
        mov     $b,$t0
        mov     $b,$t1
        xor     `4*(($j+2)%16)`(%rsp),$xi
        mov     $a,$e
        and     $c,$t0
        xor     `4*(($j+8)%16)`(%rsp),$xi
        or      $c,$t1
        rol     \$5,$e
        xor     `4*(($j+13)%16)`(%rsp),$xi
        and     $d,$t1
        add     $e,$f
        rol     \$1,$xi
        or      $t1,$t0
        rol     \$30,$b
        mov     $xi,`4*($j%16)`(%rsp)
        add     $t0,$f
___
     $code.=$round;
 }
 203
 # Assemble the complete module text in $code.
 $code=".text\n";

 PROLOGUE("sha1_block_data_order");
 $code.=".align  4\n.Lloop:\n";

 # Round schedule: each entry gives the first round index that is no longer
 # handled by the generator listed next to it.
 my @schedule=(
     [20,\&BODY_00_19],
     [40,\&BODY_20_39],
     [60,\&BODY_40_59],
     [80,\&BODY_20_39],
 );

 $i=0;
 foreach my $stage (@schedule) {
     my ($limit,$emit)=@$stage;
     while ($i<$limit) {
         $emit->($i,@V);
         # Rotate the register roles: the register that just received the
         # round result becomes "a" for the next round, and so on.
         unshift(@V,pop(@V));
         $i++;
     }
 }

 # After 80 single-step rotations of the six-entry list the working
 # variables a..e sit in $E,$T,$A,$B,$C.  Add them into the context, store
 # the sums, then shuffle the registers back into $A..$E for the next block
 # (the xchg sequence implements the moves given in the trailing comments).
 $code.=<<___;
        add     0($ctx),$E
        add     4($ctx),$T
        add     8($ctx),$A
        add     12($ctx),$B
        add     16($ctx),$C
        mov     $E,0($ctx)
        mov     $T,4($ctx)
        mov     $A,8($ctx)
        mov     $B,12($ctx)
        mov     $C,16($ctx)

        xchg    $E,$A   # mov   $E,$A
        xchg    $T,$B   # mov   $T,$B
        xchg    $E,$C   # mov   $A,$C
        xchg    $T,$D   # mov   $B,$D
                        # mov   $C,$E
        lea     `16*4`($inp),$inp
        sub     \$1,$num
        jnz     .Lloop
___

 EPILOGUE("sha1_block_data_order");

 # Identification string embedded in the generated module.
 $code.=<<___;
.asciz  "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
 237
 238 ####################################################################
 239
 # Replace every backtick-delimited span in $code with the result of
 # evaluating its contents as Perl (for example the span 16*4 becomes 64).
 $code =~ s/\`([^\`]*)\`/eval $1/gem;

 # STDOUT is the pipe into the translator opened at the top of the script.
 print $code;

 # Closing a piped handle waits for the child and returns false when the
 # write failed or the child exited with a non-zero status, so check it:
 # otherwise a failing translator would go unnoticed.
 close STDOUT or die "error closing STDOUT: $! (child status $?)";