crypto/sha/asm/sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. Rights for redistribution and usage in source and binary
   6 # forms are granted according to the OpenSSL license.
   7 # ====================================================================
   8 #
   9 # sha1_block procedure for x86_64.
  10 #
  11 # It was brought to my attention that on EM64T compiler-generated code
  12 # was far behind 32-bit assembler implementation. This is unlike on
  13 # Opteron where compiler-generated code was only 15% behind 32-bit
  14 # assembler, which originally made it hard to motivate the effort.
  15 # There was a suggestion to mechanically translate 32-bit code, but I
  16 # dismissed it, reasoning that x86_64 offers enough register bank
  17 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  18 # implementation:-) However! While 64-bit code does perform better
  19 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  20 # x86_64 does offer larger *addressable* bank, but out-of-order core
  21 # reaches for even more registers through dynamic aliasing, and EM64T
  22 # core must have managed to run-time optimize even 32-bit code just as
  23 # well as 64-bit one. Performance improvement is summarized in the
  24 # following table:
  25 #
  26 #               gcc 3.4         32-bit asm      cycles/byte
  27 # Opteron       +45%            +20%            6.8
  28 # Xeon          +65%            +0%             9.9
  29
  30 $output=shift;
  31 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
  32
  33 $ctx="%rdi";    # 1st arg
  34 $inp="%rsi";    # 2nd arg
  35 $num="%rdx";    # 3rd arg
  36
  37 # reassign arguments in order to produce more compact code
  38 $ctx="%r8";
  39 $inp="%r9";
  40 $num="%r10";
  41
  42 $xi="%eax";
  43 $t0="%ebx";
  44 $t1="%ecx";
  45 $A="%edx";
  46 $B="%esi";
  47 $C="%edi";
  48 $D="%ebp";
  49 $E="%r11d";
  50 $T="%r12d";
  51
  52 @V=($A,$B,$C,$D,$E,$T);
  53
  54 sub PROLOGUE {
  55 my $func=shift;
  56 $code.=<<___;
  57 .globl  $func
  58 .type   $func,\@function,3
  59 .align  16
  60 $func:
  61         push    %rbx
  62         push    %rbp
  63         push    %r12
  64         mov     %rsp,%rax
  65         mov     %rdi,$ctx       # reassigned argument
  66         sub     \$`8+16*4`,%rsp
  67         mov     %rsi,$inp       # reassigned argument
  68         and     \$-64,%rsp
  69         mov     %rdx,$num       # reassigned argument
  70         mov     %rax,`16*4`(%rsp)
  71
  72         mov     0($ctx),$A
  73         mov     4($ctx),$B
  74         mov     8($ctx),$C
  75         mov     12($ctx),$D
  76         mov     16($ctx),$E
  77 ___
  78 }
  79
  80 sub EPILOGUE {
  81 my $func=shift;
  82 $code.=<<___;
  83         mov     `16*4`(%rsp),%rsp
  84         pop     %r12
  85         pop     %rbp
  86         pop     %rbx
  87         ret
  88 .size   $func,.-$func
  89 ___
  90 }
  91
  92 sub BODY_00_19 {
  93 my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
  94 my $j=$i+1;
  95 $code.=<<___ if ($i==0);
  96         mov     `4*$i`($inp),$xi
  97         `"bswap $xi"    if(!defined($host))`
  98         mov     $xi,`4*$i`(%rsp)
  99 ___
 100 $code.=<<___ if ($i<15);
 101         lea     0x5a827999($xi,$e),$f
 102         mov     $c,$t0
 103         mov     `4*$j`($inp),$xi
 104         mov     $a,$e
 105         xor     $d,$t0
 106         `"bswap $xi"    if(!defined($host))`
 107         rol     \$5,$e
 108         and     $b,$t0
 109         mov     $xi,`4*$j`(%rsp)
 110         add     $e,$f
 111         xor     $d,$t0
 112         rol     \$30,$b
 113         add     $t0,$f
 114 ___
 115 $code.=".Lshortcut:\n" if ($i==15);
 116 $code.=<<___ if ($i>=15);
 117         lea     0x5a827999($xi,$e),$f
 118         mov     `4*($j%16)`(%rsp),$xi
 119         mov     $c,$t0
 120         mov     $a,$e
 121         xor     `4*(($j+2)%16)`(%rsp),$xi
 122         xor     $d,$t0
 123         rol     \$5,$e
 124         xor     `4*(($j+8)%16)`(%rsp),$xi
 125         and     $b,$t0
 126         add     $e,$f
 127         xor     `4*(($j+13)%16)`(%rsp),$xi
 128         xor     $d,$t0
 129         rol     \$30,$b
 130         add     $t0,$f
 131         rol     \$1,$xi
 132         mov     $xi,`4*($j%16)`(%rsp)
 133 ___
 134 }
 135
 136 sub BODY_20_39 {
 137 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 138 my $j=$i+1;
 139 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
 140 $code.=<<___ if ($i<79);
 141         lea     $K($xi,$e),$f
 142         mov     `4*($j%16)`(%rsp),$xi
 143         mov     $c,$t0
 144         mov     $a,$e
 145         xor     `4*(($j+2)%16)`(%rsp),$xi
 146         xor     $b,$t0
 147         rol     \$5,$e
 148         xor     `4*(($j+8)%16)`(%rsp),$xi
 149         xor     $d,$t0
 150         add     $e,$f
 151         xor     `4*(($j+13)%16)`(%rsp),$xi
 152         rol     \$30,$b
 153         add     $t0,$f
 154         rol     \$1,$xi
 155         mov     $xi,`4*($j%16)`(%rsp)
 156 ___
 157 $code.=<<___ if ($i==79);
 158         lea     $K($xi,$e),$f
 159         mov     $c,$t0
 160         mov     $a,$e
 161         xor     $b,$t0
 162         rol     \$5,$e
 163         xor     $d,$t0
 164         add     $e,$f
 165         rol     \$30,$b
 166         add     $t0,$f
 167 ___
 168 }
 169
 170 sub BODY_40_59 {
 171 my ($i,$a,$b,$c,$d,$e,$f)=@_;
 172 my $j=$i+1;
 173 $code.=<<___;
 174         lea     0x8f1bbcdc($xi,$e),$f
 175         mov     `4*($j%16)`(%rsp),$xi
 176         mov     $b,$t0
 177         mov     $b,$t1
 178         xor     `4*(($j+2)%16)`(%rsp),$xi
 179         mov     $a,$e
 180         and     $c,$t0
 181         xor     `4*(($j+8)%16)`(%rsp),$xi
 182         or      $c,$t1
 183         rol     \$5,$e
 184         xor     `4*(($j+13)%16)`(%rsp),$xi
 185         and     $d,$t1
 186         add     $e,$f
 187         rol     \$1,$xi
 188         or      $t1,$t0
 189         rol     \$30,$b
 190         mov     $xi,`4*($j%16)`(%rsp)
 191         add     $t0,$f
 192 ___
 193 }
 194
 195 $code=".text\n";
 196
 197 &PROLOGUE("sha1_block_asm_data_order");
 198 $code.=".align  4\n.Lloop:\n";
 199 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 200 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 201 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 202 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 203 $code.=<<___;
 204         add     0($ctx),$E
 205         add     4($ctx),$T
 206         add     8($ctx),$A
 207         add     12($ctx),$B
 208         add     16($ctx),$C
 209         mov     $E,0($ctx)
 210         mov     $T,4($ctx)
 211         mov     $A,8($ctx)
 212         mov     $B,12($ctx)
 213         mov     $C,16($ctx)
 214
 215         xchg    $E,$A   # mov   $E,$A
 216         xchg    $T,$B   # mov   $T,$B
 217         xchg    $E,$C   # mov   $A,$C
 218         xchg    $T,$D   # mov   $B,$D
 219                         # mov   $C,$E
 220         lea     `16*4`($inp),$inp
 221         sub     \$1,$num
 222         jnz     .Lloop
 223 ___
 224 &EPILOGUE("sha1_block_asm_data_order");
 225
 226 ####################################################################
 227
 228 @V=($A,$B,$C,$D,$E,$T);
 229
 230 &PROLOGUE("sha1_block_asm_host_order");
 231 for($i=0;$i<15;$i++)    { &BODY_00_19($i,@V,1); unshift(@V,pop(@V)); }
 232 $code.=<<___;
 233         jmp     .Lshortcut
 234 .size   sha1_block_asm_host_order,.-sha1_block_asm_host_order
 235 ___
 236
 237 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 238 print $code;
 239 close STDOUT;