crypto/rc4/asm/rc4-amd64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. Rights for redistribution and usage in source and binary
   6 # forms are granted according to the OpenSSL license.
   7 # ====================================================================
   8 #
   9 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
  10 # "hand-coded assembler"] doesn't stand for the whole improvement
  11 # coefficient. It turned out that eliminating RC4_CHAR from config
  12 # line results in ~40% improvement (yes, even for C implementation).
  13 # Presumably it has everything to do with AMD cache architecture and
  14 # RAW or whatever penalties. Once again! The module *requires* config
  15 # line *without* RC4_CHAR! As for coding "secret," I bet on partial
  16 # register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
  17 # I simply 'inc %r8b'. Even though the optimization manual discourages
  18 # operating on partial registers, it turned out to be the best bet.
  19 # At least for AMD... How IA32E would perform remains to be seen...
  20
  21 # As was shown by Marc Bevand, reordering a couple of load operations
  22 # results in an even higher performance gain of 3.3x:-) At least on
  23 # Opteron... For reference, 1x in this case is RC4_CHAR C-code
  24 # compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
  25 # The latter means that if you want to *estimate* what to expect from
  26 # *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
  27
  28 $output=shift;
  29
  30 $win64a=1 if ($output =~ /win64a.[s|asm]/);
  31
  32 open STDOUT,">$output" || die "can't open $output: $!";
  33
  34 if (defined($win64a)) {
  35     $dat="%rcx";        # arg1
  36     $len="%rdx";        # arg2
  37     $inp="%rsi";        # r8, arg3 moves here
  38     $out="%rdi";        # r9, arg4 moves here
  39 } else {
  40     $dat="%rdi";        # arg1
  41     $len="%rsi";        # arg2
  42     $inp="%rdx";        # arg3
  43     $out="%rcx";        # arg4
  44 }
  45
  46 $XX="%r10";
  47 $TX="%r8";
  48 $YY="%r11";
  49 $TY="%r9";
  50
  51 sub PTR() {
  52     my $ret=shift;
  53     if (defined($win64a)) {
  54         $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g;   # [%rN+%rM*4]->[%rM*4+%rN]
  55         $ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off]
  56     } else {
  57         $ret =~ s/[\+\*]/,/g;           # [%rN+%rM*4]->[%rN,%rM,4]
  58         $ret =~ s/\[([^\]]+)\]/($1)/g;  # [%rN]->(%rN)
  59     }
  60     $ret;
  61 }
  62
  63 $code=<<___ if (!defined($win64a));   # non-Win64 prologue: declare global RC4; return at once when $len is zero
  64 .text
  65
  66 .globl  RC4
  67 .type   RC4,\@function
  68 .align  16
  69 RC4:    or      $len,$len
  70         jne     .Lentry
  71         repret
  72 .Lentry:
  73 ___
  74 $code=<<___ if (defined($win64a));    # Win64 prologue: same early return, then save %rdi/%rsi, reserve 40 stack bytes, copy %r8/%r9 into $inp/$out
  75 _TEXT   SEGMENT
  76 PUBLIC  RC4
  77 ALIGN   16
  78 RC4     PROC
  79         or      $len,$len
  80         jne     .Lentry
  81         repret
  82 .Lentry:
  83         push    %rdi
  84         push    %rsi
  85         sub     \$40,%rsp
  86         mov     %r8,$inp
  87         mov     %r9,$out
  88 ___
  89 $code.=<<___;   # common body: step $dat past two 32-bit words, load them into $XX/$YY, go to .Lloop1 if $len < 8; then round 1 of .Lloop8
  90         add     \$8,$dat
  91         movl    `&PTR("DWORD:-8[$dat]")`,$XX#d
  92         movl    `&PTR("DWORD:-4[$dat]")`,$YY#d
  93         test    \$-8,$len
  94         jz      .Lloop1
  95 .align  16
  96 .Lloop8:
  97         inc     $XX#b
  98         movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
  99         add     $TX#b,$YY#b
 100         movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
 101         movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
 102         movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
 103         add     $TX#b,$TY#b
 104         inc     $XX#b
 105         movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
 106         movb    `&PTR("BYTE:[$dat+$TY*4]")`,%al
 107 ___
 108 for ($i=1;$i<=6;$i++) {   # rounds 2..7 are textually identical, so the same template is appended six times
 109 $code.=<<___;             # each round rotates %rax right by 8 bits before loading the next table byte into %al
 110         add     $TX#b,$YY#b
 111         ror     \$8,%rax
 112         movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
 113         movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
 114         movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
 115         add     $TX#b,$TY#b
 116         inc     $XX#b
 117         movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
 118         movb    `&PTR("BYTE:[$dat+$TY*4]")`,%al
 119 ___
 120 }
 121 $code.=<<___;   # round 8, then XOR the 8 bytes gathered in %rax with 8 input bytes and store them; repeat while $len >= 8; .Lexit writes $XX/$YY back
 122         add     $TX#b,$YY#b
 123         ror     \$8,%rax
 124         movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
 125         movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
 126         movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
 127         sub     \$8,$len
 128         add     $TY#b,$TX#b
 129         movb    `&PTR("BYTE:[$dat+$TX*4]")`,%al
 130         ror     \$8,%rax
 131         add     \$8,$inp
 132         add     \$8,$out
 133
 134         xor     `&PTR("QWORD:-8[$inp]")`,%rax
 135         mov     %rax,`&PTR("QWORD:-8[$out]")`
 136
 137         test    \$-8,$len
 138         jnz     .Lloop8
 139         cmp     \$0,$len
 140         jne     .Lloop1
 141 .Lexit:
 142         movl    $XX#d,`&PTR("DWORD:-8[$dat]")`
 143         movl    $YY#d,`&PTR("DWORD:-4[$dat]")`
 144 ___
 145 $code.=<<___ if (defined($win64a));   # Win64 epilogue: undo the prologue (release the 40 bytes, restore %rsi and %rdi)
 146         add     \$40,%rsp
 147         pop     %rsi
 148         pop     %rdi
 149 ___
 150 $code.=<<___;   # return; .Lloop1 then handles one byte per iteration and jumps back to .Lexit once $len reaches zero
 151         repret
 152 .align  16
 153 .Lloop1:
 154         movzb   `&PTR("BYTE:[$inp]")`,%eax
 155         inc     $XX#b
 156         movl    `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
 157         add     $TX#b,$YY#b
 158         movl    `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
 159         movl    $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
 160         movl    $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
 161         add     $TY#b,$TX#b
 162         movl    `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
 163         xor     $TY,%rax
 164         inc     $inp
 165         movb    %al,`&PTR("BYTE:[$out]")`
 166         inc     $out
 167         dec     $len
 168         jnz     .Lloop1
 169         jmp     .Lexit
 170 ___
 171 $code.=<<___ if (defined($win64a));   # Win64 only: close the procedure and the segment
 172 RC4     ENDP
 173 _TEXT   ENDS
 174 END
 175 ___
 176 $code.=<<___ if (!defined($win64a));  # non-Win64 only: emit the .size directive for RC4
 177 .size   RC4,.-RC4
 178 ___
 179
 180 $code =~ s/#([bwd])/$1/gm;
 181 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 182
 183 if (defined($win64a)) {
 184     $code =~ s/\.align/ALIGN/gm;
 185     $code =~ s/[\$%]//gm;
 186     $code =~ s/\.L/\$L/gm;
 187     $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
 188     $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
 189     $code =~ s/mov[bwlq]/mov/gm;
 190     $code =~ s/movzb/movzx/gm;
 191     $code =~ s/repret/DB\t0F3h,0C3h/gm;
 192 } else {
 193     $code =~ s/([QD]*WORD|BYTE)://gm;
 194     $code =~ s/repret/.byte\t0xF3,0xC3/gm;
 195 }
 196 print $code;