crypto/chacha/asm/chacha-s390x.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # December 2015
  11 #
  12 # ChaCha20 for s390x.
  13 #
  14 # 3 times faster than compiler-generated code.
  15
  16 $flavour = shift;
  17
  18 if ($flavour =~ /3[12]/) {
  19         $SIZE_T=4;
  20         $g="";
  21 } else {
  22         $SIZE_T=8;
  23         $g="g";
  24 }
  25
  26 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  27 open STDOUT,">$output";
  28
  29 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
  30 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  31     $code .= "\t$opcode\t".join(',',@_)."\n";
  32 }
  33
  34 my $sp="%r15";
  35
  36 my $stdframe=16*$SIZE_T+4*8;
  37 my $frame=$stdframe+4*20;
  38
  39 my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
  40
  41 my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
  42 my @t=map("%r$_",(8,9));
  43
  44 sub ROUND {
  45 my ($a0,$b0,$c0,$d0)=@_;
  46 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  47 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  48 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  49 my ($xc,$xc_)=map("\"$_\"",@t);
  50 my @x=map("\"$_\"",@x);
  51
  52         # Consider order in which variables are addressed by their
  53         # index:
  54         #
  55         #       a   b   c   d
  56         #
  57         #       0   4   8  12 < even round
  58         #       1   5   9  13
  59         #       2   6  10  14
  60         #       3   7  11  15
  61         #       0   5  10  15 < odd round
  62         #       1   6  11  12
  63         #       2   7   8  13
  64         #       3   4   9  14
  65         #
  66         # 'a', 'b' and 'd's are permanently allocated in registers,
  67         # @x[0..7,12..15], while 'c's are maintained in memory. If
  68         # you observe 'c' column, you'll notice that pair of 'c's is
  69         # invariant between rounds. This means that we have to reload
  70         # them once per round, in the middle. This is why you'll see
  71         # 'c' stores and loads in the middle, but none in the beginning
  72         # or end.
  73
  74         (
  75         "&alr   (@x[$a0],@x[$b0])",     # Q1
  76          "&alr  (@x[$a1],@x[$b1])",     # Q2
  77         "&xr    (@x[$d0],@x[$a0])",
  78          "&xr   (@x[$d1],@x[$a1])",
  79         "&rll   (@x[$d0],@x[$d0],16)",
  80          "&rll  (@x[$d1],@x[$d1],16)",
  81
  82         "&alr   ($xc,@x[$d0])",
  83          "&alr  ($xc_,@x[$d1])",
  84         "&xr    (@x[$b0],$xc)",
  85          "&xr   (@x[$b1],$xc_)",
  86         "&rll   (@x[$b0],@x[$b0],12)",
  87          "&rll  (@x[$b1],@x[$b1],12)",
  88
  89         "&alr   (@x[$a0],@x[$b0])",
  90          "&alr  (@x[$a1],@x[$b1])",
  91         "&xr    (@x[$d0],@x[$a0])",
  92          "&xr   (@x[$d1],@x[$a1])",
  93         "&rll   (@x[$d0],@x[$d0],8)",
  94          "&rll  (@x[$d1],@x[$d1],8)",
  95
  96         "&alr   ($xc,@x[$d0])",
  97          "&alr  ($xc_,@x[$d1])",
  98         "&xr    (@x[$b0],$xc)",
  99          "&xr   (@x[$b1],$xc_)",
 100         "&rll   (@x[$b0],@x[$b0],7)",
 101          "&rll  (@x[$b1],@x[$b1],7)",
 102
 103         "&stm   ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
 104         "&lm    ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
 105
 106         "&alr   (@x[$a2],@x[$b2])",     # Q3
 107          "&alr  (@x[$a3],@x[$b3])",     # Q4
 108         "&xr    (@x[$d2],@x[$a2])",
 109          "&xr   (@x[$d3],@x[$a3])",
 110         "&rll   (@x[$d2],@x[$d2],16)",
 111          "&rll  (@x[$d3],@x[$d3],16)",
 112
 113         "&alr   ($xc,@x[$d2])",
 114          "&alr  ($xc_,@x[$d3])",
 115         "&xr    (@x[$b2],$xc)",
 116          "&xr   (@x[$b3],$xc_)",
 117         "&rll   (@x[$b2],@x[$b2],12)",
 118          "&rll  (@x[$b3],@x[$b3],12)",
 119
 120         "&alr   (@x[$a2],@x[$b2])",
 121          "&alr  (@x[$a3],@x[$b3])",
 122         "&xr    (@x[$d2],@x[$a2])",
 123          "&xr   (@x[$d3],@x[$a3])",
 124         "&rll   (@x[$d2],@x[$d2],8)",
 125          "&rll  (@x[$d3],@x[$d3],8)",
 126
 127         "&alr   ($xc,@x[$d2])",
 128          "&alr  ($xc_,@x[$d3])",
 129         "&xr    (@x[$b2],$xc)",
 130          "&xr   (@x[$b3],$xc_)",
 131         "&rll   (@x[$b2],@x[$b2],7)",
 132          "&rll  (@x[$b3],@x[$b3],7)"
 133         );
 134 }
 135
 136 $code.=<<___;
 137 .text
 138
 139 .globl  ChaCha20_ctr32
 140 .type   ChaCha20_ctr32,\@function
 141 .align  32
 142 ChaCha20_ctr32:
 143         a${g}hi $len,-64
 144         l${g}hi %r1,-$frame
 145         stm${g} %r6,%r15,`6*$SIZE_T`($sp)
 146         sl${g}r $out,$inp                       # difference
 147         la      $len,0($inp,$len)               # end of input minus 64
 148         larl    %r7,.Lsigma
 149         lgr     %r0,$sp
 150         la      $sp,0(%r1,$sp)
 151         st${g}  %r0,0($sp)
 152
 153         lmg     %r8,%r11,0($key)                # load key
 154         lmg     %r12,%r13,0($counter)           # load counter
 155         lmg     %r6,%r7,0(%r7)                  # load sigma constant
 156
 157         la      %r14,0($inp)
 158         st${g}  $out,$frame+3*$SIZE_T($sp)
 159         st${g}  $len,$frame+4*$SIZE_T($sp)
 160         stmg    %r6,%r13,$stdframe($sp)         # copy key schedule to stack
 161         srlg    @x[12],%r12,32                  # 32-bit counter value
 162         j       .Loop_outer
 163
 164 .align  16
 165 .Loop_outer:
 166         lm      @x[0],@x[7],$stdframe+4*0($sp)          # load x[0]-x[7]
 167         lm      @t[0],@t[1],$stdframe+4*10($sp)         # load x[10]-x[11]
 168         lm      @x[13],@x[15],$stdframe+4*13($sp)       # load x[13]-x[15]
 169         stm     @t[0],@t[1],$stdframe+4*8+4*10($sp)     # offload x[10]-x[11]
 170         lm      @t[0],@t[1],$stdframe+4*8($sp)          # load x[8]-x[9]
 171         st      @x[12],$stdframe+4*12($sp)              # save counter
 172         st${g}  %r14,$frame+2*$SIZE_T($sp)              # save input pointer
 173         lhi     %r14,10
 174         j       .Loop
 175
 176 .align  4
 177 .Loop:
 178 ___
 179         foreach (&ROUND(0, 4, 8,12)) { eval; }
 180         foreach (&ROUND(0, 5,10,15)) { eval; }
 181 $code.=<<___;
 182         brct    %r14,.Loop
 183
 184         l${g}   %r14,$frame+2*$SIZE_T($sp)              # pull input pointer
 185         stm     @t[0],@t[1],$stdframe+4*8+4*8($sp)      # offload x[8]-x[9]
 186         lm${g}  @t[0],@t[1],$frame+3*$SIZE_T($sp)
 187
 188         al      @x[0],$stdframe+4*0($sp)        # accumulate key schedule
 189         al      @x[1],$stdframe+4*1($sp)
 190         al      @x[2],$stdframe+4*2($sp)
 191         al      @x[3],$stdframe+4*3($sp)
 192         al      @x[4],$stdframe+4*4($sp)
 193         al      @x[5],$stdframe+4*5($sp)
 194         al      @x[6],$stdframe+4*6($sp)
 195         al      @x[7],$stdframe+4*7($sp)
 196         lrvr    @x[0],@x[0]
 197         lrvr    @x[1],@x[1]
 198         lrvr    @x[2],@x[2]
 199         lrvr    @x[3],@x[3]
 200         lrvr    @x[4],@x[4]
 201         lrvr    @x[5],@x[5]
 202         lrvr    @x[6],@x[6]
 203         lrvr    @x[7],@x[7]
 204         al      @x[12],$stdframe+4*12($sp)
 205         al      @x[13],$stdframe+4*13($sp)
 206         al      @x[14],$stdframe+4*14($sp)
 207         al      @x[15],$stdframe+4*15($sp)
 208         lrvr    @x[12],@x[12]
 209         lrvr    @x[13],@x[13]
 210         lrvr    @x[14],@x[14]
 211         lrvr    @x[15],@x[15]
 212
 213         la      @t[0],0(@t[0],%r14)             # reconstruct output pointer
 214         cl${g}r %r14,@t[1]
 215         jh      .Ltail
 216
 217         x       @x[0],4*0(%r14)                 # xor with input
 218         x       @x[1],4*1(%r14)
 219         st      @x[0],4*0(@t[0])                # store output
 220         x       @x[2],4*2(%r14)
 221         st      @x[1],4*1(@t[0])
 222         x       @x[3],4*3(%r14)
 223         st      @x[2],4*2(@t[0])
 224         x       @x[4],4*4(%r14)
 225         st      @x[3],4*3(@t[0])
 226          lm     @x[0],@x[3],$stdframe+4*8+4*8($sp)      # load x[8]-x[11]
 227         x       @x[5],4*5(%r14)
 228         st      @x[4],4*4(@t[0])
 229         x       @x[6],4*6(%r14)
 230          al     @x[0],$stdframe+4*8($sp)
 231         st      @x[5],4*5(@t[0])
 232         x       @x[7],4*7(%r14)
 233          al     @x[1],$stdframe+4*9($sp)
 234         st      @x[6],4*6(@t[0])
 235         x       @x[12],4*12(%r14)
 236          al     @x[2],$stdframe+4*10($sp)
 237         st      @x[7],4*7(@t[0])
 238         x       @x[13],4*13(%r14)
 239          al     @x[3],$stdframe+4*11($sp)
 240         st      @x[12],4*12(@t[0])
 241         x       @x[14],4*14(%r14)
 242         st      @x[13],4*13(@t[0])
 243         x       @x[15],4*15(%r14)
 244         st      @x[14],4*14(@t[0])
 245          lrvr   @x[0],@x[0]
 246         st      @x[15],4*15(@t[0])
 247          lrvr   @x[1],@x[1]
 248          lrvr   @x[2],@x[2]
 249          lrvr   @x[3],@x[3]
 250         lhi     @x[12],1
 251          x      @x[0],4*8(%r14)
 252         al      @x[12],$stdframe+4*12($sp)      # increment counter
 253          x      @x[1],4*9(%r14)
 254          st     @x[0],4*8(@t[0])
 255          x      @x[2],4*10(%r14)
 256          st     @x[1],4*9(@t[0])
 257          x      @x[3],4*11(%r14)
 258          st     @x[2],4*10(@t[0])
 259         la      %r14,64(%r14)
 260          st     @x[3],4*11(@t[0])
 261
 262         cl${g}r %r14,@t[1]                      # done yet?
 263         jle     .Loop_outer
 264
 265 .Ldone:
 266         xgr     %r0,%r0
 267         xgr     %r1,%r1
 268         xgr     %r2,%r2
 269         xgr     %r3,%r3
 270         stmg    %r0,%r3,$stdframe+4*4($sp)      # wipe key copy
 271         stmg    %r0,%r3,$stdframe+4*12($sp)
 272
 273         lm${g}  %r6,%r15,`$frame+6*$SIZE_T`($sp)
 274         br      %r14
 275
 276 .align  16
 277 .Ltail:
 278         la      @t[1],64($t[1])
 279         stm     @x[0],@x[7],$stdframe+4*0($sp)
 280         sl${g}r @t[1],%r14
 281         lm      @x[0],@x[3],$stdframe+4*8+4*8($sp)
 282         l${g}hi @x[6],0
 283         stm     @x[12],@x[15],$stdframe+4*12($sp)
 284         al      @x[0],$stdframe+4*8($sp)
 285         al      @x[1],$stdframe+4*9($sp)
 286         al      @x[2],$stdframe+4*10($sp)
 287         al      @x[3],$stdframe+4*11($sp)
 288         lrvr    @x[0],@x[0]
 289         lrvr    @x[1],@x[1]
 290         lrvr    @x[2],@x[2]
 291         lrvr    @x[3],@x[3]
 292         stm     @x[0],@x[3],$stdframe+4*8+4*8($sp)
 293
 294 .Loop_tail:
 295         llgc    @x[4],0(@x[6],%r14)
 296         llgc    @x[5],$stdframe(@x[6],$sp)
 297         xr      @x[5],@x[4]
 298         stc     @x[5],0(@x[6],@t[0])
 299         la      @x[6],1(@x[6])
 300         brct    @t[1],.Loop_tail
 301
 302         j       .Ldone
 303 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
 304
 305 .align  32
 306 .Lsigma:
 307 .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
 308 .asciz  "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 309 .align  4
 310 ___
 311
 312 foreach (split("\n",$code)) {
 313         s/\`([^\`]*)\`/eval $1/ge;
 314
 315         print $_,"\n";
 316 }
 317 close STDOUT;