crypto/bn/asm/ppc64-mont-fixed.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 # ====================================================================
  10 # Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
  11 # <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
  12 # the OpenSSL project.
  13 # ====================================================================
  14
  15 #
  16 # Fixed length (n=6), unrolled PPC Montgomery Multiplication
  17 #
  18
  19 # 2021
  20 #
  21 # Although this is a generic implementation for unrolling Montgomery
  22 # Multiplication for arbitrary values of n, this is currently only
  23 # used for n = 6 to improve the performance of ECC p384.
  24 #
  25 # Unrolling allows intermediate results to be stored in registers,
  26 # rather than on the stack, improving performance by ~7% compared to
  27 # the existing PPC assembly code.
  28 #
  29 # The ISA 3.0 implementation uses combination multiply/add
  30 # instructions (maddld, maddhdu) to improve performance by an
  31 # additional ~10% on Power 9.
  32 #
  33 # Finally, saving non-volatile registers into volatile vector
  34 # registers instead of onto the stack saves a little more.
  35 #
  36 # On a Power 9 machine we see an overall improvement of ~18%.
  37 #
  38
  39 use strict;
  40 use warnings;
  41
  42 my ($flavour, $output, $dir, $xlate);
  43
  44 # $output is the last argument if it looks like a file (it has an extension)
  45 # $flavour is the first argument if it doesn't look like a file
  46 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  47 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  48
  49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  50 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  51 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  52 die "can't locate ppc-xlate.pl";
  53
  54 open STDOUT,"| $^X $xlate $flavour \"$output\""
  55     or die "can't call $xlate: $!";
  56
  57 if ($flavour !~ /64/) {
  58         die "bad flavour ($flavour) - only ppc64 permitted";
  59 }
  60
  61 my $SIZE_T= 8;
  62
  63 # Registers are global so the code is remotely readable
  64
  65 # Parameters for Montgomery multiplication
  66 my $ze  = "r0";
  67 my $sp  = "r1";
  68 my $toc = "r2";
  69 my $rp  = "r3";
  70 my $ap  = "r4";
  71 my $bp  = "r5";
  72 my $np  = "r6";
  73 my $n0  = "r7";
  74 my $num = "r8";
  75
  76 my $i   = "r9";
  77 my $c0  = "r10";
  78 my $bp0 = "r11";
  79 my $bpi = "r11";
  80 my $bpj = "r11";
  81 my $tj  = "r12";
  82 my $apj = "r12";
  83 my $npj = "r12";
  84 my $lo  = "r14";
  85 my $c1  = "r14";
  86
  87 # Non-volatile registers used for tp[i]
  88 #
  89 # 12 registers are available but the limit on unrolling is 10,
  90 # since registers from $tp[0] to $tp[$n+1] are used.
  91 my @tp = ("r20" .. "r31");
  92
  93 # volatile VSRs for saving non-volatile GPRs - faster than stack
  94 my @vsrs = ("v32" .. "v46");
  95
  96 package Mont;
  97
  98 sub new($$)
  99 {
 100         my ($class, $n) = @_;
 101
 102         if ($n > 10) {
 103                 die "Can't unroll for BN length ${n} (maximum 10)"
 104         }
 105
 106         my $self = {
 107                 code => "",
 108                 n => $n,
 109         };
 110         bless $self, $class;
 111
 112         return $self;
 113 }
 114
 115 sub add_code($$)
 116 {
 117         my ($self, $c) = @_;
 118
 119         $self->{code} .= $c;
 120 }
 121
 122 sub get_code($)
 123 {
 124         my ($self) = @_;
 125
 126         return $self->{code};
 127 }
 128
 129 sub get_function_name($)
 130 {
 131         my ($self) = @_;
 132
 133         return "bn_mul_mont_fixed_n" . $self->{n};
 134 }
 135
 136 sub get_label($$)
 137 {
 138         my ($self, $l) = @_;
 139
 140         return "L" . $l . "_" . $self->{n};
 141 }
 142
 143 sub get_labels($@)
 144 {
 145         my ($self, @labels) = @_;
 146
 147         my %out = ();
 148
 149         foreach my $l (@labels) {
 150                 $out{"$l"} = $self->get_label("$l");
 151         }
 152
 153         return \%out;
 154 }
 155
 156 sub nl($)
 157 {
 158         my ($self) = @_;
 159
 160         $self->add_code("\n");
 161 }
 162
 163 sub copy_result($)
 164 {
 165         my ($self) = @_;
 166
 167         my ($n) = $self->{n};
 168
 169         for (my $j = 0; $j < $n; $j++) {
 170                 $self->add_code(<<___);
 171         std             $tp[$j],`$j*$SIZE_T`($rp)
 172 ___
 173         }
 174
 175 }
 176
 177 sub mul_mont_fixed($)
 178 {
 179         my ($self) = @_;
 180
 181         my ($n) = $self->{n};
 182         my $fname = $self->get_function_name();
 183         my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
 184
 185         $self->add_code(<<___);
 186
 187 .globl  .${fname}
 188 .align  5
 189 .${fname}:
 190
 191 ___
 192
 193         $self->save_registers();
 194
 195         $self->add_code(<<___);
 196         li              $ze,0
 197         ld              $n0,0($n0)
 198
 199         ld              $bp0,0($bp)
 200
 201         ld              $apj,0($ap)
 202 ___
 203
 204         $self->mul_c_0($tp[0], $apj, $bp0, $c0);
 205
 206         for (my $j = 1; $j < $n - 1; $j++) {
 207                 $self->add_code(<<___);
 208         ld              $apj,`$j*$SIZE_T`($ap)
 209 ___
 210                 $self->mul($tp[$j], $apj, $bp0, $c0);
 211         }
 212
 213         $self->add_code(<<___);
 214         ld              $apj,`($n-1)*$SIZE_T`($ap)
 215 ___
 216
 217         $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
 218
 219         $self->add_code(<<___);
 220         li              $tp[$n+1],0
 221
 222 ___
 223
 224         $self->add_code(<<___);
 225         li              $i,0
 226         mtctr           $num
 227         b               $label->{"enter"}
 228
 229 .align  4
 230 $label->{"outer"}:
 231         ldx             $bpi,$bp,$i
 232
 233         ld              $apj,0($ap)
 234 ___
 235
 236         $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
 237
 238         for (my $j = 1; $j < $n; $j++) {
 239                 $self->add_code(<<___);
 240         ld              $apj,`$j*$SIZE_T`($ap)
 241 ___
 242                 $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
 243         }
 244
 245         $self->add_code(<<___);
 246         addc            $tp[$n],$tp[$n],$c0
 247         addze           $tp[$n+1],$ze
 248 ___
 249
 250         $self->add_code(<<___);
 251 .align  4
 252 $label->{"enter"}:
 253         mulld           $bpi,$tp[0],$n0
 254
 255         ld              $npj,0($np)
 256 ___
 257
 258         $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
 259
 260         for (my $j = 1; $j < $n; $j++) {
 261                 $self->add_code(<<___);
 262         ld              $npj,`$j*$SIZE_T`($np)
 263 ___
 264                 $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
 265         }
 266
 267         $self->add_code(<<___);
 268         addc            $tp[$n-1],$tp[$n],$c0
 269         addze           $tp[$n],$tp[$n+1]
 270
 271         addi            $i,$i,$SIZE_T
 272         bdnz            $label->{"outer"}
 273
 274         and.            $tp[$n],$tp[$n],$tp[$n]
 275         bne             $label->{"sub"}
 276
 277         cmpld           $tp[$n-1],$npj
 278         blt             $label->{"copy"}
 279
 280 $label->{"sub"}:
 281 ___
 282
 283         #
 284         # Reduction
 285         #
 286
 287                 $self->add_code(<<___);
 288         ld              $bpj,`0*$SIZE_T`($np)
 289         subfc           $c1,$bpj,$tp[0]
 290         std             $c1,`0*$SIZE_T`($rp)
 291
 292 ___
 293         for (my $j = 1; $j < $n - 1; $j++) {
 294                 $self->add_code(<<___);
 295         ld              $bpj,`$j*$SIZE_T`($np)
 296         subfe           $c1,$bpj,$tp[$j]
 297         std             $c1,`$j*$SIZE_T`($rp)
 298
 299 ___
 300         }
 301
 302                 $self->add_code(<<___);
 303         subfe           $c1,$npj,$tp[$n-1]
 304         std             $c1,`($n-1)*$SIZE_T`($rp)
 305
 306 ___
 307
 308         $self->add_code(<<___);
 309         addme.          $tp[$n],$tp[$n]
 310         beq             $label->{"end"}
 311
 312 $label->{"copy"}:
 313 ___
 314
 315         $self->copy_result();
 316
 317         $self->add_code(<<___);
 318
 319 $label->{"end"}:
 320 ___
 321
 322         $self->restore_registers();
 323
 324         $self->add_code(<<___);
 325         li              r3,1
 326         blr
 327 .size .${fname},.-.${fname}
 328 ___
 329
 330 }
 331
 332 package Mont::GPR;
 333
 334 our @ISA = ('Mont');
 335
 336 sub new($$)
 337 {
 338     my ($class, $n) = @_;
 339
 340     return $class->SUPER::new($n);
 341 }
 342
 343 sub save_registers($)
 344 {
 345         my ($self) = @_;
 346
 347         my $n = $self->{n};
 348
 349         $self->add_code(<<___);
 350         std     $lo,-8($sp)
 351 ___
 352
 353         for (my $j = 0; $j <= $n+1; $j++) {
 354                 $self->{code}.=<<___;
 355         std     $tp[$j],-`($j+2)*8`($sp)
 356 ___
 357         }
 358
 359         $self->add_code(<<___);
 360
 361 ___
 362 }
 363
 364 sub restore_registers($)
 365 {
 366         my ($self) = @_;
 367
 368         my $n = $self->{n};
 369
 370         $self->add_code(<<___);
 371         ld      $lo,-8($sp)
 372 ___
 373
 374         for (my $j = 0; $j <= $n+1; $j++) {
 375                 $self->{code}.=<<___;
 376         ld      $tp[$j],-`($j+2)*8`($sp)
 377 ___
 378         }
 379
 380         $self->{code} .=<<___;
 381
 382 ___
 383 }
 384
 385 # Direct translation of C mul()
 386 sub mul($$$$$)
 387 {
 388         my ($self, $r, $a, $w, $c) = @_;
 389
 390         $self->add_code(<<___);
 391         mulld           $lo,$a,$w
 392         addc            $r,$lo,$c
 393         mulhdu          $c,$a,$w
 394         addze           $c,$c
 395
 396 ___
 397 }
 398
 399 # Like mul() but $c is ignored as an input - an optimisation to save a
 400 # preliminary instruction that would set input $c to 0
 401 sub mul_c_0($$$$$)
 402 {
 403         my ($self, $r, $a, $w, $c) = @_;
 404
 405         $self->add_code(<<___);
 406         mulld           $r,$a,$w
 407         mulhdu          $c,$a,$w
 408
 409 ___
 410 }
 411
 412 # Like mul() but does not to the final addition of CA into $c - an
 413 # optimisation to save an instruction
 414 sub mul_last($$$$$$)
 415 {
 416         my ($self, $r1, $r2, $a, $w, $c) = @_;
 417
 418         $self->add_code(<<___);
 419         mulld           $lo,$a,$w
 420         addc            $r1,$lo,$c
 421         mulhdu          $c,$a,$w
 422
 423         addze           $r2,$c
 424 ___
 425 }
 426
 427 # Like C mul_add() but allow $r_out and $r_in to be different
 428 sub mul_add($$$$$$)
 429 {
 430         my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 431
 432         $self->add_code(<<___);
 433         mulld           $lo,$a,$w
 434         addc            $lo,$lo,$c
 435         mulhdu          $c,$a,$w
 436         addze           $c,$c
 437         addc            $r_out,$r_in,$lo
 438         addze           $c,$c
 439
 440 ___
 441 }
 442
 443 # Like mul_add() but $c is ignored as an input - an optimisation to save a
 444 # preliminary instruction that would set input $c to 0
 445 sub mul_add_c_0($$$$$$)
 446 {
 447         my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 448
 449         $self->add_code(<<___);
 450         mulld           $lo,$a,$w
 451         addc            $r_out,$r_in,$lo
 452         mulhdu          $c,$a,$w
 453         addze           $c,$c
 454
 455 ___
 456 }
 457
 458 package Mont::GPR_300;
 459
 460 our @ISA = ('Mont::GPR');
 461
 462 sub new($$)
 463 {
 464         my ($class, $n) = @_;
 465
 466         my $mont = $class->SUPER::new($n);
 467
 468         return $mont;
 469 }
 470
 471 sub get_function_name($)
 472 {
 473         my ($self) = @_;
 474
 475         return "bn_mul_mont_300_fixed_n" . $self->{n};
 476 }
 477
 478 sub get_label($$)
 479 {
 480         my ($self, $l) = @_;
 481
 482         return "L" . $l . "_300_" . $self->{n};
 483 }
 484
 485 # Direct translation of C mul()
 486 sub mul($$$$$)
 487 {
 488         my ($self, $r, $a, $w, $c, $last) = @_;
 489
 490         $self->add_code(<<___);
 491         maddld          $r,$a,$w,$c
 492         maddhdu         $c,$a,$w,$c
 493
 494 ___
 495 }
 496
 497 # Save the last carry as the final entry
 498 sub mul_last($$$$$)
 499 {
 500         my ($self, $r1, $r2, $a, $w, $c) = @_;
 501
 502         $self->add_code(<<___);
 503         maddld          $r1,$a,$w,$c
 504         maddhdu         $r2,$a,$w,$c
 505
 506 ___
 507 }
 508
 509 # Like mul() but $c is ignored as an input - an optimisation to save a
 510 # preliminary instruction that would set input $c to 0
 511 sub mul_c_0($$$$$)
 512 {
 513         my ($self, $r, $a, $w, $c) = @_;
 514
 515         $self->add_code(<<___);
 516         mulld          $r,$a,$w
 517         mulhdu          $c,$a,$w
 518
 519 ___
 520 }
 521
 522 # Like C mul_add() but allow $r_out and $r_in to be different
 523 sub mul_add($$$$$$)
 524 {
 525         my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 526
 527         $self->add_code(<<___);
 528         maddld          $lo,$a,$w,$c
 529         maddhdu         $c,$a,$w,$c
 530         addc            $r_out,$r_in,$lo
 531         addze           $c,$c
 532
 533 ___
 534 }
 535
 536 # Like mul_add() but $c is ignored as an input - an optimisation to save a
 537 # preliminary instruction that would set input $c to 0
 538 sub mul_add_c_0($$$$$$)
 539 {
 540         my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 541
 542         $self->add_code(<<___);
 543         maddld          $lo,$a,$w,$r_in
 544         maddhdu         $c,$a,$w,$r_in
 545 ___
 546
 547         if ($r_out ne $lo) {
 548                 $self->add_code(<<___);
 549         mr                      $r_out,$lo
 550 ___
 551         }
 552
 553         $self->nl();
 554 }
 555
 556
 557 package main;
 558
 559 my $code;
 560
 561 $code.=<<___;
 562 .machine "any"
 563 .text
 564 ___
 565
 566 my $mont;
 567
 568 $mont = new Mont::GPR(6);
 569 $mont->mul_mont_fixed();
 570 $code .= $mont->get_code();
 571
 572 $mont = new Mont::GPR_300(6);
 573 $mont->mul_mont_fixed();
 574 $code .= $mont->get_code();
 575
 576 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 577
 578 $code.=<<___;
 579 .asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
 580 ___
 581
 582 print $code;
 583 close STDOUT or die "error closing STDOUT: $!";