2 # Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
11 # <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
12 # the OpenSSL project.
13 # ====================================================================
16 # Fixed length (n=6), unrolled PPC Montgomery Multiplication
21 # Although this is a generic implementation for unrolling Montgomery
22 # Multiplication for arbitrary values of n, this is currently only
23 # used for n = 6 to improve the performance of ECC p384.
25 # Unrolling allows intermediate results to be stored in registers,
26 # rather than on the stack, improving performance by ~7% compared to
27 # the existing PPC assembly code.
29 # The ISA 3.0 implementation uses combination multiply/add
30 # instructions (maddld, maddhdu) to improve performance by an
31 # additional ~10% on Power 9.
33 # Finally, saving non-volatile registers into volatile vector
34 # registers instead of onto the stack saves a little more.
36 # On a Power 9 machine we see an overall improvement of ~18%.
42 my ($flavour, $output, $dir, $xlate);
# Command-line handling: both arguments are optional.  @ARGV is consumed
# from the end (pop) for the output file and from the front (shift) for
# the flavour; either variable is left undef when its test does not match.
44 # $output is the last argument if it looks like a file (it has an extension)
45 # $flavour is the first argument if it doesn't look like a file
46 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
47 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Capture the directory part of this script's own path ($0), including the
# trailing separator ("/" or "\"), so the translator can be located
# relative to it.
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for ppc-xlate.pl first next to this script, then under
# ../../perlasm relative to it; give up if neither candidate is a file.
50 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52 die "can't locate ppc-xlate.pl";
# Re-open STDOUT as a pipe into a child perl ($^X) running the translator,
# passing the flavour and the (quoted) output file name.  Anything written
# to STDOUT after this point is fed to that process.
54 open STDOUT,"| $^X $xlate $flavour \"$output\""
55 or die "can't call $xlate: $!";
57 if ($flavour !~ /64/) {
58 die "bad flavour ($flavour) - only ppc64 permitted";
63 # Registers are global so the code is remotely readable
65 # Parameters for Montgomery multiplication
87 # Non-volatile registers used for tp[i]
89 # 12 registers are available but the limit on unrolling is 10,
90 # since registers from $tp[0] to $tp[$n+1] are used.
91 my @tp = ("r20" .. "r31");
93 # volatile VSRs for saving non-volatile GPRs - faster than stack
94 my @vsrs = ("v32" .. "v46");
100 my ($class, $n) = @_;
103 die "Can't unroll for BN length ${n} (maximum 10)"
126 return $self->{code};
129 sub get_function_name($)
133 return "bn_mul_mont_fixed_n" . $self->{n};
140 return "L" . $l . "_" . $self->{n};
145 my ($self, @labels) = @_;
149 foreach my $l (@labels) {
150 $out{"$l"} = $self->get_label("$l");
160 $self->add_code("\n");
167 my ($n) = $self->{n};
169 for (my $j = 0; $j < $n; $j++) {
170 $self->add_code(<<___);
171 std $tp[$j],`$j*$SIZE_T`($rp)
177 sub mul_mont_fixed($)
181 my ($n) = $self->{n};
182 my $fname = $self->get_function_name();
183 my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
185 $self->add_code(<<___);
193 $self->save_registers();
195 $self->add_code(<<___);
204 $self->mul_c_0($tp[0], $apj, $bp0, $c0);
206 for (my $j = 1; $j < $n - 1; $j++) {
207 $self->add_code(<<___);
208 ld $apj,`$j*$SIZE_T`($ap)
210 $self->mul($tp[$j], $apj, $bp0, $c0);
213 $self->add_code(<<___);
214 ld $apj,`($n-1)*$SIZE_T`($ap)
217 $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
219 $self->add_code(<<___);
224 $self->add_code(<<___);
236 $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
238 for (my $j = 1; $j < $n; $j++) {
239 $self->add_code(<<___);
240 ld $apj,`$j*$SIZE_T`($ap)
242 $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
245 $self->add_code(<<___);
246 addc $tp[$n],$tp[$n],$c0
250 $self->add_code(<<___);
253 mulld $bpi,$tp[0],$n0
258 $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
260 for (my $j = 1; $j < $n; $j++) {
261 $self->add_code(<<___);
262 ld $npj,`$j*$SIZE_T`($np)
264 $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
267 $self->add_code(<<___);
268 addc $tp[$n-1],$tp[$n],$c0
269 addze $tp[$n],$tp[$n+1]
272 bdnz $label->{"outer"}
274 and. $tp[$n],$tp[$n],$tp[$n]
287 $self->add_code(<<___);
288 ld $bpj,`0*$SIZE_T`($np)
289 subfc $c1,$bpj,$tp[0]
290 std $c1,`0*$SIZE_T`($rp)
293 for (my $j = 1; $j < $n - 1; $j++) {
294 $self->add_code(<<___);
295 ld $bpj,`$j*$SIZE_T`($np)
296 subfe $c1,$bpj,$tp[$j]
297 std $c1,`$j*$SIZE_T`($rp)
302 $self->add_code(<<___);
303 subfe $c1,$npj,$tp[$n-1]
304 std $c1,`($n-1)*$SIZE_T`($rp)
308 $self->add_code(<<___);
309 addme. $tp[$n],$tp[$n]
315 $self->copy_result();
317 $self->add_code(<<___);
322 $self->restore_registers();
324 $self->add_code(<<___);
327 .size .${fname},.-.${fname}
338 my ($class, $n) = @_;
340 return $class->SUPER::new($n);
343 sub save_registers($)
349 $self->add_code(<<___);
353 for (my $j = 0; $j <= $n+1; $j++) {
354 $self->{code}.=<<___;
355 std $tp[$j],-`($j+2)*8`($sp)
359 $self->add_code(<<___);
364 sub restore_registers($)
370 $self->add_code(<<___);
374 for (my $j = 0; $j <= $n+1; $j++) {
375 $self->{code}.=<<___;
376 ld $tp[$j],-`($j+2)*8`($sp)
380 $self->{code} .=<<___;
385 # Direct translation of C mul()
388 my ($self, $r, $a, $w, $c) = @_;
390 $self->add_code(<<___);
399 # Like mul() but $c is ignored as an input - an optimisation to save a
400 # preliminary instruction that would set input $c to 0
403 my ($self, $r, $a, $w, $c) = @_;
405 $self->add_code(<<___);
412 # Like mul() but does not do the final addition of CA into $c - an
413 # optimisation to save an instruction
416 my ($self, $r1, $r2, $a, $w, $c) = @_;
418 $self->add_code(<<___);
427 # Like C mul_add() but allow $r_out and $r_in to be different
430 my ($self, $r_out, $r_in, $a, $w, $c) = @_;
432 $self->add_code(<<___);
437 addc $r_out,$r_in,$lo
443 # Like mul_add() but $c is ignored as an input - an optimisation to save a
444 # preliminary instruction that would set input $c to 0
445 sub mul_add_c_0($$$$$$)
447 my ($self, $r_out, $r_in, $a, $w, $c) = @_;
449 $self->add_code(<<___);
451 addc $r_out,$r_in,$lo
458 package Mont::GPR_300;
460 our @ISA = ('Mont::GPR');
464 my ($class, $n) = @_;
466 my $mont = $class->SUPER::new($n);
471 sub get_function_name($)
475 return "bn_mul_mont_300_fixed_n" . $self->{n};
482 return "L" . $l . "_300_" . $self->{n};
485 # Direct translation of C mul()
488 my ($self, $r, $a, $w, $c, $last) = @_;
490 $self->add_code(<<___);
497 # Save the last carry as the final entry
500 my ($self, $r1, $r2, $a, $w, $c) = @_;
502 $self->add_code(<<___);
509 # Like mul() but $c is ignored as an input - an optimisation to save a
510 # preliminary instruction that would set input $c to 0
513 my ($self, $r, $a, $w, $c) = @_;
515 $self->add_code(<<___);
522 # Like C mul_add() but allow $r_out and $r_in to be different
525 my ($self, $r_out, $r_in, $a, $w, $c) = @_;
527 $self->add_code(<<___);
530 addc $r_out,$r_in,$lo
536 # Like mul_add() but $c is ignored as an input - an optimisation to save a
537 # preliminary instruction that would set input $c to 0
538 sub mul_add_c_0($$$$$$)
540 my ($self, $r_out, $r_in, $a, $w, $c) = @_;
542 $self->add_code(<<___);
543 maddld $lo,$a,$w,$r_in
544 maddhdu $c,$a,$w,$r_in
548 $self->add_code(<<___);
# Generate both fixed-length (n = 6) variants and append their text to
# $code: first the generic GPR implementation, then the ISA 3.0 one that
# uses the maddld/maddhdu multiply-add instructions.
#
# Constructors are invoked as Class->new(...) rather than with the
# indirect object form "new Class(...)": the indirect form is parsed
# heuristically and is discouraged by perlobj.
$mont = Mont::GPR->new(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();

$mont = Mont::GPR_300->new(6);
$mont->mul_mont_fixed();
$code .= $mont->get_code();

# Replace every backtick-quoted expression embedded in the generated text
# with the result of evaluating it as Perl (e.g. offset arithmetic).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
579 .asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
583 close STDOUT or die "error closing STDOUT: $!";