#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with inlined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; the ABI and instruction syntax are identical.

# November 2013.
#
# Add a NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, i.e. 4.8x faster, for RSA4096 sign. But that is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The NEON path is still
# chosen for execution on all NEON-capable processors, because the gain
# on others outweighs the marginal loss on Cortex-A9.

# September 2015.
#
# Align Cortex-A9 performance with the November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than the integer-only code on this
# processor. This optimization further improved performance even on
# other processors: the NEON code path is ~45-180% faster than the
# original integer-only code on Cortex-A8, ~10-210% on Cortex-A15,
# ~70-450% on Snapdragon S4.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
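# A hypothetical invocation (the flavour has to be one that arm-xlate.pl
# understands, e.g. "linux32"; the output file name is only an example):
#
#   perl armv4-mont.pl linux32 armv4-mont.S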
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$num="r0";      # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########     # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";     # sl, gcc uses it to keep @GOT
$ahi="r11";     # fp
$nlo="r12";     # ip
###########     # r13 is stack pointer
$nhi="r14";     # lr
###########     # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";     $_bpend=$_num;

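# For reference, the routine below implements OpenSSL's usual bn_mul_mont()
# contract:
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# rp, ap, bp, np arrive in r0-r3; n0 and num come from the caller's stack,
# and the return value tells the caller whether the assembly handled the
# input (num<2 is rejected). What the integer-only path computes can be
# modelled by the C sketch below; mont_mul_ref and its flat 32-bit types
# are illustrative only (the real function takes n0 by reference), with
# n0 being -np[0]^-1 mod 2^32:
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void mont_mul_ref(uint32_t *rp, const uint32_t *ap,
#                            const uint32_t *bp, const uint32_t *np,
#                            uint32_t n0, int num)
#   {
#       uint32_t tp[num + 1];                   /* accumulator, one extra word */
#       memset(tp, 0, sizeof(tp));
#
#       for (int i = 0; i < num; i++) {         /* one pass per word of bp[] */
#           uint64_t a = (uint64_t)ap[0] * bp[i] + tp[0];
#           uint32_t m = (uint32_t)a * n0;      /* per-pass multiplier */
#           uint64_t n = (uint64_t)np[0] * m + (uint32_t)a; /* low word is 0 */
#
#           for (int j = 1; j < num; j++) {
#               a = (uint64_t)ap[j] * bp[i] + tp[j] + (a >> 32);
#               n = (uint64_t)np[j] * m + (n >> 32) + (uint32_t)a;
#               tp[j - 1] = (uint32_t)n;
#           }
#           n = (n >> 32) + (a >> 32) + tp[num];
#           tp[num - 1] = (uint32_t)n;
#           tp[num] = (uint32_t)(n >> 32);
#       }
#
#       /* final reduction: rp = tp - np unless that borrows, else rp = tp */
#       uint64_t borrow = 0;
#       for (int j = 0; j < num; j++) {
#           uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
#           rp[j] = (uint32_t)d;
#           borrow = (d >> 32) & 1;
#       }
#       if (tp[num] < borrow)
#           memcpy(rp, tp, num * sizeof(uint32_t));
#   }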
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.text

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
# ifdef _WIN32
.word   OPENSSL_armcap_P
# else
.word   OPENSSL_armcap_P-.Lbn_mul_mont
# endif
#endif

.global bn_mul_mont
.type   bn_mul_mont,%function

.align  5
bn_mul_mont:
.Lbn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu
        ldr     r0,.LOPENSSL_armcap
#if !defined(_WIN32)
        adr     r2,.Lbn_mul_mont
        ldr     r0,[r0,r2]
# endif
# if defined(__APPLE__) || defined(_WIN32)
        ldr     r0,[r0]
# endif
        tst     r0,#ARMV7_NEON          @ NEON available?
        ldmia   sp, {r0,r2}
        beq     .Lialu
        add     sp,sp,#8
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     $num,ip                 @ load num
#ifdef  __thumb2__
        ittt    lt
#endif
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt

        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     $num,$num,lsl#2         @ rescale $num for byte count
        sub     sp,sp,$num              @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     $num,$num,#4            @ "num=num-1"
        add     $tp,$bp,$num            @ &bp[num-1]

        add     $num,sp,$num            @ $num to point at &tp[num-1]
        ldr     $n0,[$_n0]              @ &n0
        ldr     $bi,[$bp]               @ bp[0]
        ldr     $aj,[$ap],#4            @ ap[0],ap++
        ldr     $nj,[$np],#4            @ np[0],np++
        ldr     $n0,[$n0]               @ *n0
        str     $tp,[$_bpend]           @ save &bp[num]

        umull   $alo,$ahi,$aj,$bi       @ ap[0]*bp[0]
        str     $n0,[$_n0]              @ save n0 value
        mul     $n0,$alo,$n0            @ "tp[0]"*n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"t[0]"
        mov     $tp,sp

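@ First pass: walk ap[] and np[] once, accumulating ap[j]*bp[0] plus
@ np[j]*m into tp[] (the register named "n0" holds the per-pass
@ multiplier m = ap[0]*bp[0]*n0 mod 2^32 by now), one word at a time.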
.L1st:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        mov     $alo,$ahi
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[0]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .L1st

        adds    $nlo,$nlo,$ahi
        ldr     $tp,[$_bp]              @ restore bp
        mov     $nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        mov     $tj,sp
        str     $nhi,[$num,#4]          @ tp[num]=
\f
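@ Outer loop, one iteration per remaining word of bp[]: derive the next
@ per-pass multiplier from tp[0]+ap[0]*bp[i], then fold ap[]*bp[i] and
@ np[]*m into tp[], retiring one (zero) word per pass, i.e. dividing the
@ running sum by 2^32 each time around.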
.Louter:
        sub     $tj,$num,$tj            @ "original" $num-1 value
        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
        ldr     $bi,[$tp,#4]!           @ *(++bp)
        sub     $np,$np,$tj             @ "rewind" np to &np[1]
        ldr     $aj,[$ap,#-4]           @ ap[0]
        ldr     $alo,[sp]               @ tp[0]
        ldr     $nj,[$np,#-4]           @ np[0]
        ldr     $tj,[sp,#4]             @ tp[1]

        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[0]*bp[i]+tp[0]
        str     $tp,[$_bp]              @ save bp
        mul     $n0,$alo,$n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
        mov     $tp,sp

.Linner:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        adds    $alo,$ahi,$tj           @ +=tp[j]
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[i]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adc     $ahi,$ahi,#0
        ldr     $tj,[$tp,#8]            @ tp[j+1]
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .Linner

        adds    $nlo,$nlo,$ahi
        mov     $nhi,#0
        ldr     $tp,[$_bp]              @ restore bp
        adc     $nhi,$nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adds    $nlo,$nlo,$tj
        ldr     $tj,[$_bpend]           @ restore &bp[num]
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        str     $nhi,[$num,#4]          @ tp[num]=

        cmp     $tp,$tj
#ifdef  __thumb2__
        itt     ne
#endif
        movne   $tj,sp
        bne     .Louter
\f
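@ Final reduction: subtract the modulus from tp[] into rp[], then keep
@ either the difference or tp[] itself depending on the borrow, zapping
@ tp[] with junk (the stack pointer) as it is copied out.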
        ldr     $rp,[$_rp]              @ pull rp
        mov     $aj,sp
        add     $num,$num,#4            @ $num to point at &tp[num]
        sub     $aj,$num,$aj            @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
        mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]

        subs    $tj,$tj,$tj             @ "clear" carry flag
.Lsub:  ldr     $tj,[$tp],#4
        ldr     $nj,[$np],#4
        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
        str     $tj,[$rp],#4            @ rp[j]=
        teq     $tp,$num                @ preserve carry
        bne     .Lsub
        sbcs    $nhi,$nhi,#0            @ upmost carry
        mov     $tp,sp                  @ "rewind" $tp
        sub     $rp,$rp,$aj             @ "rewind" $rp

.Lcopy: ldr     $tj,[$tp]               @ conditional copy
        ldr     $aj,[$rp]
        str     sp,[$tp],#4             @ zap tp
#ifdef  __thumb2__
        it      cc
#endif
        movcc   $aj,$tj
        str     $aj,[$rp],#4
        teq     $tp,$num                @ preserve carry
        bne     .Lcopy

        mov     sp,$num
        add     sp,sp,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        ret                             @ bx lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

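# The NEON path below relies on splitting each 32-bit word of bp[] (and of
# the per-pass multiplier Ni) into zero-extended 16-bit halves via vzip.16,
# so every vmlal.u32 adds a <=48-bit product into a 64-bit lane and carries
# only have to be propagated once per output word (the vshr.u64/vadd.u64
# pairs). A hypothetical C helper modelling that split (illustration only):
#
#   /* present one 32-bit word as the two zero-extended 16-bit lanes
#      that the vmlal.u32 instructions consume */
#   static void zip16(uint32_t w, uint32_t lanes[2])
#   {
#       lanes[0] = w & 0xffff;      /* low half  */
#       lanes[1] = w >> 16;         /* high half */
#   }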
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        mov     ip,sp
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block
        mov     ip,sp

        cmp     $num,#8
        bhi     .LNEON_8n

        @ special case for $num==8, everything is in register bank...

        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        sub             $toutptr,sp,$num,lsl#4
        vld1.32         {$A0-$A3},  [$aptr]!            @ can't specify :32 :-(
        and             $toutptr,$toutptr,#-64
        vld1.32         {${M0}[0]}, [$n0,:32]
        mov             sp,$toutptr                     @ alloca
        vzip.16         $Bi,$zero

        vmull.u32       @ACC[0],$Bi,${A0}[0]
        vmull.u32       @ACC[1],$Bi,${A0}[1]
        vmull.u32       @ACC[2],$Bi,${A1}[0]
        vshl.i64        $Ni,@ACC[0]#hi,#16
        vmull.u32       @ACC[3],$Bi,${A1}[1]

        vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
        vmul.u32        $Ni,$Ni,$M0

        vmull.u32       @ACC[4],$Bi,${A2}[0]
         vld1.32        {$N0-$N3}, [$nptr]!
        vmull.u32       @ACC[5],$Bi,${A2}[1]
        vmull.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmull.u32       @ACC[7],$Bi,${A3}[1]

        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        sub             $outer,$num,#1
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]

        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmov            $Temp,@ACC[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmov            @ACC[0],@ACC[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmov            @ACC[1],@ACC[2]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vmov            @ACC[2],@ACC[3]
        vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
        vmov            @ACC[4],@ACC[5]
        vmov            @ACC[5],@ACC[6]
        vadd.u64        $temp,$temp,$Temp#hi
        vmov            @ACC[6],@ACC[7]
        veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16

        b       .LNEON_outer8

.align  4
.LNEON_outer8:
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp

        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        vshl.i64        $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]

        vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
        subs            $outer,$outer,#1
        vmul.u32        $Ni,$Ni,$M0

        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]

        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]

        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmov            $Temp,@ACC[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmov            @ACC[0],@ACC[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmov            @ACC[1],@ACC[2]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vmov            @ACC[2],@ACC[3]
        vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
        vmov            @ACC[4],@ACC[5]
        vmov            @ACC[5],@ACC[6]
        vadd.u64        $temp,$temp,$Temp#hi
        vmov            @ACC[6],@ACC[7]
        veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16

        bne     .LNEON_outer8

        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
        mov             $toutptr,sp
        vshr.u64        $temp,@ACC[0]#lo,#16
        mov             $inner,$num
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        add             $tinptr,sp,#96
        vshr.u64        $temp,@ACC[0]#hi,#16
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

        b       .LNEON_tail_entry

.align  4
.LNEON_8n:
        veor            @ACC[0],@ACC[0],@ACC[0]
         sub            $toutptr,sp,#128
        veor            @ACC[1],@ACC[1],@ACC[1]
         sub            $toutptr,$toutptr,$num,lsl#4
        veor            @ACC[2],@ACC[2],@ACC[2]
         and            $toutptr,$toutptr,#-64
        veor            @ACC[3],@ACC[3],@ACC[3]
         mov            sp,$toutptr                     @ alloca
        veor            @ACC[4],@ACC[4],@ACC[4]
         add            $toutptr,$toutptr,#256
        veor            @ACC[5],@ACC[5],@ACC[5]
         sub            $inner,$num,#8
        veor            @ACC[6],@ACC[6],@ACC[6]
        veor            @ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
        vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
        subs            $inner,$inner,#8
        vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
        vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
        vst1.64         {@ACC[6]-@ACC[7]},[$toutptr,:256]!
        bne             .LNEON_8n_init

        add             $tinptr,sp,#256
        vld1.32         {$A0-$A3},[$aptr]!
        add             $bnptr,sp,#8
        vld1.32         {${M0}[0]},[$n0,:32]
        mov             $outer,$num
        b               .LNEON_8n_outer

.align  4
.LNEON_8n_outer:
        vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
        add             $toutptr,sp,#128
        vld1.32         {$N0-$N3},[$nptr]!

        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
         veor           $zero,$zero,$zero
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
         vshl.i64       $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
         vadd.u64       $Ni,$Ni,@ACC[0]#lo
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
         vmul.u32       $Ni,$Ni,$M0
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vst1.32         {$Bi},[sp,:64]          @ put aside smashed b[8*i+0]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
         vzip.16        $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
        vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        veor            $temp,$temp,$temp
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vzip.16         $Bi,$temp
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
         vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
         vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
        vst1.32         {$Ni},[$bnptr,:64]!     @ put aside smashed m[8*i+$i]
___
        push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]!
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
         veor           $zero,$zero,$zero
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
         vshl.i64       $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
         vadd.u64       $Ni,$Ni,@ACC[0]#lo
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
         vmul.u32       $Ni,$Ni,$M0
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vst1.32         {$Bi},[$bnptr,:64]!     @ put aside smashed b[8*i+$i]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
         vzip.16        $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
        vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vld1.32         {$A0-$A3},[$aptr]!
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
         vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
         vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
        vst1.32         {$Ni},[$bnptr,:64]      @ put aside smashed m[8*i+$i]
        add             $bnptr,sp,#8            @ rewind
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        sub             $inner,$num,#8
        b               .LNEON_8n_inner

.align  4
.LNEON_8n_inner:
        subs            $inner,$inner,#8
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+0]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        vld1.32         {$N0-$N3},[$nptr]!
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
        it              ne
        addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        vld1.32         {$Bi},[$bnptr,:64]!     @ pull smashed b[8*i+$i]
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vst1.64         {@ACC[0]},[$toutptr,:128]!
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+$i]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        it              ne
        addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
        it              eq
        subeq           $aptr,$aptr,$num,lsl#2  @ rewind
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vld1.32         {$A0-$A3},[$aptr]!
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        add             $bnptr,sp,#8            @ rewind
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vst1.64         {@ACC[0]},[$toutptr,:128]!
        vmlal.u32       @ACC[7],$Ni,${N3}[1]

        bne             .LNEON_8n_inner
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        add             $tinptr,sp,#128
        vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
        veor            q2,q2,q2                @ $N0-$N1
        vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
        veor            q3,q3,q3                @ $N2-$N3
        vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
        vst1.64         {@ACC[6]},[$toutptr,:128]

        subs            $outer,$outer,#8
        vld1.64         {@ACC[0]-@ACC[1]},[$tinptr,:256]!
        vld1.64         {@ACC[2]-@ACC[3]},[$tinptr,:256]!
        vld1.64         {@ACC[4]-@ACC[5]},[$tinptr,:256]!
        vld1.64         {@ACC[6]-@ACC[7]},[$tinptr,:256]!

        itt             ne
        subne           $nptr,$nptr,$num,lsl#2  @ rewind
        bne             .LNEON_8n_outer

        add             $toutptr,sp,#128
        vst1.64         {q2-q3}, [sp,:256]!     @ start wiping stack frame
        vshr.u64        $temp,@ACC[0]#lo,#16
        vst1.64         {q2-q3},[sp,:256]!
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        vst1.64         {q2-q3}, [sp,:256]!
        vshr.u64        $temp,@ACC[0]#hi,#16
        vst1.64         {q2-q3}, [sp,:256]!
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

        mov             $inner,$num
        b               .LNEON_tail_entry

.align  4
.LNEON_tail:
        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
        vshr.u64        $temp,@ACC[0]#lo,#16
        vld1.64         {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        vld1.64         {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
        vshr.u64        $temp,@ACC[0]#hi,#16
        vld1.64         {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        vadd.u64        @ACC[1]#lo,@ACC[1]#lo,$temp
        vst1.32         {@ACC[0]#lo[0]}, [$toutptr, :32]!
        vshr.u64        $temp,@ACC[1]#lo,#16
        vadd.u64        @ACC[1]#hi,@ACC[1]#hi,$temp
        vshr.u64        $temp,@ACC[1]#hi,#16
        vzip.16         @ACC[1]#lo,@ACC[1]#hi
___
        push(@ACC,shift(@ACC));
}
        push(@ACC,shift(@ACC));
$code.=<<___;
        vld1.64         {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
        subs            $inner,$inner,#8
        vst1.32         {@ACC[7]#lo[0]},   [$toutptr, :32]!
        bne     .LNEON_tail

        vst1.32 {${temp}[0]}, [$toutptr, :32]           @ top-most bit
        sub     $nptr,$nptr,$num,lsl#2                  @ rewind $nptr
        subs    $aptr,sp,#0                             @ clear carry flag
        add     $bptr,sp,$num,lsl#2

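@ Final reduction, as in the integer-only path: subtract the modulus
@ four words at a time, then conditionally copy the result back while
@ wiping the temporary frame, depending on the final borrow.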
.LNEON_sub:
        ldmia   $aptr!, {r4-r7}
        ldmia   $nptr!, {r8-r11}
        sbcs    r8, r4,r8
        sbcs    r9, r5,r9
        sbcs    r10,r6,r10
        sbcs    r11,r7,r11
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_sub

        ldr     r10, [$aptr]                            @ load top-most bit
        mov     r11,sp
        veor    q0,q0,q0
        sub     r11,$bptr,r11                           @ this is num*4
        veor    q1,q1,q1
        mov     $aptr,sp
        sub     $rptr,$rptr,r11                         @ rewind $rptr
        mov     $nptr,$bptr                             @ second 3/4th of frame
        sbcs    r10,r10,#0                              @ result is carry flag

.LNEON_copy_n_zap:
        ldmia   $aptr!, {r4-r7}
        ldmia   $rptr,  {r8-r11}
        it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        it      cc
        movcc   r11,r7
        ldmia   $aptr, {r4-r7}
        stmia   $rptr!, {r8-r11}
        sub     $aptr,$aptr,#16
        ldmia   $rptr, {r8-r11}
        it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$aptr,:256]!                  @ wipe
        itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        it      cc
        movcc   r11,r7
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_copy_n_zap

        mov     sp,ip
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
        ret                                             @ bx lr
.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

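# Post-process the generated text: q<N>#lo/#hi become the matching d
# registers (e.g. q6#lo -> d12, q6#hi -> d13), "ret" becomes "bx lr", and a
# literal "bx lr" is emitted as .word 0xe12fff1e so the result still
# assembles with -march=armv4.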
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge        or
        s/\bret\b/bx    lr/g                                            or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/g;      # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT or die "error closing STDOUT";