#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont
# code base and compiler-generated code with in-lined umull and even
# umlal instructions. The latter means that this code didn't really
# have an "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

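# For reference, the word-by-word Montgomery multiplication that both
# code paths below implement, as C-like pseudocode (an illustrative
# sketch only; names match the register comments further down, and
# tp[] starts out as zero):
#
#	// rp[] = ap[]*bp[] / 2^(32*num) mod np[],
#	// with n0 = -np[0]^-1 mod 2^32
#	for (i = 0; i < num; i++) {
#		m  = (tp[0] + ap[0]*bp[i]) * n0 mod 2^32;
#		tp = (tp + ap*bp[i] + m*np) / 2^32;	// num+1-word result
#	}
#	if (tp >= np) tp -= np;		// conditional final subtraction
#	rp = tp;
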
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this
# is rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The NEON code is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with the November 2013 improvements,
# i.e. NEON code is now ~20-105% faster than the integer-only one on
# this processor. But this optimization further improved performance
# even on other processors: the NEON code path is ~45-180% faster
# than the original integer-only on Cortex-A8, ~10-210% on
# Cortex-A15, ~70-450% on Snapdragon S4.

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
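# For orientation, the words at and above &tp[num-1] after the
# prologue below (a recap derived from that prologue, offsets in
# 32-bit words relative to $num, i.e. &tp[num-1]):
#
#	+0..+1		tp[num-1], tp[num]
#	+2..+11		saved r4-r12,lr
#	+12		rp	(saved r0)
#	+13		bp	(saved r2)
#	+14		&n0	(original 5th argument)
#	+15		num	(original 6th argument)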

$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

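@ i==0 pass: tp[] = (ap[]*bp[0] + m*np[])/2^32, where
@ m = ap[0]*bp[0]*n0 mod 2^32 (see the pseudocode sketch at the top)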
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

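# A note on the NEON strategy (an explanatory gloss, not part of the
# original commentary): each 32-bit b[i] and m word is widened with
# vzip.16 into two 32-bit lanes holding its 16-bit halves, so that
# vmull/vmlal.u32 accumulate 32x16-bit partial products in 64-bit
# lanes. The headroom this leaves allows carry propagation to be
# deferred and then performed 16 bits at a time, which is what the
# recurring vshr.u64 #16 / vadd.u64 / vzip.16 sequences below do.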
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	vzip.16		$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

.align	4
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$inner,$num
	b		.LNEON_tail_entry

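@ Final carry propagation (an explanatory note, per the 16-bit carry
@ scheme described above): each q accumulator holds two 64-bit lanes,
@ one per 16-bit half of a result word; carries are folded 16 bits at
@ a time and vzip.16 repacks the halves into 32-bit output words.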
.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Post-process the generated code: evaluate backquoted expressions,
# expand the synthetic q<n>#lo/q<n>#hi notation into the matching
# d registers, lower "ret" to "bx lr", and encode any pre-existing
# "bx lr" (the ARMv4 compatibility path) as a raw .word.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx	lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;