#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.
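# For reference, below is a minimal big-number model of what the generated
# bn_mul_mont computes, namely rp = ap*bp*R^(-1) mod np with R = 2^(32*num)
# and 0 <= ap,bp < np. It is an illustrative sketch only: it is never called
# by this generator, it leans on core Math::BigInt, and the sub name is
# ad-hoc rather than any OpenSSL API.
sub mont_mul_ref {
    my ($a,$b,$n,$num) = @_;    # Math::BigInt operands, $n odd
    require Math::BigInt;
    my $R  = Math::BigInt->bone->blsft(32*$num);
    my $ni = $n->copy->bmodinv($R)->bneg->bmod($R);  # -n^(-1) mod R
    my $t  = $a->copy->bmul($b);
    my $m  = $t->copy->bmul($ni)->bmod($R);          # whole-number analogue of "tp[0]"*n0
    $t->badd($m->bmul($n))->brsft(32*$num);          # (t+m*n)/R, exact by choice of $m
    $t->bsub($n) if $t->bcmp($n) >= 0;               # the conditional final subtraction
    return $t;
}
# The n0 stashed by the real code is the single-word counterpart of $ni,
# i.e. -np[0]^(-1) mod 2^32, which is why one mul per inner trip suffices.
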
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$num="r0";      # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########     # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";     # sl, gcc uses it to keep @GOT
$ahi="r11";     # fp
$nlo="r12";     # ip
###########     # r13 is stack pointer
$nhi="r14";     # lr
###########     # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";     $_bpend=$_num;
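# For orientation, a sketch of the integer-only frame as built by the
# prologue below (one 4-byte cell per row, higher addresses further down;
# recall that $num ends up pointing at &tp[num-1]):
#
#       sp .. $num              tp[0] .. tp[num-1]
#       $num,#1*4               tp[num]                 (the "+extra dword")
#       $num,#2*4 .. #11*4      saved r4-r12,lr
#       $num,#12*4              rp (stashed r0)
#       $num,#13*4              bp (stashed r2)
#       $num,#14*4              &n0, later the n0 value itself
#       $num,#15*4              num, later &bp[num]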

$code=<<___;
#include "arm_arch.h"

.text
.code   32

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global bn_mul_mont
.type   bn_mul_mont,%function

.align  5
bn_mul_mont:
.Lbn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7                   @ num not a multiple of 8?
        bne     .Lialu
        adr     r0,bn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]
#ifdef  __APPLE__
        ldr     r0,[r0]
#endif
        tst     r0,#1                   @ NEON available?
        ldmia   sp, {r0,r2}
        beq     .Lialu
        add     sp,sp,#8
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     $num,ip                 @ load num
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt

        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     $num,$num,lsl#2         @ rescale $num for byte count
        sub     sp,sp,$num              @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     $num,$num,#4            @ "num=num-1"
        add     $tp,$bp,$num            @ &bp[num-1]

        add     $num,sp,$num            @ $num to point at &tp[num-1]
        ldr     $n0,[$_n0]              @ &n0
        ldr     $bi,[$bp]               @ bp[0]
        ldr     $aj,[$ap],#4            @ ap[0],ap++
        ldr     $nj,[$np],#4            @ np[0],np++
        ldr     $n0,[$n0]               @ *n0
        str     $tp,[$_bpend]           @ save &bp[num]

        umull   $alo,$ahi,$aj,$bi       @ ap[0]*bp[0]
        str     $n0,[$_n0]              @ save n0 value
        mul     $n0,$alo,$n0            @ "tp[0]"*n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"t[0]"
        mov     $tp,sp

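        @ first outer pass: each trip below folds one word of ap[]*bp[0]
        @ and one word of np[]*n0 into tp[], the two umlal chains carrying
        @ their high halves forward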
.L1st:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        mov     $alo,$ahi
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[0]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .L1st

        adds    $nlo,$nlo,$ahi
        ldr     $tp,[$_bp]              @ restore bp
        mov     $nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        str     $nhi,[$num,#4]          @ tp[num]=

.Louter:
        sub     $tj,$num,sp             @ "original" $num-1 value
        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
        ldr     $bi,[$tp,#4]!           @ *(++bp)
        sub     $np,$np,$tj             @ "rewind" np to &np[1]
        ldr     $aj,[$ap,#-4]           @ ap[0]
        ldr     $alo,[sp]               @ tp[0]
        ldr     $nj,[$np,#-4]           @ np[0]
        ldr     $tj,[sp,#4]             @ tp[1]

        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[0]*bp[i]+tp[0]
        str     $tp,[$_bp]              @ save bp
        mul     $n0,$alo,$n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
        mov     $tp,sp

.Linner:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        adds    $alo,$ahi,$tj           @ +=tp[j]
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[i]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adc     $ahi,$ahi,#0
        ldr     $tj,[$tp,#8]            @ tp[j+1]
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .Linner

        adds    $nlo,$nlo,$ahi
        mov     $nhi,#0
        ldr     $tp,[$_bp]              @ restore bp
        adc     $nhi,$nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adds    $nlo,$nlo,$tj
        ldr     $tj,[$_bpend]           @ restore &bp[num]
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        str     $nhi,[$num,#4]          @ tp[num]=

        cmp     $tp,$tj
        bne     .Louter

        ldr     $rp,[$_rp]              @ pull rp
        add     $num,$num,#4            @ $num to point at &tp[num]
        sub     $aj,$num,sp             @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
        mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]

        subs    $tj,$tj,$tj             @ "clear" carry flag
.Lsub:  ldr     $tj,[$tp],#4
        ldr     $nj,[$np],#4
        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
        str     $tj,[$rp],#4            @ rp[j]=
        teq     $tp,$num                @ preserve carry
        bne     .Lsub
        sbcs    $nhi,$nhi,#0            @ upmost carry
        mov     $tp,sp                  @ "rewind" $tp
        sub     $rp,$rp,$aj             @ "rewind" $rp

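        @ $nhi is 0 here if the subtraction didn't borrow (tp>=np) and -1
        @ if it did, so and/bic/orr implement a branch-less select of the
        @ copy source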
        and     $ap,$tp,$nhi
        bic     $np,$rp,$nhi
        orr     $ap,$ap,$np             @ ap=borrow?tp:rp

.Lcopy: ldr     $tj,[$ap],#4            @ copy or in-place refresh
        str     sp,[$tp],#4             @ zap tp
        str     $tj,[$rp],#4
        cmp     $tp,$num
        bne     .Lcopy

        add     sp,$num,#4              @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        ret                             @ bx lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
___
{
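# NEON quad register qN aliases the doubleword pair d(2*N) (low half) and
# d(2*N+1) (high half); Dlo/Dhi below simply perform that name mapping.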
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        mov     ip,sp
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block

        sub             $toutptr,sp,#16
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        sub             $toutptr,$toutptr,$num,lsl#4
        vld1.32         {$A0-$A3},  [$aptr]!            @ can't specify :32 :-(
        and             $toutptr,$toutptr,#-64
        vld1.32         {${M0}[0]}, [$n0,:32]
        mov             sp,$toutptr                     @ alloca
        veor            $zero,$zero,$zero
        subs            $inner,$num,#8
        vzip.16         $Bi,$zero

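        @ bp[i] was zipped with zero above, so each 32-bit lane of $Bi
        @ holds one 16-bit half of the word; the vmull/vmlal.u32 below
        @ therefore form 16x32-bit partial products in 64-bit lanes, and
        @ the vshr #16 chains further down propagate carries in 16-bit
        @ steps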
        vmull.u32       $A0xB,$Bi,${A0}[0]
        vmull.u32       $A1xB,$Bi,${A0}[1]
        vmull.u32       $A2xB,$Bi,${A1}[0]
        vshl.i64        $temp,`&Dhi("$A0xB")`,#16
        vmull.u32       $A3xB,$Bi,${A1}[1]

        vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
        veor            $zero,$zero,$zero
        vmul.u32        $Ni,$temp,$M0

        vmull.u32       $A4xB,$Bi,${A2}[0]
         vld1.32        {$N0-$N3}, [$nptr]!
        vmull.u32       $A5xB,$Bi,${A2}[1]
        vmull.u32       $A6xB,$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmull.u32       $A7xB,$Bi,${A3}[1]

        bne     .LNEON_1st

        @ special case for num=8, everything is in register bank...

        vmlal.u32       $A0xB,$Ni,${N0}[0]
        sub             $outer,$num,#1
        vmlal.u32       $A1xB,$Ni,${N0}[1]
        vmlal.u32       $A2xB,$Ni,${N1}[0]
        vmlal.u32       $A3xB,$Ni,${N1}[1]

        vmlal.u32       $A4xB,$Ni,${N2}[0]
        vmov            $Temp,$A0xB
        vmlal.u32       $A5xB,$Ni,${N2}[1]
        vmov            $A0xB,$A1xB
        vmlal.u32       $A6xB,$Ni,${N3}[0]
        vmov            $A1xB,$A2xB
        vmlal.u32       $A7xB,$Ni,${N3}[1]
        vmov            $A2xB,$A3xB
        vmov            $A3xB,$A4xB
        vshr.u64        $temp,$temp,#16
        vmov            $A4xB,$A5xB
        vmov            $A5xB,$A6xB
        vadd.u64        $temp,$temp,`&Dhi("$Temp")`
        vmov            $A6xB,$A7xB
        veor            $A7xB,$A7xB
        vshr.u64        $temp,$temp,#16

        b       .LNEON_outer8

.align  4
.LNEON_outer8:
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
        vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

        vmlal.u32       $A0xB,$Bi,${A0}[0]
        vmlal.u32       $A1xB,$Bi,${A0}[1]
        vmlal.u32       $A2xB,$Bi,${A1}[0]
        vshl.i64        $temp,`&Dhi("$A0xB")`,#16
        vmlal.u32       $A3xB,$Bi,${A1}[1]

        vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
        veor            $zero,$zero,$zero
        subs            $outer,$outer,#1
        vmul.u32        $Ni,$temp,$M0

        vmlal.u32       $A4xB,$Bi,${A2}[0]
        vmlal.u32       $A5xB,$Bi,${A2}[1]
        vmlal.u32       $A6xB,$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmlal.u32       $A7xB,$Bi,${A3}[1]

        vmlal.u32       $A0xB,$Ni,${N0}[0]
        vmlal.u32       $A1xB,$Ni,${N0}[1]
        vmlal.u32       $A2xB,$Ni,${N1}[0]
        vmlal.u32       $A3xB,$Ni,${N1}[1]

        vmlal.u32       $A4xB,$Ni,${N2}[0]
        vmov            $Temp,$A0xB
        vmlal.u32       $A5xB,$Ni,${N2}[1]
        vmov            $A0xB,$A1xB
        vmlal.u32       $A6xB,$Ni,${N3}[0]
        vmov            $A1xB,$A2xB
        vmlal.u32       $A7xB,$Ni,${N3}[1]
        vmov            $A2xB,$A3xB
        vmov            $A3xB,$A4xB
        vshr.u64        $temp,$temp,#16
        vmov            $A4xB,$A5xB
        vmov            $A5xB,$A6xB
        vadd.u64        $temp,$temp,`&Dhi("$Temp")`
        vmov            $A6xB,$A7xB
        veor            $A7xB,$A7xB
        vshr.u64        $temp,$temp,#16

        bne     .LNEON_outer8

        vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
        mov             $toutptr,sp
        vshr.u64        $temp,`&Dlo("$A0xB")`,#16
        mov             $inner,$num
        vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
        add             $tinptr,sp,#16
        vshr.u64        $temp,`&Dhi("$A0xB")`,#16
        vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`

        b       .LNEON_tail2

.align  4
.LNEON_1st:
        vmlal.u32       $A0xB,$Ni,${N0}[0]
         vld1.32        {$A0-$A3}, [$aptr]!
        vmlal.u32       $A1xB,$Ni,${N0}[1]
        subs            $inner,$inner,#8
        vmlal.u32       $A2xB,$Ni,${N1}[0]
        vmlal.u32       $A3xB,$Ni,${N1}[1]

        vmlal.u32       $A4xB,$Ni,${N2}[0]
         vld1.32        {$N0-$N1}, [$nptr]!
        vmlal.u32       $A5xB,$Ni,${N2}[1]
         vst1.64        {$A0xB-$A1xB}, [$toutptr,:256]!
        vmlal.u32       $A6xB,$Ni,${N3}[0]
        vmlal.u32       $A7xB,$Ni,${N3}[1]
         vst1.64        {$A2xB-$A3xB}, [$toutptr,:256]!

        vmull.u32       $A0xB,$Bi,${A0}[0]
         vld1.32        {$N2-$N3}, [$nptr]!
        vmull.u32       $A1xB,$Bi,${A0}[1]
         vst1.64        {$A4xB-$A5xB}, [$toutptr,:256]!
        vmull.u32       $A2xB,$Bi,${A1}[0]
        vmull.u32       $A3xB,$Bi,${A1}[1]
         vst1.64        {$A6xB-$A7xB}, [$toutptr,:256]!

        vmull.u32       $A4xB,$Bi,${A2}[0]
        vmull.u32       $A5xB,$Bi,${A2}[1]
        vmull.u32       $A6xB,$Bi,${A3}[0]
        vmull.u32       $A7xB,$Bi,${A3}[1]

        bne     .LNEON_1st

        vmlal.u32       $A0xB,$Ni,${N0}[0]
        add             $tinptr,sp,#16
        vmlal.u32       $A1xB,$Ni,${N0}[1]
        sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
        vmlal.u32       $A2xB,$Ni,${N1}[0]
         vld1.64        {$Temp}, [sp,:128]
        vmlal.u32       $A3xB,$Ni,${N1}[1]
        sub             $outer,$num,#1

        vmlal.u32       $A4xB,$Ni,${N2}[0]
        vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
        vmlal.u32       $A5xB,$Ni,${N2}[1]
        vshr.u64        $temp,$temp,#16
         vld1.64        {$A0xB},       [$tinptr, :128]!
        vmlal.u32       $A6xB,$Ni,${N3}[0]
        vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
        vmlal.u32       $A7xB,$Ni,${N3}[1]

        vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
        vadd.u64        $temp,$temp,`&Dhi("$Temp")`
        veor            $Z,$Z,$Z
        vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
         vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
        vst1.64         {$Z},          [$toutptr,:128]
        vshr.u64        $temp,$temp,#16

        b               .LNEON_outer

.align  4
.LNEON_outer:
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        sub             $nptr,$nptr,$num,lsl#2          @ rewind $nptr
        vld1.32         {$A0-$A3},  [$aptr]!
        veor            $zero,$zero,$zero
        mov             $toutptr,sp
        vzip.16         $Bi,$zero
        sub             $inner,$num,#8
        vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

        vmlal.u32       $A0xB,$Bi,${A0}[0]
         vld1.64        {$A3xB-$A4xB},[$tinptr,:256]!
        vmlal.u32       $A1xB,$Bi,${A0}[1]
        vmlal.u32       $A2xB,$Bi,${A1}[0]
         vld1.64        {$A5xB-$A6xB},[$tinptr,:256]!
        vmlal.u32       $A3xB,$Bi,${A1}[1]

        vshl.i64        $temp,`&Dhi("$A0xB")`,#16
        veor            $zero,$zero,$zero
        vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
         vld1.64        {$A7xB},[$tinptr,:128]!
        vmul.u32        $Ni,$temp,$M0

        vmlal.u32       $A4xB,$Bi,${A2}[0]
         vld1.32        {$N0-$N3}, [$nptr]!
        vmlal.u32       $A5xB,$Bi,${A2}[1]
        vmlal.u32       $A6xB,$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmlal.u32       $A7xB,$Bi,${A3}[1]

.LNEON_inner:
        vmlal.u32       $A0xB,$Ni,${N0}[0]
         vld1.32        {$A0-$A3}, [$aptr]!
        vmlal.u32       $A1xB,$Ni,${N0}[1]
         subs           $inner,$inner,#8
        vmlal.u32       $A2xB,$Ni,${N1}[0]
        vmlal.u32       $A3xB,$Ni,${N1}[1]
        vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!

        vmlal.u32       $A4xB,$Ni,${N2}[0]
         vld1.64        {$A0xB},       [$tinptr, :128]!
        vmlal.u32       $A5xB,$Ni,${N2}[1]
        vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
        vmlal.u32       $A6xB,$Ni,${N3}[0]
         vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
        vmlal.u32       $A7xB,$Ni,${N3}[1]
        vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!

        vmlal.u32       $A0xB,$Bi,${A0}[0]
         vld1.64        {$A3xB-$A4xB}, [$tinptr, :256]!
        vmlal.u32       $A1xB,$Bi,${A0}[1]
        vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
        vmlal.u32       $A2xB,$Bi,${A1}[0]
         vld1.64        {$A5xB-$A6xB}, [$tinptr, :256]!
        vmlal.u32       $A3xB,$Bi,${A1}[1]
         vld1.32        {$N0-$N3}, [$nptr]!

        vmlal.u32       $A4xB,$Bi,${A2}[0]
         vld1.64        {$A7xB},       [$tinptr, :128]!
        vmlal.u32       $A5xB,$Bi,${A2}[1]
        vmlal.u32       $A6xB,$Bi,${A3}[0]
        vmlal.u32       $A7xB,$Bi,${A3}[1]

        bne     .LNEON_inner

        vmlal.u32       $A0xB,$Ni,${N0}[0]
        add             $tinptr,sp,#16
        vmlal.u32       $A1xB,$Ni,${N0}[1]
        sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
        vmlal.u32       $A2xB,$Ni,${N1}[0]
         vld1.64        {$Temp}, [sp,:128]
        vmlal.u32       $A3xB,$Ni,${N1}[1]
        subs            $outer,$outer,#1

        vmlal.u32       $A4xB,$Ni,${N2}[0]
        vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
        vmlal.u32       $A5xB,$Ni,${N2}[1]
         vld1.64        {$A0xB},       [$tinptr, :128]!
        vshr.u64        $temp,$temp,#16
        vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
        vmlal.u32       $A6xB,$Ni,${N3}[0]
         vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
        vmlal.u32       $A7xB,$Ni,${N3}[1]

        vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
        vadd.u64        $temp,$temp,`&Dhi("$Temp")`
        vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
        vshr.u64        $temp,$temp,#16

        bne     .LNEON_outer

        mov             $toutptr,sp
        mov             $inner,$num

.LNEON_tail:
        vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
        vld1.64         {$A3xB-$A4xB}, [$tinptr, :256]!
        vshr.u64        $temp,`&Dlo("$A0xB")`,#16
        vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
        vld1.64         {$A5xB-$A6xB}, [$tinptr, :256]!
        vshr.u64        $temp,`&Dhi("$A0xB")`,#16
        vld1.64         {$A7xB},       [$tinptr, :128]!
        vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
        vadd.u64        `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
        vst1.32         {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A1xB")`,#16
        vadd.u64        `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
        vshr.u64        $temp,`&Dhi("$A1xB")`,#16
        vzip.16         `&Dlo("$A1xB")`,`&Dhi("$A1xB")`

        vadd.u64        `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
        vst1.32         {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A2xB")`,#16
        vadd.u64        `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
        vshr.u64        $temp,`&Dhi("$A2xB")`,#16
        vzip.16         `&Dlo("$A2xB")`,`&Dhi("$A2xB")`

        vadd.u64        `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
        vst1.32         {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A3xB")`,#16
        vadd.u64        `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
        vshr.u64        $temp,`&Dhi("$A3xB")`,#16
        vzip.16         `&Dlo("$A3xB")`,`&Dhi("$A3xB")`

        vadd.u64        `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
        vst1.32         {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A4xB")`,#16
        vadd.u64        `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
        vshr.u64        $temp,`&Dhi("$A4xB")`,#16
        vzip.16         `&Dlo("$A4xB")`,`&Dhi("$A4xB")`

        vadd.u64        `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
        vst1.32         {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A5xB")`,#16
        vadd.u64        `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
        vshr.u64        $temp,`&Dhi("$A5xB")`,#16
        vzip.16         `&Dlo("$A5xB")`,`&Dhi("$A5xB")`

        vadd.u64        `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
        vst1.32         {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A6xB")`,#16
        vadd.u64        `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
        vld1.64         {$A0xB}, [$tinptr, :128]!
        vshr.u64        $temp,`&Dhi("$A6xB")`,#16
        vzip.16         `&Dlo("$A6xB")`,`&Dhi("$A6xB")`

        vadd.u64        `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
        vst1.32         {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
        vshr.u64        $temp,`&Dlo("$A7xB")`,#16
        vadd.u64        `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
        vld1.64         {$A1xB-$A2xB},  [$tinptr, :256]!
        vshr.u64        $temp,`&Dhi("$A7xB")`,#16
        vzip.16         `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
        subs            $inner,$inner,#8
        vst1.32         {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

        bne     .LNEON_tail

        vst1.32 {${temp}[0]}, [$toutptr, :32]           @ top-most bit
        sub     $nptr,$nptr,$num,lsl#2                  @ rewind $nptr
        subs    $aptr,sp,#0                             @ clear carry flag
        add     $bptr,sp,$num,lsl#2

.LNEON_sub:
        ldmia   $aptr!, {r4-r7}
        ldmia   $nptr!, {r8-r11}
        sbcs    r8, r4,r8
        sbcs    r9, r5,r9
        sbcs    r10,r6,r10
        sbcs    r11,r7,r11
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_sub

        ldr     r10, [$aptr]                            @ load top-most bit
        veor    q0,q0,q0
        sub     r11,$bptr,sp                            @ this is num*4
        veor    q1,q1,q1
        mov     $aptr,sp
        sub     $rptr,$rptr,r11                         @ rewind $rptr
        mov     $nptr,$bptr                             @ second 3/4th of frame
        sbcs    r10,r10,#0                              @ result is carry flag

.LNEON_copy_n_zap:
        ldmia   $aptr!, {r4-r7}
        ldmia   $rptr,  {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        movcc   r11,r7
        ldmia   $aptr, {r4-r7}
        stmia   $rptr!, {r8-r11}
        sub     $aptr,$aptr,#16
        ldmia   $rptr, {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [$aptr,:256]!                  @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        movcc   r11,r7
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_copy_n_zap

        sub     sp,ip,#96
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
        ret                                             @ bx lr
.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx   lr/gm;
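# Note the ordering of the two substitutions above: pre-existing "bx lr"
# (on the one path that must assemble as pure ARMv4) is encoded as a raw
# opcode word first; only then is "ret" rewritten into "bx lr", which is
# fine where it appears since those spots are either guarded by
# __ARM_ARCH__>=5 or sit after the .arch armv7-a directive.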
print $code;
close STDOUT;