ARMv4 assembly pack: implement support for Thumb2.
[openssl.git] / crypto / bn / asm / armv4-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # January 2007.
11
12 # Montgomery multiplication for ARMv4.
13 #
14 # Performance improvement naturally varies among CPU implementations
15 # and compilers. The code was observed to provide +65-35% improvement
16 # [depending on key length, less for longer keys] on ARM920T, and
17 # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
18 # base and compiler generated code with in-lined umull and even umlal
19 # instructions. The latter means that this code didn't really have an 
20 # "advantage" of utilizing some "secret" instruction.
21 #
22 # The code is interoperable with Thumb ISA and is rather compact, less
23 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
24 # about decorations, ABI and instruction syntax are identical.
25
26 # November 2013
27 #
28 # Add NEON code path, which handles lengths divisible by 8. RSA/DSA
29 # performance improvement on Cortex-A8 is ~45-100% depending on key
30 # length, more for longer keys. On Cortex-A15 the span is ~10-105%.
31 # On Snapdragon S4 improvement was measured to vary from ~70% to
32 # incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
33 # rather because original integer-only code seems to perform
34 # suboptimally on S4. Situation on Cortex-A9 is unfortunately
35 # different. It's being looked into, but the trouble is that
36 # performance for vectors longer than 256 bits is actually couple
37 # of percent worse than for integer-only code. The code is chosen
38 # for execution on all NEON-capable processors, because gain on
39 # others outweighs the marginal loss on Cortex-A9.
40
# Command line: [flavour] output-file.  An argument that looks like a file
# name (word characters, a dot and an extension) is taken as the output
# file; a leading argument that does not is the perlasm "flavour".
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Pipe the generated code through arm-xlate.pl, looked up next to this
    # script first and then in ../../perlasm/.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly if the translator cannot be started.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to the output file.  When no
    # output file was named, leave STDOUT as inherited rather than trying
    # to open an empty file name.
    if ($output) {
        open STDOUT,">$output" or die "can't open $output: $!";
    }
}
55
# Register assignment for the integer-only path.  r9 (platform specific
# under ELF, e.g. TLS pointer), r13 (stack pointer) and r15 (program
# counter) are deliberately not used.
($num,$ap,$np) = ("r0","r1","r3");      # $num starts as the num argument, later holds &tp[num-1]
$bp = $bi = $rp = "r2";                 # one register, three names
($tp,$aj,$nj,$tj,$n0) = map("r$_",(4..8));
($alo,$ahi,$nlo,$nhi) = ("r10","r11","r12","r14");      # sl, fp, ip, lr

# Argument block slots, addressed relative to $num once it points at
# &tp[num-1].  ap and np have no slot: they stay in r1 and r3 throughout.
($_rp,$_bp,$_n0,$_num) = map("$num,#$_*4",(12..15));
$_bpend = $_num;                        # same slot as $_num
80
# bn_mul_mont, integer-only path.
#
# On entry r0-r3 hold rp, ap, bp and np; the n0 pointer and num are read
# from the caller's stack.  Returns 0 in r0 without writing rp when
# num < 2, and 1 otherwise.  When built with __ARM_MAX_ARCH__>=7, a call
# with num divisible by 8 is handed to bn_mul8x_mont_neon if bit 0 of
# OPENSSL_armcap_P is set.
#
# Stack frame, lowest address first: tp[0..num] (num+1 words), saved
# r4-r12,lr, saved {r0,r2} (rp and bp), then the caller's n0 pointer and
# num.  The slot that held num is reused for &bp[num-1], which marks the
# end of the outer loop.
#
# The last step writes tp-np to rp and then keeps either that difference
# or tp itself.  The choice is made word by word with a conditional move,
# so both buffers are read on every iteration and the memory access
# pattern does not depend on the outcome of the subtraction.
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
#else
.code   32
#endif

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global bn_mul_mont
.type   bn_mul_mont,%function

.align  5
bn_mul_mont:
.Lbn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu
        adr     r0,.Lbn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]
#ifdef  __APPLE__
        ldr     r0,[r0]
#endif
        tst     r0,#1                   @ NEON available?
        ldmia   sp, {r0,r2}
        beq     .Lialu
        add     sp,sp,#8
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     $num,ip                 @ load num
#ifdef  __thumb2__
        ittt    lt
#endif
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt

        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     $num,$num,lsl#2         @ rescale $num for byte count
        sub     sp,sp,$num              @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     $num,$num,#4            @ "num=num-1"
        add     $tp,$bp,$num            @ &bp[num-1]

        add     $num,sp,$num            @ $num to point at &tp[num-1]
        ldr     $n0,[$_n0]              @ &n0
        ldr     $bi,[$bp]               @ bp[0]
        ldr     $aj,[$ap],#4            @ ap[0],ap++
        ldr     $nj,[$np],#4            @ np[0],np++
        ldr     $n0,[$n0]               @ *n0
        str     $tp,[$_bpend]           @ save &bp[num-1]

        umull   $alo,$ahi,$aj,$bi       @ ap[0]*bp[0]
        str     $n0,[$_n0]              @ save n0 value
        mul     $n0,$alo,$n0            @ "tp[0]"*n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"t[0]"
        mov     $tp,sp

.L1st:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        mov     $alo,$ahi
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[0]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .L1st

        adds    $nlo,$nlo,$ahi
        ldr     $tp,[$_bp]              @ restore bp
        mov     $nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        mov     $tj,sp
        str     $nhi,[$num,#4]          @ tp[num]=
\f
.Louter:
        sub     $tj,$num,$tj            @ "original" $num-1 value
        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
        ldr     $bi,[$tp,#4]!           @ *(++bp)
        sub     $np,$np,$tj             @ "rewind" np to &np[1]
        ldr     $aj,[$ap,#-4]           @ ap[0]
        ldr     $alo,[sp]               @ tp[0]
        ldr     $nj,[$np,#-4]           @ np[0]
        ldr     $tj,[sp,#4]             @ tp[1]

        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[0]*bp[i]+tp[0]
        str     $tp,[$_bp]              @ save bp
        mul     $n0,$alo,$n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
        mov     $tp,sp

.Linner:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        adds    $alo,$ahi,$tj           @ +=tp[j]
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[i]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adc     $ahi,$ahi,#0
        ldr     $tj,[$tp,#8]            @ tp[j+1]
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .Linner

        adds    $nlo,$nlo,$ahi
        mov     $nhi,#0
        ldr     $tp,[$_bp]              @ restore bp
        adc     $nhi,$nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adds    $nlo,$nlo,$tj
        ldr     $tj,[$_bpend]           @ restore &bp[num-1]
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        str     $nhi,[$num,#4]          @ tp[num]=

        cmp     $tp,$tj
#ifdef  __thumb2__
        itt     ne
#endif
        movne   $tj,sp
        bne     .Louter
\f
        ldr     $rp,[$_rp]              @ pull rp
        mov     $aj,sp
        add     $num,$num,#4            @ $num to point at &tp[num]
        sub     $aj,$num,$aj            @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
        mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]

        subs    $tj,$tj,$tj             @ "clear" carry flag
.Lsub:  ldr     $tj,[$tp],#4
        ldr     $nj,[$np],#4
        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
        str     $tj,[$rp],#4            @ rp[j]=
        teq     $tp,$num                @ preserve carry
        bne     .Lsub
        sbcs    $nhi,$nhi,#0            @ upmost carry
        mov     $tp,sp                  @ "rewind" $tp
        sub     $rp,$rp,$aj             @ "rewind" $rp

        @ Carry is clear when the subtraction borrowed: tp is then the
        @ result and is copied over rp. Otherwise rp already holds tp-np
        @ and is written back unchanged. Both buffers are read on every
        @ iteration, whichever one is kept.
.Lcopy: ldr     $tj,[$tp]               @ conditional copy
        ldr     $aj,[$rp]
        str     sp,[$tp],#4             @ zap tp
#ifdef  __thumb2__
        it      cc
#endif
        movcc   $aj,$tj
        str     $aj,[$rp],#4
        teq     $tp,$num                @ preserve carry
        bne     .Lcopy

        mov     sp,$num
        add     sp,sp,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        ret                             @ bx lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
___
274 {
# Map a NEON quad register name ("qN") to the name of its low or high
# doubleword half: Dlo gives "d<2N>", Dhi gives "d<2N+1>".  Both return ""
# when the argument contains no q-register name.  No prototype is declared
# because each takes exactly one argument; the previous empty prototype
# only worked with the &-form of call.
sub Dlo     { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi     { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
277
# Register names used by the NEON template.
my ($A0,$A1,$A2,$A3) = ("d0","d1","d2","d3");           # loaded through $aptr
my ($N0,$N1,$N2,$N3) = ("d4","d5","d6","d7");           # loaded through $nptr
my $Z    = "q4";
my $Temp = "q5";
my ($A0xB,$A1xB,$A2xB,$A3xB) = ("q6","q7","q8","q9");   # 64-bit accumulators
my ($A4xB,$A5xB,$A6xB,$A7xB) = ("q10","q11","q12","q13");
my ($Bi,$Ni,$M0) = ("d28","d29","d30");
my ($zero,$temp) = map(&Dlo($_),($Z,$Temp));            # low halves of $Z and $Temp

my ($rptr,$aptr,$bptr,$nptr) = ("r0","r1","r2","r3");
my ($n0,$num) = ("r4","r5");
my ($tinptr,$toutptr,$inner,$outer) = ("r6","r7","r8","r9");
288
# bn_mul8x_mont_neon: NEON code path, emitted only when __ARM_MAX_ARCH__>=7.
# The symbol is not declared .global; bn_mul_mont branches here with r0-r3
# still holding its first four arguments.  The two remaining words of the
# parameter block are loaded from the caller's stack into r4-r5, which the
# template refers to as $n0 and $num.
#
# Outline of the template below:
#  - prologue: save r4-r11 and d8-d15, remember the resulting sp in ip and
#    carve out a 64-byte-aligned scratch frame of at least 16*num+16 bytes;
#  - num==8: the whole accumulator stays in q registers (.LNEON_outer8);
#  - otherwise .LNEON_1st handles the first word of b and
#    .LNEON_outer/.LNEON_inner the remaining ones, spilling accumulators to
#    the frame through $toutptr and reloading them through $tinptr;
#  - .LNEON_tail/.LNEON_tail2: propagate carries 16 bits at a time and
#    store 32-bit result words at the bottom of the frame;
#  - .LNEON_sub: rp = tp - np, with the borrow carried from word to word;
#  - .LNEON_copy_n_zap: when the subtraction borrowed (carry clear) copy tp
#    over rp using conditional moves, storing zeros over the frame as it
#    goes;
#  - epilogue: restore sp from ip, then d8-d15 and r4-r11.
#
# NOTE(review): the it/itt lines in .LNEON_copy_n_zap are not wrapped in
# #ifdef __thumb2__ the way they are in bn_mul_mont; presumably the
# assembler or arm-xlate.pl copes with them in ARM-mode builds - confirm.
289 $code.=<<___;
290 #if __ARM_MAX_ARCH__>=7
291 .arch   armv7-a
292 .fpu    neon
293
294 .type   bn_mul8x_mont_neon,%function
295 .align  5
296 bn_mul8x_mont_neon:
297         mov     ip,sp
298         stmdb   sp!,{r4-r11}
299         vstmdb  sp!,{d8-d15}            @ ABI specification says so
300         ldmia   ip,{r4-r5}              @ load rest of parameter block
301         mov     ip,sp
302
303         sub             $toutptr,sp,#16
304         vld1.32         {${Bi}[0]}, [$bptr,:32]!
305         sub             $toutptr,$toutptr,$num,lsl#4
306         vld1.32         {$A0-$A3},  [$aptr]!            @ can't specify :32 :-(
307         and             $toutptr,$toutptr,#-64
308         vld1.32         {${M0}[0]}, [$n0,:32]
309         mov             sp,$toutptr                     @ alloca
310         veor            $zero,$zero,$zero
311         subs            $inner,$num,#8
312         vzip.16         $Bi,$zero
313
314         vmull.u32       $A0xB,$Bi,${A0}[0]
315         vmull.u32       $A1xB,$Bi,${A0}[1]
316         vmull.u32       $A2xB,$Bi,${A1}[0]
317         vshl.i64        $temp,`&Dhi("$A0xB")`,#16
318         vmull.u32       $A3xB,$Bi,${A1}[1]
319
320         vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
321         veor            $zero,$zero,$zero
322         vmul.u32        $Ni,$temp,$M0
323
324         vmull.u32       $A4xB,$Bi,${A2}[0]
325          vld1.32        {$N0-$N3}, [$nptr]!
326         vmull.u32       $A5xB,$Bi,${A2}[1]
327         vmull.u32       $A6xB,$Bi,${A3}[0]
328         vzip.16         $Ni,$zero
329         vmull.u32       $A7xB,$Bi,${A3}[1]
330
331         bne     .LNEON_1st
332
333         @ special case for num=8, everything is in register bank...
334
335         vmlal.u32       $A0xB,$Ni,${N0}[0]
336         sub             $outer,$num,#1
337         vmlal.u32       $A1xB,$Ni,${N0}[1]
338         vmlal.u32       $A2xB,$Ni,${N1}[0]
339         vmlal.u32       $A3xB,$Ni,${N1}[1]
340
341         vmlal.u32       $A4xB,$Ni,${N2}[0]
342         vmov            $Temp,$A0xB
343         vmlal.u32       $A5xB,$Ni,${N2}[1]
344         vmov            $A0xB,$A1xB
345         vmlal.u32       $A6xB,$Ni,${N3}[0]
346         vmov            $A1xB,$A2xB
347         vmlal.u32       $A7xB,$Ni,${N3}[1]
348         vmov            $A2xB,$A3xB
349         vmov            $A3xB,$A4xB
350         vshr.u64        $temp,$temp,#16
351         vmov            $A4xB,$A5xB
352         vmov            $A5xB,$A6xB
353         vadd.u64        $temp,$temp,`&Dhi("$Temp")`
354         vmov            $A6xB,$A7xB
355         veor            $A7xB,$A7xB
356         vshr.u64        $temp,$temp,#16
357
358         b       .LNEON_outer8
359
360 .align  4
361 .LNEON_outer8:
362         vld1.32         {${Bi}[0]}, [$bptr,:32]!
363         veor            $zero,$zero,$zero
364         vzip.16         $Bi,$zero
365         vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
366
367         vmlal.u32       $A0xB,$Bi,${A0}[0]
368         vmlal.u32       $A1xB,$Bi,${A0}[1]
369         vmlal.u32       $A2xB,$Bi,${A1}[0]
370         vshl.i64        $temp,`&Dhi("$A0xB")`,#16
371         vmlal.u32       $A3xB,$Bi,${A1}[1]
372
373         vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
374         veor            $zero,$zero,$zero
375         subs            $outer,$outer,#1
376         vmul.u32        $Ni,$temp,$M0
377
378         vmlal.u32       $A4xB,$Bi,${A2}[0]
379         vmlal.u32       $A5xB,$Bi,${A2}[1]
380         vmlal.u32       $A6xB,$Bi,${A3}[0]
381         vzip.16         $Ni,$zero
382         vmlal.u32       $A7xB,$Bi,${A3}[1]
383
384         vmlal.u32       $A0xB,$Ni,${N0}[0]
385         vmlal.u32       $A1xB,$Ni,${N0}[1]
386         vmlal.u32       $A2xB,$Ni,${N1}[0]
387         vmlal.u32       $A3xB,$Ni,${N1}[1]
388
389         vmlal.u32       $A4xB,$Ni,${N2}[0]
390         vmov            $Temp,$A0xB
391         vmlal.u32       $A5xB,$Ni,${N2}[1]
392         vmov            $A0xB,$A1xB
393         vmlal.u32       $A6xB,$Ni,${N3}[0]
394         vmov            $A1xB,$A2xB
395         vmlal.u32       $A7xB,$Ni,${N3}[1]
396         vmov            $A2xB,$A3xB
397         vmov            $A3xB,$A4xB
398         vshr.u64        $temp,$temp,#16
399         vmov            $A4xB,$A5xB
400         vmov            $A5xB,$A6xB
401         vadd.u64        $temp,$temp,`&Dhi("$Temp")`
402         vmov            $A6xB,$A7xB
403         veor            $A7xB,$A7xB
404         vshr.u64        $temp,$temp,#16
405
406         bne     .LNEON_outer8
407
408         vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
409         mov             $toutptr,sp
410         vshr.u64        $temp,`&Dlo("$A0xB")`,#16
411         mov             $inner,$num
412         vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
413         add             $tinptr,sp,#16
414         vshr.u64        $temp,`&Dhi("$A0xB")`,#16
415         vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
416
417         b       .LNEON_tail2
418
419 .align  4
420 .LNEON_1st:
421         vmlal.u32       $A0xB,$Ni,${N0}[0]
422          vld1.32        {$A0-$A3}, [$aptr]!
423         vmlal.u32       $A1xB,$Ni,${N0}[1]
424         subs            $inner,$inner,#8
425         vmlal.u32       $A2xB,$Ni,${N1}[0]
426         vmlal.u32       $A3xB,$Ni,${N1}[1]
427
428         vmlal.u32       $A4xB,$Ni,${N2}[0]
429          vld1.32        {$N0-$N1}, [$nptr]!
430         vmlal.u32       $A5xB,$Ni,${N2}[1]
431          vst1.64        {$A0xB-$A1xB}, [$toutptr,:256]!
432         vmlal.u32       $A6xB,$Ni,${N3}[0]
433         vmlal.u32       $A7xB,$Ni,${N3}[1]
434          vst1.64        {$A2xB-$A3xB}, [$toutptr,:256]!
435
436         vmull.u32       $A0xB,$Bi,${A0}[0]
437          vld1.32        {$N2-$N3}, [$nptr]!
438         vmull.u32       $A1xB,$Bi,${A0}[1]
439          vst1.64        {$A4xB-$A5xB}, [$toutptr,:256]!
440         vmull.u32       $A2xB,$Bi,${A1}[0]
441         vmull.u32       $A3xB,$Bi,${A1}[1]
442          vst1.64        {$A6xB-$A7xB}, [$toutptr,:256]!
443
444         vmull.u32       $A4xB,$Bi,${A2}[0]
445         vmull.u32       $A5xB,$Bi,${A2}[1]
446         vmull.u32       $A6xB,$Bi,${A3}[0]
447         vmull.u32       $A7xB,$Bi,${A3}[1]
448
449         bne     .LNEON_1st
450
451         vmlal.u32       $A0xB,$Ni,${N0}[0]
452         add             $tinptr,sp,#16
453         vmlal.u32       $A1xB,$Ni,${N0}[1]
454         sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
455         vmlal.u32       $A2xB,$Ni,${N1}[0]
456          vld1.64        {$Temp}, [sp,:128]
457         vmlal.u32       $A3xB,$Ni,${N1}[1]
458         sub             $outer,$num,#1
459
460         vmlal.u32       $A4xB,$Ni,${N2}[0]
461         vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
462         vmlal.u32       $A5xB,$Ni,${N2}[1]
463         vshr.u64        $temp,$temp,#16
464          vld1.64        {$A0xB},       [$tinptr, :128]!
465         vmlal.u32       $A6xB,$Ni,${N3}[0]
466         vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
467         vmlal.u32       $A7xB,$Ni,${N3}[1]
468
469         vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
470         vadd.u64        $temp,$temp,`&Dhi("$Temp")`
471         veor            $Z,$Z,$Z
472         vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
473          vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
474         vst1.64         {$Z},          [$toutptr,:128]
475         vshr.u64        $temp,$temp,#16
476
477         b               .LNEON_outer
478
479 .align  4
480 .LNEON_outer:
481         vld1.32         {${Bi}[0]}, [$bptr,:32]!
482         sub             $nptr,$nptr,$num,lsl#2          @ rewind $nptr
483         vld1.32         {$A0-$A3},  [$aptr]!
484         veor            $zero,$zero,$zero
485         mov             $toutptr,sp
486         vzip.16         $Bi,$zero
487         sub             $inner,$num,#8
488         vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
489
490         vmlal.u32       $A0xB,$Bi,${A0}[0]
491          vld1.64        {$A3xB-$A4xB},[$tinptr,:256]!
492         vmlal.u32       $A1xB,$Bi,${A0}[1]
493         vmlal.u32       $A2xB,$Bi,${A1}[0]
494          vld1.64        {$A5xB-$A6xB},[$tinptr,:256]!
495         vmlal.u32       $A3xB,$Bi,${A1}[1]
496
497         vshl.i64        $temp,`&Dhi("$A0xB")`,#16
498         veor            $zero,$zero,$zero
499         vadd.u64        $temp,$temp,`&Dlo("$A0xB")`
500          vld1.64        {$A7xB},[$tinptr,:128]!
501         vmul.u32        $Ni,$temp,$M0
502
503         vmlal.u32       $A4xB,$Bi,${A2}[0]
504          vld1.32        {$N0-$N3}, [$nptr]!
505         vmlal.u32       $A5xB,$Bi,${A2}[1]
506         vmlal.u32       $A6xB,$Bi,${A3}[0]
507         vzip.16         $Ni,$zero
508         vmlal.u32       $A7xB,$Bi,${A3}[1]
509
510 .LNEON_inner:
511         vmlal.u32       $A0xB,$Ni,${N0}[0]
512          vld1.32        {$A0-$A3}, [$aptr]!
513         vmlal.u32       $A1xB,$Ni,${N0}[1]
514          subs           $inner,$inner,#8
515         vmlal.u32       $A2xB,$Ni,${N1}[0]
516         vmlal.u32       $A3xB,$Ni,${N1}[1]
517         vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
518
519         vmlal.u32       $A4xB,$Ni,${N2}[0]
520          vld1.64        {$A0xB},       [$tinptr, :128]!
521         vmlal.u32       $A5xB,$Ni,${N2}[1]
522         vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
523         vmlal.u32       $A6xB,$Ni,${N3}[0]
524          vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
525         vmlal.u32       $A7xB,$Ni,${N3}[1]
526         vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
527
528         vmlal.u32       $A0xB,$Bi,${A0}[0]
529          vld1.64        {$A3xB-$A4xB}, [$tinptr, :256]!
530         vmlal.u32       $A1xB,$Bi,${A0}[1]
531         vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
532         vmlal.u32       $A2xB,$Bi,${A1}[0]
533          vld1.64        {$A5xB-$A6xB}, [$tinptr, :256]!
534         vmlal.u32       $A3xB,$Bi,${A1}[1]
535          vld1.32        {$N0-$N3}, [$nptr]!
536
537         vmlal.u32       $A4xB,$Bi,${A2}[0]
538          vld1.64        {$A7xB},       [$tinptr, :128]!
539         vmlal.u32       $A5xB,$Bi,${A2}[1]
540         vmlal.u32       $A6xB,$Bi,${A3}[0]
541         vmlal.u32       $A7xB,$Bi,${A3}[1]
542
543         bne     .LNEON_inner
544
545         vmlal.u32       $A0xB,$Ni,${N0}[0]
546         add             $tinptr,sp,#16
547         vmlal.u32       $A1xB,$Ni,${N0}[1]
548         sub             $aptr,$aptr,$num,lsl#2          @ rewind $aptr
549         vmlal.u32       $A2xB,$Ni,${N1}[0]
550          vld1.64        {$Temp}, [sp,:128]
551         vmlal.u32       $A3xB,$Ni,${N1}[1]
552         subs            $outer,$outer,#1
553
554         vmlal.u32       $A4xB,$Ni,${N2}[0]
555         vst1.64         {$A0xB-$A1xB}, [$toutptr,:256]!
556         vmlal.u32       $A5xB,$Ni,${N2}[1]
557          vld1.64        {$A0xB},       [$tinptr, :128]!
558         vshr.u64        $temp,$temp,#16
559         vst1.64         {$A2xB-$A3xB}, [$toutptr,:256]!
560         vmlal.u32       $A6xB,$Ni,${N3}[0]
561          vld1.64        {$A1xB-$A2xB}, [$tinptr, :256]!
562         vmlal.u32       $A7xB,$Ni,${N3}[1]
563
564         vst1.64         {$A4xB-$A5xB}, [$toutptr,:256]!
565         vadd.u64        $temp,$temp,`&Dhi("$Temp")`
566         vst1.64         {$A6xB-$A7xB}, [$toutptr,:256]!
567         vshr.u64        $temp,$temp,#16
568
569         bne     .LNEON_outer
570
571         mov             $toutptr,sp
572         mov             $inner,$num
573
574 .LNEON_tail:
575         vadd.u64        `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
576         vld1.64         {$A3xB-$A4xB}, [$tinptr, :256]!
577         vshr.u64        $temp,`&Dlo("$A0xB")`,#16
578         vadd.u64        `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
579         vld1.64         {$A5xB-$A6xB}, [$tinptr, :256]!
580         vshr.u64        $temp,`&Dhi("$A0xB")`,#16
581         vld1.64         {$A7xB},       [$tinptr, :128]!
582         vzip.16         `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
583
584 .LNEON_tail2:
585         vadd.u64        `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
586         vst1.32         {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
587         vshr.u64        $temp,`&Dlo("$A1xB")`,#16
588         vadd.u64        `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
589         vshr.u64        $temp,`&Dhi("$A1xB")`,#16
590         vzip.16         `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
591
592         vadd.u64        `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
593         vst1.32         {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
594         vshr.u64        $temp,`&Dlo("$A2xB")`,#16
595         vadd.u64        `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
596         vshr.u64        $temp,`&Dhi("$A2xB")`,#16
597         vzip.16         `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
598
599         vadd.u64        `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
600         vst1.32         {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
601         vshr.u64        $temp,`&Dlo("$A3xB")`,#16
602         vadd.u64        `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
603         vshr.u64        $temp,`&Dhi("$A3xB")`,#16
604         vzip.16         `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
605
606         vadd.u64        `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
607         vst1.32         {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
608         vshr.u64        $temp,`&Dlo("$A4xB")`,#16
609         vadd.u64        `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
610         vshr.u64        $temp,`&Dhi("$A4xB")`,#16
611         vzip.16         `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
612
613         vadd.u64        `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
614         vst1.32         {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
615         vshr.u64        $temp,`&Dlo("$A5xB")`,#16
616         vadd.u64        `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
617         vshr.u64        $temp,`&Dhi("$A5xB")`,#16
618         vzip.16         `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
619
620         vadd.u64        `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
621         vst1.32         {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
622         vshr.u64        $temp,`&Dlo("$A6xB")`,#16
623         vadd.u64        `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
624         vld1.64         {$A0xB}, [$tinptr, :128]!
625         vshr.u64        $temp,`&Dhi("$A6xB")`,#16
626         vzip.16         `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
627
628         vadd.u64        `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
629         vst1.32         {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
630         vshr.u64        $temp,`&Dlo("$A7xB")`,#16
631         vadd.u64        `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
632         vld1.64         {$A1xB-$A2xB},  [$tinptr, :256]!
633         vshr.u64        $temp,`&Dhi("$A7xB")`,#16
634         vzip.16         `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
635         subs            $inner,$inner,#8
636         vst1.32         {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
637
638         bne     .LNEON_tail
639
640         vst1.32 {${temp}[0]}, [$toutptr, :32]           @ top-most bit
641         sub     $nptr,$nptr,$num,lsl#2                  @ rewind $nptr
642         subs    $aptr,sp,#0                             @ clear carry flag
643         add     $bptr,sp,$num,lsl#2
644
645 .LNEON_sub:
646         ldmia   $aptr!, {r4-r7}
647         ldmia   $nptr!, {r8-r11}
648         sbcs    r8, r4,r8
649         sbcs    r9, r5,r9
650         sbcs    r10,r6,r10
651         sbcs    r11,r7,r11
652         teq     $aptr,$bptr                             @ preserves carry
653         stmia   $rptr!, {r8-r11}
654         bne     .LNEON_sub
655
656         ldr     r10, [$aptr]                            @ load top-most bit
657         mov     r11,sp
658         veor    q0,q0,q0
659         sub     r11,$bptr,r11                           @ this is num*4
660         veor    q1,q1,q1
661         mov     $aptr,sp
662         sub     $rptr,$rptr,r11                         @ rewind $rptr
663         mov     $nptr,$bptr                             @ second 3/4th of frame
664         sbcs    r10,r10,#0                              @ result is carry flag
665
666 .LNEON_copy_n_zap:
667         ldmia   $aptr!, {r4-r7}
668         ldmia   $rptr,  {r8-r11}
669         it      cc
670         movcc   r8, r4
671         vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
672         itt     cc
673         movcc   r9, r5
674         movcc   r10,r6
675         vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
676         it      cc
677         movcc   r11,r7
678         ldmia   $aptr, {r4-r7}
679         stmia   $rptr!, {r8-r11}
680         sub     $aptr,$aptr,#16
681         ldmia   $rptr, {r8-r11}
682         it      cc
683         movcc   r8, r4
684         vst1.64 {q0-q1}, [$aptr,:256]!                  @ wipe
685         itt     cc
686         movcc   r9, r5
687         movcc   r10,r6
688         vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
689         it      cc
690         movcc   r11,r7
691         teq     $aptr,$bptr                             @ preserves carry
692         stmia   $rptr!, {r8-r11}
693         bne     .LNEON_copy_n_zap
694
695         mov     sp,ip
696         vldmia  sp!,{d8-d15}
697         ldmia   sp!,{r4-r11}
698         ret                                             @ bx lr
699 .size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
700 #endif
701 ___
702 }
# Trailer: identification string and, for __ARM_MAX_ARCH__>=7 builds, a
# common-symbol declaration of the capability word OPENSSL_armcap_P.
$code.=<<___;
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Post-process the accumulated template; the order of the steps matters:
#  1. evaluate each backtick-quoted span as Perl and splice in its value;
#  2. replace every literal "bx lr" with the instruction word;
#  3. expand the "ret" pseudo-instruction to "bx lr" - done last, so that
#     these stay symbolic instead of being turned into .word by step 2.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx   lr/gm;
print $code;
# close() flushes the output and, when STDOUT is the pipe to arm-xlate.pl,
# waits for that process; a write error or a failing translator must abort
# the build instead of leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";