#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65% to +35%
# improvement (depending on key length; less for longer keys) on
# ARM920T, and +115% to +80% on Intel IXP425. This is compared to the
# pre-bn_mul_mont code base and compiler-generated code with inlined
# umull and even umlal instructions. The latter means that this code
# didn't really have the "advantage" of utilizing some "secret"
# instruction.
#
# The code is interoperable with Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it's
# exclusively about decorations; the ABI and instruction syntax are
# identical.

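# For orientation, the integer-only path below is a textbook word-by-
# word Montgomery multiplication. A rough sketch of what
# bn_mul_mont(rp,ap,bp,np,n0,num) computes (descriptive comment added
# for clarity, not part of the generated code; R = 2^(32*num), and n0
# points at -np[0]^-1 mod 2^32):
#
#	// tp[0..num] is a scratch vector allocated on the stack
#	for (i = 0; i < num; i++) {
#		m = (tp[0] + ap[0]*bp[i]) * n0;	// mod 2^32
#		tp[] += ap[]*bp[i] + np[]*m;	// .L1st, .Louter/.Linner
#		tp[] >>= 32;			// low word is now zero
#	}
#	// 0 <= tp < 2*np, so one conditional subtraction suffices
#	rp[] = (tp >= np) ? tp[] - np[] : tp[];	// .Lsub/.Lcopy
#	// result: rp[] = ap[]*bp[]/R mod np[]
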
# November 2013
#
# Add a NEON code path, which handles lengths divisible by 8. The
# RSA/DSA performance improvement on Cortex-A8 is ~45-100% depending
# on key length, more for longer keys. On Cortex-A15 the span is
# ~10-105%. On Snapdragon S4 the improvement was measured to vary
# from ~70% to an incredible ~380% (yes, 4.8x faster) for RSA4096
# sign. But this is rather because the original integer-only code
# seems to perform suboptimally on S4. The situation on Cortex-A9 is
# unfortunately different. It's being looked into, but the trouble is
# that performance for vectors longer than 256 bits is actually a
# couple of percent worse than for the integer-only code. The NEON
# code is nevertheless chosen for execution on all NEON-capable
# processors, because the gain on others outweighs the marginal loss
# on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with the November 2013 improvements,
# i.e. the NEON code is now ~20-105% faster than the integer-only one
# on this processor. This optimization further improved performance
# even on other processors: the NEON code path is ~45-180% faster
# than the original integer-only code on Cortex-A8, ~10-210% on
# Cortex-A15, and ~70-450% on Snapdragon S4.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

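# The script is driven in the usual perlasm way; an illustrative
# invocation (flavour names are interpreted by arm-xlate.pl and the
# build system, "linux32" is just an example):
#
#	perl armv4-mont.pl linux32 armv4-mont.S
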
$num="r0";      # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########     # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";     # sl, gcc uses it to keep @GOT
$ahi="r11";     # fp
$nlo="r12";     # ip
###########     # r13 is stack pointer
$nhi="r14";     # lr
###########     # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";     $_bpend=$_num;

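# Stack frame established by the prologue below (a descriptive sketch
# added for clarity, listed from sp towards higher addresses):
#
#	tp[0] ... tp[num]	num+1 scratch words; &tp[num-1] ends up in $num
#	saved {r4-r12,lr}	10 words
#	saved {r0,r2}		rp and bp, accessed as $_rp/$_bp
#	caller's frame		&n0 and num, accessed as $_n0/$_num
#				($_num is reused as $_bpend once num is consumed)
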
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global bn_mul_mont
.type   bn_mul_mont,%function

.align  5
bn_mul_mont:
.Lbn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu
        adr     r0,.Lbn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]
#ifdef  __APPLE__
        ldr     r0,[r0]
#endif
        tst     r0,#ARMV7_NEON          @ NEON available?
        ldmia   sp, {r0,r2}
        beq     .Lialu
        add     sp,sp,#8
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     $num,ip                 @ load num
#ifdef  __thumb2__
        ittt    lt
#endif
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt

        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     $num,$num,lsl#2         @ rescale $num for byte count
        sub     sp,sp,$num              @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     $num,$num,#4            @ "num=num-1"
        add     $tp,$bp,$num            @ &bp[num-1]

        add     $num,sp,$num            @ $num to point at &tp[num-1]
        ldr     $n0,[$_n0]              @ &n0
        ldr     $bi,[$bp]               @ bp[0]
        ldr     $aj,[$ap],#4            @ ap[0],ap++
        ldr     $nj,[$np],#4            @ np[0],np++
        ldr     $n0,[$n0]               @ *n0
        str     $tp,[$_bpend]           @ save &bp[num]

        umull   $alo,$ahi,$aj,$bi       @ ap[0]*bp[0]
        str     $n0,[$_n0]              @ save n0 value
        mul     $n0,$alo,$n0            @ "tp[0]"*n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"t[0]"
        mov     $tp,sp

.L1st:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        mov     $alo,$ahi
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[0]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .L1st

        adds    $nlo,$nlo,$ahi
        ldr     $tp,[$_bp]              @ restore bp
        mov     $nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        mov     $tj,sp
        str     $nhi,[$num,#4]          @ tp[num]=
\f
.Louter:
        sub     $tj,$num,$tj            @ "original" $num-1 value
        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
        ldr     $bi,[$tp,#4]!           @ *(++bp)
        sub     $np,$np,$tj             @ "rewind" np to &np[1]
        ldr     $aj,[$ap,#-4]           @ ap[0]
        ldr     $alo,[sp]               @ tp[0]
        ldr     $nj,[$np,#-4]           @ np[0]
        ldr     $tj,[sp,#4]             @ tp[1]

        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[0]*bp[i]+tp[0]
        str     $tp,[$_bp]              @ save bp
        mul     $n0,$alo,$n0
        mov     $nlo,#0
        umlal   $alo,$nlo,$nj,$n0       @ np[0]*n0+"tp[0]"
        mov     $tp,sp

.Linner:
        ldr     $aj,[$ap],#4            @ ap[j],ap++
        adds    $alo,$ahi,$tj           @ +=tp[j]
        ldr     $nj,[$np],#4            @ np[j],np++
        mov     $ahi,#0
        umlal   $alo,$ahi,$aj,$bi       @ ap[j]*bp[i]
        mov     $nhi,#0
        umlal   $nlo,$nhi,$nj,$n0       @ np[j]*n0
        adc     $ahi,$ahi,#0
        ldr     $tj,[$tp,#8]            @ tp[j+1]
        adds    $nlo,$nlo,$alo
        str     $nlo,[$tp],#4           @ tp[j-1]=,tp++
        adc     $nlo,$nhi,#0
        cmp     $tp,$num
        bne     .Linner

        adds    $nlo,$nlo,$ahi
        mov     $nhi,#0
        ldr     $tp,[$_bp]              @ restore bp
        adc     $nhi,$nhi,#0
        ldr     $n0,[$_n0]              @ restore n0
        adds    $nlo,$nlo,$tj
        ldr     $tj,[$_bpend]           @ restore &bp[num]
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
        str     $nhi,[$num,#4]          @ tp[num]=

        cmp     $tp,$tj
#ifdef  __thumb2__
        itt     ne
#endif
        movne   $tj,sp
        bne     .Louter
\f
        ldr     $rp,[$_rp]              @ pull rp
        mov     $aj,sp
        add     $num,$num,#4            @ $num to point at &tp[num]
        sub     $aj,$num,$aj            @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
        mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]

        subs    $tj,$tj,$tj             @ "clear" carry flag
.Lsub:  ldr     $tj,[$tp],#4
        ldr     $nj,[$np],#4
        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
        str     $tj,[$rp],#4            @ rp[j]=
        teq     $tp,$num                @ preserve carry
        bne     .Lsub
        sbcs    $nhi,$nhi,#0            @ upmost carry
        mov     $tp,sp                  @ "rewind" $tp
        sub     $rp,$rp,$aj             @ "rewind" $rp

.Lcopy: ldr     $tj,[$tp]               @ conditional copy
        ldr     $aj,[$rp]
        str     sp,[$tp],#4             @ zap tp
#ifdef  __thumb2__
        it      cc
#endif
        movcc   $aj,$tj
        str     $aj,[$rp],#4
        teq     $tp,$num                @ preserve carry
        bne     .Lcopy

        mov     sp,$num
        add     sp,sp,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        ret                             @ bx lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

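# A note on the NEON strategy below (descriptive comment added for
# clarity): each 32-bit word of b[] and of the Montgomery factor is
# split into 16-bit halves with vzip.16 against a zero register, so
# every vmlal.u32 accumulates a 32x16-bit product of at most 48 bits
# into a 64-bit lane. The ~16 bits of headroom allow many partial
# products to be summed without carry propagation; carries are
# propagated in 16-bit steps, mostly deferred to the .LNEON_tail loop.
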
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        mov     ip,sp
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block
        mov     ip,sp

        cmp     $num,#8
        bhi     .LNEON_8n

        @ special case for $num==8, everything is in register bank...

        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        sub             $toutptr,sp,$num,lsl#4
        vld1.32         {$A0-$A3},  [$aptr]!            @ can't specify :32 :-(
        and             $toutptr,$toutptr,#-64
        vld1.32         {${M0}[0]}, [$n0,:32]
        mov             sp,$toutptr                     @ alloca
        vzip.16         $Bi,$zero

        vmull.u32       @ACC[0],$Bi,${A0}[0]
        vmull.u32       @ACC[1],$Bi,${A0}[1]
        vmull.u32       @ACC[2],$Bi,${A1}[0]
        vshl.i64        $Ni,@ACC[0]#hi,#16
        vmull.u32       @ACC[3],$Bi,${A1}[1]

        vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
        vmul.u32        $Ni,$Ni,$M0

        vmull.u32       @ACC[4],$Bi,${A2}[0]
         vld1.32        {$N0-$N3}, [$nptr]!
        vmull.u32       @ACC[5],$Bi,${A2}[1]
        vmull.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmull.u32       @ACC[7],$Bi,${A3}[1]

        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        sub             $outer,$num,#1
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]

        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmov            $Temp,@ACC[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmov            @ACC[0],@ACC[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmov            @ACC[1],@ACC[2]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vmov            @ACC[2],@ACC[3]
        vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
        vmov            @ACC[4],@ACC[5]
        vmov            @ACC[5],@ACC[6]
        vadd.u64        $temp,$temp,$Temp#hi
        vmov            @ACC[6],@ACC[7]
        veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16

        b       .LNEON_outer8

.align  4
.LNEON_outer8:
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp

        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        vshl.i64        $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]

        vadd.u64        $Ni,$Ni,@ACC[0]#lo
        veor            $zero,$zero,$zero
        subs            $outer,$outer,#1
        vmul.u32        $Ni,$Ni,$M0

        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vzip.16         $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]

        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]

        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmov            $Temp,@ACC[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmov            @ACC[0],@ACC[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmov            @ACC[1],@ACC[2]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vmov            @ACC[2],@ACC[3]
        vmov            @ACC[3],@ACC[4]
        vshr.u64        $temp,$temp,#16
        vmov            @ACC[4],@ACC[5]
        vmov            @ACC[5],@ACC[6]
        vadd.u64        $temp,$temp,$Temp#hi
        vmov            @ACC[6],@ACC[7]
        veor            @ACC[7],@ACC[7]
        vshr.u64        $temp,$temp,#16

        bne     .LNEON_outer8

        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
        mov             $toutptr,sp
        vshr.u64        $temp,@ACC[0]#lo,#16
        mov             $inner,$num
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        add             $tinptr,sp,#96
        vshr.u64        $temp,@ACC[0]#hi,#16
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

        b       .LNEON_tail_entry

.align  4
.LNEON_8n:
        veor            @ACC[0],@ACC[0],@ACC[0]
         sub            $toutptr,sp,#128
        veor            @ACC[1],@ACC[1],@ACC[1]
         sub            $toutptr,$toutptr,$num,lsl#4
        veor            @ACC[2],@ACC[2],@ACC[2]
         and            $toutptr,$toutptr,#-64
        veor            @ACC[3],@ACC[3],@ACC[3]
         mov            sp,$toutptr                     @ alloca
        veor            @ACC[4],@ACC[4],@ACC[4]
         add            $toutptr,$toutptr,#256
        veor            @ACC[5],@ACC[5],@ACC[5]
         sub            $inner,$num,#8
        veor            @ACC[6],@ACC[6],@ACC[6]
        veor            @ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
        vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
        subs            $inner,$inner,#8
        vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
        vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
        vst1.64         {@ACC[6]-@ACC[7]},[$toutptr,:256]!
        bne             .LNEON_8n_init

        add             $tinptr,sp,#256
        vld1.32         {$A0-$A3},[$aptr]!
        add             $bnptr,sp,#8
        vld1.32         {${M0}[0]},[$n0,:32]
        mov             $outer,$num
        b               .LNEON_8n_outer

.align  4
.LNEON_8n_outer:
        vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
        veor            $zero,$zero,$zero
        vzip.16         $Bi,$zero
        add             $toutptr,sp,#128
        vld1.32         {$N0-$N3},[$nptr]!

        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
         veor           $zero,$zero,$zero
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
         vshl.i64       $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
         vadd.u64       $Ni,$Ni,@ACC[0]#lo
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
         vmul.u32       $Ni,$Ni,$M0
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vst1.32         {$Bi},[sp,:64]          @ put aside smashed b[8*i+0]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
         vzip.16        $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
        vld1.32         {${Bi}[0]},[$bptr,:32]! @ *b++
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        veor            $temp,$temp,$temp
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vzip.16         $Bi,$temp
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
         vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
         vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
        vst1.32         {$Ni},[$bnptr,:64]!     @ put aside smashed m[8*i+$i]
___
        push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]!
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
         veor           $zero,$zero,$zero
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
         vshl.i64       $Ni,@ACC[0]#hi,#16
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
         vadd.u64       $Ni,$Ni,@ACC[0]#lo
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
         vmul.u32       $Ni,$Ni,$M0
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vst1.32         {$Bi},[$bnptr,:64]!     @ put aside smashed b[8*i+$i]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
         vzip.16        $Ni,$zero
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
        vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vld1.32         {$A0-$A3},[$aptr]!
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
         vadd.u64       @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
         vshr.u64       @ACC[0]#lo,@ACC[0]#lo,#16
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
         vadd.u64       @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
        vst1.32         {$Ni},[$bnptr,:64]      @ put aside smashed m[8*i+$i]
        add             $bnptr,sp,#8            @ rewind
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        sub             $inner,$num,#8
        b               .LNEON_8n_inner

.align  4
.LNEON_8n_inner:
        subs            $inner,$inner,#8
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+0]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        vld1.32         {$N0-$N3},[$nptr]!
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
        it              ne
        addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        vld1.32         {$Bi},[$bnptr,:64]!     @ pull smashed b[8*i+$i]
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vmlal.u32       @ACC[7],$Ni,${N3}[1]
        vst1.64         {@ACC[0]},[$toutptr,:128]!
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        vmlal.u32       @ACC[0],$Bi,${A0}[0]
        vld1.64         {@ACC[7]},[$tinptr,:128]
        vmlal.u32       @ACC[1],$Bi,${A0}[1]
        vld1.32         {$Ni},[$bnptr,:64]!     @ pull smashed m[8*i+$i]
        vmlal.u32       @ACC[2],$Bi,${A1}[0]
        it              ne
        addne           $tinptr,$tinptr,#16     @ don't advance in last iteration
        vmlal.u32       @ACC[3],$Bi,${A1}[1]
        vmlal.u32       @ACC[4],$Bi,${A2}[0]
        vmlal.u32       @ACC[5],$Bi,${A2}[1]
        vmlal.u32       @ACC[6],$Bi,${A3}[0]
        vmlal.u32       @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
        it              eq
        subeq           $aptr,$aptr,$num,lsl#2  @ rewind
        vmlal.u32       @ACC[0],$Ni,${N0}[0]
        vld1.32         {$Bi},[sp,:64]          @ pull smashed b[8*i+0]
        vmlal.u32       @ACC[1],$Ni,${N0}[1]
        vld1.32         {$A0-$A3},[$aptr]!
        vmlal.u32       @ACC[2],$Ni,${N1}[0]
        add             $bnptr,sp,#8            @ rewind
        vmlal.u32       @ACC[3],$Ni,${N1}[1]
        vmlal.u32       @ACC[4],$Ni,${N2}[0]
        vmlal.u32       @ACC[5],$Ni,${N2}[1]
        vmlal.u32       @ACC[6],$Ni,${N3}[0]
        vst1.64         {@ACC[0]},[$toutptr,:128]!
        vmlal.u32       @ACC[7],$Ni,${N3}[1]

        bne             .LNEON_8n_inner
___
        push(@ACC,shift(@ACC));
$code.=<<___;
        add             $tinptr,sp,#128
        vst1.64         {@ACC[0]-@ACC[1]},[$toutptr,:256]!
        veor            q2,q2,q2                @ $N0-$N1
        vst1.64         {@ACC[2]-@ACC[3]},[$toutptr,:256]!
        veor            q3,q3,q3                @ $N2-$N3
        vst1.64         {@ACC[4]-@ACC[5]},[$toutptr,:256]!
        vst1.64         {@ACC[6]},[$toutptr,:128]

        subs            $outer,$outer,#8
        vld1.64         {@ACC[0]-@ACC[1]},[$tinptr,:256]!
        vld1.64         {@ACC[2]-@ACC[3]},[$tinptr,:256]!
        vld1.64         {@ACC[4]-@ACC[5]},[$tinptr,:256]!
        vld1.64         {@ACC[6]-@ACC[7]},[$tinptr,:256]!

        itt             ne
        subne           $nptr,$nptr,$num,lsl#2  @ rewind
        bne             .LNEON_8n_outer

        add             $toutptr,sp,#128
        vst1.64         {q2-q3}, [sp,:256]!     @ start wiping stack frame
        vshr.u64        $temp,@ACC[0]#lo,#16
        vst1.64         {q2-q3},[sp,:256]!
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        vst1.64         {q2-q3}, [sp,:256]!
        vshr.u64        $temp,@ACC[0]#hi,#16
        vst1.64         {q2-q3}, [sp,:256]!
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

        mov             $inner,$num
        b               .LNEON_tail_entry

.align  4
.LNEON_tail:
        vadd.u64        @ACC[0]#lo,@ACC[0]#lo,$temp
        vshr.u64        $temp,@ACC[0]#lo,#16
        vld1.64         {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
        vadd.u64        @ACC[0]#hi,@ACC[0]#hi,$temp
        vld1.64         {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
        vshr.u64        $temp,@ACC[0]#hi,#16
        vld1.64         {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
        vzip.16         @ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
        vadd.u64        @ACC[1]#lo,@ACC[1]#lo,$temp
        vst1.32         {@ACC[0]#lo[0]}, [$toutptr, :32]!
        vshr.u64        $temp,@ACC[1]#lo,#16
        vadd.u64        @ACC[1]#hi,@ACC[1]#hi,$temp
        vshr.u64        $temp,@ACC[1]#hi,#16
        vzip.16         @ACC[1]#lo,@ACC[1]#hi
___
        push(@ACC,shift(@ACC));
}
        push(@ACC,shift(@ACC));
$code.=<<___;
        vld1.64         {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
        subs            $inner,$inner,#8
        vst1.32         {@ACC[7]#lo[0]},   [$toutptr, :32]!
        bne     .LNEON_tail

        vst1.32 {${temp}[0]}, [$toutptr, :32]           @ top-most bit
        sub     $nptr,$nptr,$num,lsl#2                  @ rewind $nptr
        subs    $aptr,sp,#0                             @ clear carry flag
        add     $bptr,sp,$num,lsl#2

.LNEON_sub:
        ldmia   $aptr!, {r4-r7}
        ldmia   $nptr!, {r8-r11}
        sbcs    r8, r4,r8
        sbcs    r9, r5,r9
        sbcs    r10,r6,r10
        sbcs    r11,r7,r11
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_sub

        ldr     r10, [$aptr]                            @ load top-most bit
        mov     r11,sp
        veor    q0,q0,q0
        sub     r11,$bptr,r11                           @ this is num*4
        veor    q1,q1,q1
        mov     $aptr,sp
        sub     $rptr,$rptr,r11                         @ rewind $rptr
        mov     $nptr,$bptr                             @ second 3/4th of frame
        sbcs    r10,r10,#0                              @ result is carry flag

.LNEON_copy_n_zap:
        ldmia   $aptr!, {r4-r7}
        ldmia   $rptr,  {r8-r11}
        it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        it      cc
        movcc   r11,r7
        ldmia   $aptr, {r4-r7}
        stmia   $rptr!, {r8-r11}
        sub     $aptr,$aptr,#16
        ldmia   $rptr, {r8-r11}
        it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$aptr,:256]!                  @ wipe
        itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
        it      cc
        movcc   r11,r7
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_copy_n_zap

        mov     sp,ip
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
        ret                                             @ bx lr
.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge        or
        s/\bret\b/bx    lr/g                                            or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/g;      # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT;