modes/asm/ghashv8-armx.pl: implement 4x aggregate factor.
[openssl.git] / crypto / modes / asm / ghashv8-armx.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
18 #
19 # June 2014
20 #
21 # Initial version was developed in tight cooperation with Ard
22 # Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
23 # Just like aesv8-armx.pl this module supports both AArch32 and
24 # AArch64 execution modes.
25 #
26 # July 2014
27 #
28 # Implement 2x aggregated reduction [see ghash-x86.pl for background
29 # information].
30 #
31 # November 2017
32 #
33 # AArch64 register bank to "accommodate" 4x aggregated reduction...
34 #
35 # Current performance in cycles per processed byte:
36 #
37 #               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
38 # Apple A7                      0.92            5.62
39 # Cortex-A53                    1.01            8.39
40 # Cortex-A57                    1.17            7.61
41 # Denver                        0.71            6.02
42 # Mongoose                      1.10            8.06
43 # Kryo                          1.16            8.00
44 #
45 # (*)   presented for reference/comparison purposes;
46
47 $flavour = shift;       # 1st argument: target flavour; names matching /64/ select the 64-bit code paths
48 $output  = shift;       # 2nd argument: handed on to arm-xlate.pl unchanged
49 # Locate arm-xlate.pl next to this script or under ../../perlasm relative to it.
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
53 die "can't locate arm-xlate.pl";
54 # Everything printed to STDOUT from here on is piped through arm-xlate.pl; die if the pipe cannot be started.
55 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
56 *STDOUT=*OUT;
57
58 $Xi="x0";       # argument block
59 $Htbl="x1";     # 2nd argument: Htable, as filled in by gcm_init_v8
60 $inp="x2";      # 3rd argument of gcm_ghash_v8: input pointer
61 $len="x3";      # 4th argument of gcm_ghash_v8: input length in bytes
62 # $inc is the post-increment applied to $inp inside gcm_ghash_v8.
63 $inc="x12";
64 # The x<N>/q<N> names used below are rewritten per flavour when $code is printed at the end of the file.
65 {
66 my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));     # low/middle/high product accumulators and the current input block
67 my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));       # temporaries, 0xc2.0 constant, twisted H, packed Karatsuba halves, H^2
68 # Common prologue of the generated assembly; the .arch or .fpu/.code lines depend on the flavour.
69 $code=<<___;
70 #include "arm_arch.h"
71
72 .text
73 ___
74 $code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
75 $code.=<<___                            if ($flavour !~ /64/);
76 .fpu    neon
77 .code   32
78 #undef  __thumb2__
79 ___
80
81 ################################################################################
82 # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
83 # Appends the gcm_init_v8 routine to $code.
84 # input:        128-bit H - secret parameter E(K,0^128)
85 # output:       precomputed table filled with degrees of twisted H;
86 #               H is twisted to handle reverse bitness of GHASH;
87 #               only few of 16 slots of Htable[16] are used;
88 #               data is opaque to outside world (which allows to
89 #               optimize the code independently);
90 # Slots stored below: [0]=H, [1]=packed Karatsuba halves of H and H^2, [2]=H^2; 64-bit flavours add [3]=H^3, [4]=packed halves of H^3 and H^4, [5]=H^4.
91 $code.=<<___;
92 .global gcm_init_v8
93 .type   gcm_init_v8,%function
94 .align  4
95 gcm_init_v8:
96         vld1.64         {$t1},[x1]              @ load input H
97         vmov.i8         $xC2,#0xe1
98         vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
99         vext.8          $IN,$t1,$t1,#8
100         vshr.u64        $t2,$xC2,#63
101         vdup.32         $t1,${t1}[1]
102         vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
103         vshr.u64        $t2,$IN,#63
104         vshr.s32        $t1,$t1,#31             @ broadcast carry bit
105         vand            $t2,$t2,$t0
106         vshl.i64        $IN,$IN,#1
107         vext.8          $t2,$t2,$t2,#8
108         vand            $t0,$t0,$t1
109         vorr            $IN,$IN,$t2             @ H<<<=1
110         veor            $H,$IN,$t0              @ twisted H
111         vst1.64         {$H},[x0],#16           @ store Htable[0]
112
113         @ calculate H^2
114         vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
115         vpmull.p64      $Xl,$H,$H
116         veor            $t0,$t0,$H
117         vpmull2.p64     $Xh,$H,$H
118         vpmull.p64      $Xm,$t0,$t0
119
120         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
121         veor            $t2,$Xl,$Xh
122         veor            $Xm,$Xm,$t1
123         veor            $Xm,$Xm,$t2
124         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
125
126         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
127         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
128         veor            $Xl,$Xm,$t2
129
130         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
131         vpmull.p64      $Xl,$Xl,$xC2
132         veor            $t2,$t2,$Xh
133         veor            $H2,$Xl,$t2
134
135         vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
136         veor            $t1,$t1,$H2
137         vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
138         vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
139 ___
140 if ($flavour =~ /64/) {         # H^3 and H^4 are produced for 64-bit flavours only
141 my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));       # scratch registers for the second multiplication stream
142 # Two interleaved streams: X* multiplies H by H^2 while Y* squares H^2 (extra-indented instructions).
143 $code.=<<___;
144         @ calculate H^3 and H^4
145         vpmull.p64      $Xl,$H, $H2
146          vpmull.p64     $Yl,$H2,$H2
147         vpmull2.p64     $Xh,$H, $H2
148          vpmull2.p64    $Yh,$H2,$H2
149         vpmull.p64      $Xm,$t0,$t1
150          vpmull.p64     $Ym,$t1,$t1
151
152         vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
153          vext.8         $t1,$Yl,$Yh,#8
154         veor            $t2,$Xl,$Xh
155         veor            $Xm,$Xm,$t0
156          veor           $t3,$Yl,$Yh
157          veor           $Ym,$Ym,$t1
158         veor            $Xm,$Xm,$t2
159         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
160          veor           $Ym,$Ym,$t3
161          vpmull.p64     $t3,$Yl,$xC2
162
163         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
164          vmov           $Yh#lo,$Ym#hi
165         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
166          vmov           $Ym#hi,$Yl#lo
167         veor            $Xl,$Xm,$t2
168          veor           $Yl,$Ym,$t3
169
170         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
171          vext.8         $t3,$Yl,$Yl,#8
172         vpmull.p64      $Xl,$Xl,$xC2
173          vpmull.p64     $Yl,$Yl,$xC2
174         veor            $t2,$t2,$Xh
175          veor           $t3,$t3,$Yh
176         veor            $H, $Xl,$t2             @ H^3
177          veor           $H2,$Yl,$t3             @ H^4
178
179         vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
180          vext.8         $t1,$H2,$H2,#8
181         veor            $t0,$t0,$H
182          veor           $t1,$t1,$H2
183         vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
184         vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
185 ___
186 }
187 $code.=<<___;
188         ret
189 .size   gcm_init_v8,.-gcm_init_v8
190 ___
191 ################################################################################
192 # void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
193 # Appends gcm_gmult_v8 to $code: one Karatsuba multiplication of Xi by twisted H, then the two-phase reduction.
194 # input:        Xi - current hash value;
195 #               Htable - table precomputed in gcm_init_v8;
196 # output:       Xi - next hash value Xi;
197 # Xi is byte-reversed after the load and before the store unless __ARMEB__ is defined.
198 $code.=<<___;
199 .global gcm_gmult_v8
200 .type   gcm_gmult_v8,%function
201 .align  4
202 gcm_gmult_v8:
203         vld1.64         {$t1},[$Xi]             @ load Xi
204         vmov.i8         $xC2,#0xe1
205         vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
206         vshl.u64        $xC2,$xC2,#57
207 #ifndef __ARMEB__
208         vrev64.8        $t1,$t1
209 #endif
210         vext.8          $IN,$t1,$t1,#8
211
212         vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
213         veor            $t1,$t1,$IN             @ Karatsuba pre-processing
214         vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
215         vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
216
217         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
218         veor            $t2,$Xl,$Xh
219         veor            $Xm,$Xm,$t1
220         veor            $Xm,$Xm,$t2
221         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
222
223         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
224         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
225         veor            $Xl,$Xm,$t2
226
227         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
228         vpmull.p64      $Xl,$Xl,$xC2
229         veor            $t2,$t2,$Xh
230         veor            $Xl,$Xl,$t2
231
232 #ifndef __ARMEB__
233         vrev64.8        $Xl,$Xl
234 #endif
235         vext.8          $Xl,$Xl,$Xl,#8
236         vst1.64         {$Xl},[$Xi]             @ write out Xi
237
238         ret
239 .size   gcm_gmult_v8,.-gcm_gmult_v8
240 ___
241 ################################################################################
242 # void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
243 # Appends gcm_ghash_v8 to $code: a modulo-scheduled loop taking two blocks per iteration, plus a one-block tail.
244 # input:        table precomputed in gcm_init_v8;
245 #               current hash value Xi;
246 #               pointer to input data;
247 #               length of input data in bytes, but divisible by block size;
248 # output:       next hash value Xi;
249 # 32-bit flavours save/restore d8-d15 around the body; 64-bit flavours first test whether to branch to the 4x subroutine.
250 $code.=<<___;
251 .global gcm_ghash_v8
252 .type   gcm_ghash_v8,%function
253 .align  4
254 gcm_ghash_v8:
255 ___
256 $code.=<<___    if ($flavour =~ /64/);  # branch to the 4x code when $len is a multiple of 64; NOTE(review): a zero $len also satisfies this test - confirm callers never pass it
257         bic             $inc,$len,#63
258         cmp             $len,$inc
259         b.eq            .Lgcm_ghash_v8_4x
260 ___
261 $code.=<<___            if ($flavour !~ /64/);
262         vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
263 ___
264 $code.=<<___;
265         vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
266                                                 @ "[rotated]" means that
267                                                 @ loaded value would have
268                                                 @ to be rotated in order to
269                                                 @ make it appear as in
270                                                 @ algorithm specification
271         subs            $len,$len,#32           @ see if $len is 32 or larger
272         mov             $inc,#16                @ $inc is used as post-
273                                                 @ increment for input pointer;
274                                                 @ as loop is modulo-scheduled
275                                                 @ $inc is zeroed just in time
276                                                 @ to preclude overstepping
277                                                 @ inp[len], which means that
278                                                 @ last block[s] are actually
279                                                 @ loaded twice, but last
280                                                 @ copy is not processed
281         vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
282         vmov.i8         $xC2,#0xe1
283         vld1.64         {$H2},[$Htbl]
284         cclr            $inc,eq                 @ is it time to zero $inc?
285         vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
286         vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
287         vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
288 #ifndef __ARMEB__
289         vrev64.8        $t0,$t0
290         vrev64.8        $Xl,$Xl
291 #endif
292         vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
293         b.lo            .Lodd_tail_v8           @ $len was less than 32
294 ___
295 { my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));  # products and input block of the interleaved H·I[i+1] stream
296         #######
297         # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
298         #       [(H*Ii+1) + (H*Xi+1)] mod P =
299         #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
300         # so each iteration adds H·I[i+1] to H^2·(I[i]+Xi) and reduces the sum once.
301 $code.=<<___;
302         vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
303 #ifndef __ARMEB__
304         vrev64.8        $t1,$t1
305 #endif
306         vext.8          $In,$t1,$t1,#8
307         veor            $IN,$IN,$Xl             @ I[i]^=Xi
308         vpmull.p64      $Xln,$H,$In             @ H·Ii+1
309         veor            $t1,$t1,$In             @ Karatsuba pre-processing
310         vpmull2.p64     $Xhn,$H,$In
311         b               .Loop_mod2x_v8
312
313 .align  4
314 .Loop_mod2x_v8:
315         vext.8          $t2,$IN,$IN,#8
316         subs            $len,$len,#32           @ is there more data?
317         vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
318         cclr            $inc,lo                 @ is it time to zero $inc?
319
320          vpmull.p64     $Xmn,$Hhl,$t1
321         veor            $t2,$t2,$IN             @ Karatsuba pre-processing
322         vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
323         veor            $Xl,$Xl,$Xln            @ accumulate
324         vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
325          vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]
326
327         veor            $Xh,$Xh,$Xhn
328          cclr           $inc,eq                 @ is it time to zero $inc?
329         veor            $Xm,$Xm,$Xmn
330
331         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
332         veor            $t2,$Xl,$Xh
333         veor            $Xm,$Xm,$t1
334          vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
335 #ifndef __ARMEB__
336          vrev64.8       $t0,$t0
337 #endif
338         veor            $Xm,$Xm,$t2
339         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
340
341 #ifndef __ARMEB__
342          vrev64.8       $t1,$t1
343 #endif
344         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
345         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
346          vext.8         $In,$t1,$t1,#8
347          vext.8         $IN,$t0,$t0,#8
348         veor            $Xl,$Xm,$t2
349          vpmull.p64     $Xln,$H,$In             @ H·Ii+1
350         veor            $IN,$IN,$Xh             @ accumulate $IN early
351
352         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
353         vpmull.p64      $Xl,$Xl,$xC2
354         veor            $IN,$IN,$t2
355          veor           $t1,$t1,$In             @ Karatsuba pre-processing
356         veor            $IN,$IN,$Xl
357          vpmull2.p64    $Xhn,$H,$In
358         b.hs            .Loop_mod2x_v8          @ there was at least 32 more bytes
359
360         veor            $Xh,$Xh,$t2
361         vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
362         adds            $len,$len,#32           @ re-construct $len
363         veor            $Xl,$Xl,$Xh             @ re-construct $Xl
364         b.eq            .Ldone_v8               @ is $len zero?
365 ___
366 }
367 $code.=<<___;
368 .Lodd_tail_v8:
369         vext.8          $t2,$Xl,$Xl,#8
370         veor            $IN,$IN,$Xl             @ inp^=Xi
371         veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi
372
373         vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
374         veor            $t1,$t1,$IN             @ Karatsuba pre-processing
375         vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
376         vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
377
378         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
379         veor            $t2,$Xl,$Xh
380         veor            $Xm,$Xm,$t1
381         veor            $Xm,$Xm,$t2
382         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
383
384         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
385         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
386         veor            $Xl,$Xm,$t2
387
388         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
389         vpmull.p64      $Xl,$Xl,$xC2
390         veor            $t2,$t2,$Xh
391         veor            $Xl,$Xl,$t2
392
393 .Ldone_v8:
394 #ifndef __ARMEB__
395         vrev64.8        $Xl,$Xl
396 #endif
397         vext.8          $Xl,$Xl,$Xl,#8
398         vst1.64         {$Xl},[$Xi]             @ write out Xi
399
400 ___
401 $code.=<<___            if ($flavour !~ /64/);
402         vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
403 ___
404 $code.=<<___;
405         ret
406 .size   gcm_ghash_v8,.-gcm_ghash_v8
407 ___
408
409 if ($flavour =~ /64/) {                         # 4x subroutine
410 my ($I0,$j1,$j2,$j3,
411     $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
412 # $I0,$j1..$j3: the four loaded blocks; $I1..$I3: their 8-byte rotations; $H3/$H34/$H4: H^3, packed Karatsuba halves, H^4; $Yl/$Ym/$Yh: sum of the H, H^2 and H^3 products. The loop exits only when $len reaches exactly zero.
413 $code.=<<___;
414 .type   gcm_ghash_v8_4x,%function
415 .align  4
416 gcm_ghash_v8_4x:
417 .Lgcm_ghash_v8_4x:
418         vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
419         vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
420         vmov.i8         $xC2,#0xe1
421         vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
422         vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
423 #ifndef __ARMEB__
424         vrev64.8        $Xl,$Xl
425 #endif
426         b               .Loop4x
427
428 .align  4
429 .Loop4x:
430         vld1.64         {$I0-$j3},[$inp],#64
431 #ifndef __ARMEB__
432         vrev64.8        $j1,$j1
433         vrev64.8        $j2,$j2
434         vrev64.8        $j3,$j3
435         vrev64.8        $I0,$I0
436 #endif
437         vext.8          $I3,$j3,$j3,#8
438         vext.8          $I2,$j2,$j2,#8
439         vext.8          $I1,$j1,$j1,#8
440
441         vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
442         veor            $j3,$j3,$I3
443         vpmull2.p64     $Yh,$H,$I3
444         vpmull.p64      $Ym,$Hhl,$j3
445
446         vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
447         veor            $j2,$j2,$I2
448         vpmull2.p64     $I2,$H2,$I2
449         vpmull2.p64     $j2,$Hhl,$j2
450
451         veor            $Yl,$Yl,$t0
452         veor            $Yh,$Yh,$I2
453         veor            $Ym,$Ym,$j2
454
455         vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
456         veor            $j1,$j1,$I1
457         vpmull2.p64     $I1,$H3,$I1
458         vpmull.p64      $j1,$H34,$j1
459
460         veor            $Yl,$Yl,$j3
461         veor            $Yh,$Yh,$I1
462         veor            $Ym,$Ym,$j1
463
464         veor            $t0,$I0,$Xl
465         vext.8          $IN,$t0,$t0,#8
466
467         vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
468         veor            $t0,$t0,$IN
469         vpmull2.p64     $Xh,$H4,$IN
470         vpmull2.p64     $Xm,$H34,$t0
471
472         veor            $Xl,$Xl,$Yl
473         veor            $Xh,$Xh,$Yh
474         veor            $Xm,$Xm,$Ym
475
476         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
477         veor            $t2,$Xl,$Xh
478         veor            $Xm,$Xm,$t1
479         veor            $Xm,$Xm,$t2
480
481         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
482         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
483         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
484         veor            $Xl,$Xm,$t2
485
486         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
487         vpmull.p64      $Xl,$Xl,$xC2
488         veor            $t2,$t2,$Xh
489         veor            $Xl,$Xl,$t2
490         vext.8          $Xl,$Xl,$Xl,#8
491
492         subs            $len,$len,#64
493         b.ne            .Loop4x
494
495 #ifndef __ARMEB__
496         vrev64.8        $Xl,$Xl
497 #endif
498         vst1.64         {$Xl},[$Xi]             @ write out Xi
499
500         ret
501 .size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
502 ___
503
504 }       # end of the 64-bit-only 4x subroutine
505 }       # end of the scope holding the vector register names
506
507 $code.=<<___;   # identification string placed after the routines, for every flavour
508 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
509 .align  2
510 ___
511
512 if ($flavour =~ /64/) {                 ######## 64-bit code
513     sub unvmov {        # rewrite the operands of "vmov qA#lo|hi,qB#lo|hi" as an "ins" between 64-bit lanes
514         my $arg=shift;
515         # register numbers 8 and up are shifted by 8 (q8 becomes v16); lo/hi select lane 0/1
516         $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
517         sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
518                                              $3<8?$3:$3+8,($4 eq "lo")?0:1;
519     }
520     foreach(split("\n",$code)) {        # translate $code line by line and print the result
521         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
522         s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
523         s/vmov\s+(.*)/unvmov($1)/geo    or
524         s/vext\.8/ext/o                 or
525         s/vshr\.s/sshr\.s/o             or
526         s/vshr/ushr/o                   or
527         s/^(\s+)v/$1/o                  or      # strip off v prefix
528         s/\bbx\s+lr\b/ret/o;
529         # the chain above stops at the first substitution that succeeds; the fix-ups below run on every line
530         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
531         s/@\s/\/\//o;                           # old->new style commentary
532
533         # fix up remaining legacy suffixes
534         s/\.[ui]?8(\s)/$1/o;
535         s/\.[uis]?32//o and s/\.16b/\.4s/go;
536         m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
537         m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
538         s/\.[uisp]?64//o and s/\.16b/\.2d/go;
539         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
540
541         print $_,"\n";
542     }
543 } else {                                ######## 32-bit code
544     sub unvdup32 {      # rewrite "vdup.32 qA,qB[i]" so the source lane is named through a d register
545         my $arg=shift;
546         # lane i of qB becomes lane (i&1) of d register 2*B+(i>>1)
547         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
548         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
549     }
550     sub unvpmullp64 {   # hand-encode pmull/pmull2 as .byte data, keeping mnemonic and operands as a comment
551         my ($mnemonic,$arg)=@_;
552         # the three q register numbers are packed into base word 0xf2a00e00; a "2" in the mnemonic ORs in 0x00010001
553         if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
554             my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
555                                  |(($2&7)<<17)|(($2&8)<<4)
556                                  |(($3&7)<<1) |(($3&8)<<2);
557             $word |= 0x00010001  if ($mnemonic =~ "2");
558             # since ARMv7 instructions are always encoded little-endian.
559             # correct solution is to use .inst directive, but older
560             # assemblers don't implement it:-(
561             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
562                         $word&0xff,($word>>8)&0xff,
563                         ($word>>16)&0xff,($word>>24)&0xff,
564                         $mnemonic,$arg;
565         }
566     }
567     # translate $code line by line to the 32-bit syntax and print the result
568     foreach(split("\n",$code)) {
569         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
570         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
571         s/\/\/\s?/@ /o;                         # new->old style commentary
572
573         # fix up remaining new-style suffixes
574         s/\],#[0-9]+/]!/o;
575         # at most one of the following rewrites is applied per line (or-chain stops at the first success)
576         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
577         s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
578         s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
579         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
580         s/^(\s+)b\./$1b/o                                               or
581         s/^(\s+)ret/$1bx\tlr/o;
582
583         print $_,"\n";
584     }
585 }
586
587 close STDOUT or die "error closing STDOUT: $!"; # enforce flush; STDOUT is the pipe to arm-xlate.pl, so a failed flush or failing translator is fatal