#! /usr/bin/env perl
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7      0.58            0.92            5.62
# Cortex-A53    0.85            1.01            8.39
# Cortex-A57    0.73            1.17            7.61
# Denver        0.51            0.65            6.02
# Mongoose      0.65            1.10            8.06
# Kryo          0.76            1.16            8.00
# ThunderX2     1.05
#
# (*)   presented for reference/comparison purposes;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch   armv8-a+crypto\n.text\n"        if ($flavour =~ /64/);
$code.=<<___                                    if ($flavour !~ /64/);
.fpu    neon
#ifdef __thumb2__
.syntax        unified
.thumb
# define INST(a,b,c,d) $_byte  c,0xef,a,b
#else
.code  32
# define INST(a,b,c,d) $_byte  a,b,c,0xf2
#endif

.text
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:        128-bit H - the secret parameter E(K,0^128)
# output:       precomputed table filled with powers of the twisted H;
#               H is twisted to handle the bit-reversed order GHASH uses;
#               only a few of the 16 slots of Htable[16] are used;
#               the data is opaque to the outside world (which allows
#               the code to be optimized independently);
#
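# Background (implied by the code below): GHASH folds each block as
# Xi=(Xi^Ii)*H in GF(2^128) modulo x^128+x^7+x^2+x+1, with the bits of
# each element in reflected order.  The "twist" - H rotated left by one
# bit and conditionally XORed with the 0xc2....01 constant - absorbs
# that reflection, so the PMULL-based Karatsuba multiplications and the
# two-phase 0xc2 reduction used throughout can work on data as loaded.
#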
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]

        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2

        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
        @ calculate H^3 and H^4
        vpmull.p64      $Xl,$H, $H2
         vpmull.p64     $Yl,$H2,$H2
        vpmull2.p64     $Xh,$H, $H2
         vpmull2.p64    $Yh,$H2,$H2
        vpmull.p64      $Xm,$t0,$t1
         vpmull.p64     $Ym,$t1,$t1

        vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
         vext.8         $t1,$Yl,$Yh,#8
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t0
         veor           $t3,$Yl,$Yh
         veor           $Ym,$Ym,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
         veor           $Ym,$Ym,$t3
         vpmull.p64     $t3,$Yl,$xC2

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
         vmov           $Yh#lo,$Ym#hi
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vmov           $Ym#hi,$Yl#lo
        veor            $Xl,$Xm,$t2
         veor           $Yl,$Ym,$t3

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
         vext.8         $t3,$Yl,$Yl,#8
        vpmull.p64      $Xl,$Xl,$xC2
         vpmull.p64     $Yl,$Yl,$xC2
        veor            $t2,$t2,$Xh
         veor           $t3,$t3,$Yh
        veor            $H, $Xl,$t2             @ H^3
         veor           $H2,$Yl,$t3             @ H^4

        vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
         vext.8         $t1,$H2,$H2,#8
        veor            $t0,$t0,$H
         veor           $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
___
}
$code.=<<___;
        ret
.size   gcm_init_v8,.-gcm_init_v8
___
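# Htable layout as stored above: [0] twisted H, [1] Karatsuba
# pre-processed halves of H and H^2 packed together, [2] H^2;
# on 64-bit additionally [3] H^3, [4] the analogous packed value
# for H^3 and H^4, [5] H^4.
#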
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:        Xi - current hash value;
#               Htable - table precomputed in gcm_init_v8;
# output:       Xi - next hash value;
#
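# A single GF(2^128) multiplication by the twisted H: three PMULLs in
# Karatsuba fashion (lo·lo, hi·hi, (lo+hi)·(lo+hi)) followed by the
# two-phase reduction with the 0xc2 constant, folding the 256-bit
# product back to 128 bits.
#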
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:        table precomputed in gcm_init_v8;
#               current hash value Xi;
#               pointer to input data;
#               length of input data in bytes, which must be divisible
#               by the block size;
# output:       next hash value Xi;
#
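# On 64-bit, inputs of 64 bytes or more are diverted to the 4x-aggregated
# gcm_ghash_v8_4x below; shorter input goes through the modulo-scheduled
# 2x loop, with a final odd 16-byte block handled in .Lodd_tail_v8.
# (Callers, e.g. crypto/modes/gcm128.c, feed whole 16-byte blocks here
# and use gcm_gmult_v8 for standalone single-block multiplications.)
#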
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___    if ($flavour =~ /64/);
        cmp             $len,#64
        b.hs            .Lgcm_ghash_v8_4x
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ loaded value would have
                                                @ to be rotated in order to
                                                @ make it appear as in
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as loop is modulo-scheduled
                                                @ $inc is zeroed just in time
                                                @ to preclude overstepping
                                                @ inp[len], which means that
                                                @ last block[s] are actually
                                                @ loaded twice, but last
                                                @ copy is not processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
        #######
        # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
        #       [(H*Ii+1) + (H*Xi+1)] mod P =
        #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?

         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]

        veor            $Xh,$Xh,$Xhn
         cclr           $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
         vrev64.8       $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

#ifndef __ARMEB__
         vrev64.8       $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
         vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
         veor           $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
         vpmull2.p64    $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {                         # 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

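        #######
        # In the same vein as the 2x loop above, four blocks are folded
        # per iteration (see the operand comments below):
        #
        # Xi+4 =[H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P
        #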
$code.=<<___;
.type   gcm_ghash_v8_4x,%function
.align  4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
        vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant

        vld1.64         {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $j3,$j3
        vrev64.8        $I0,$I0
#endif
        vext.8          $I3,$j3,$j3,#8
        vext.8          $I2,$j2,$j2,#8
        vext.8          $I1,$j1,$j1,#8

        vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
        veor            $j3,$j3,$I3
        vpmull2.p64     $Yh,$H,$I3
        vpmull.p64      $Ym,$Hhl,$j3

        vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
        veor            $j2,$j2,$I2
        vpmull2.p64     $I2,$H2,$I2
        vpmull2.p64     $j2,$Hhl,$j2

        veor            $Yl,$Yl,$t0
        veor            $Yh,$Yh,$I2
        veor            $Ym,$Ym,$j2

        vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
        veor            $j1,$j1,$I1
        vpmull2.p64     $I1,$H3,$I1
        vpmull.p64      $j1,$H34,$j1

        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        veor            $Ym,$Ym,$j1

        subs            $len,$len,#128
        b.lo            .Ltail4x

        b               .Loop4x

.align  4
.Loop4x:
        veor            $t0,$I0,$Xl
         vld1.64        {$I0-$j3},[$inp],#64
        vext.8          $IN,$t0,$t0,#8
#ifndef __ARMEB__
         vrev64.8       $j1,$j1
         vrev64.8       $j2,$j2
         vrev64.8       $j3,$j3
         vrev64.8       $I0,$I0
#endif

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
         vext.8         $I3,$j3,$j3,#8
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
         vext.8         $I2,$j2,$j2,#8
        veor            $Xm,$Xm,$Ym
         vext.8         $I1,$j1,$j1,#8

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
         vpmull.p64     $Yl,$H,$I3              @ H·Ii+3
         veor           $j3,$j3,$I3
        veor            $Xm,$Xm,$t1
         vpmull2.p64    $Yh,$H,$I3
        veor            $Xm,$Xm,$t2
         vpmull.p64     $Ym,$Hhl,$j3

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vpmull.p64     $t0,$H2,$I2             @ H^2·Ii+2
         veor           $j2,$j2,$I2
         vpmull2.p64    $I2,$H2,$I2
        veor            $Xl,$Xm,$t2
         vpmull2.p64    $j2,$Hhl,$j2

         veor           $Yl,$Yl,$t0
         veor           $Yh,$Yh,$I2
         veor           $Ym,$Ym,$j2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
         vpmull.p64     $j3,$H3,$I1             @ H^3·Ii+1
         veor           $j1,$j1,$I1
        veor            $t2,$t2,$Xh
         vpmull2.p64    $I1,$H3,$I1
         vpmull.p64     $j1,$H34,$j1

        veor            $Xl,$Xl,$t2
         veor           $Yl,$Yl,$j3
         veor           $Yh,$Yh,$I1
        vext.8          $Xl,$Xl,$Xl,#8
         veor           $Ym,$Ym,$j1

        subs            $len,$len,#64
        b.hs            .Loop4x

.Ltail4x:
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym

        adds            $len,$len,#64
        b.eq            .Ldone4x

        cmp             $len,#32
        b.lo            .Lone
        b.eq            .Ltwo
.Lthree:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$I0-$j2},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
         vrev64.8       $j1,$j1
         vrev64.8       $j2,$j2
         vrev64.8       $I0,$I0
#endif

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $I2,$j2,$j2,#8
         vext.8         $I1,$j1,$j1,#8
        veor            $Xl,$Xm,$t2

         vpmull.p64     $Yl,$H,$I2              @ H·Ii+2
         veor           $j2,$j2,$I2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
         vpmull2.p64    $Yh,$H,$I2
         vpmull.p64     $Ym,$Hhl,$j2
        veor            $Xl,$Xl,$t2
         vpmull.p64     $j3,$H2,$I1             @ H^2·Ii+1
         veor           $j1,$j1,$I1
        vext.8          $Xl,$Xl,$Xl,#8

         vpmull2.p64    $I1,$H2,$I1
        veor            $t0,$I0,$Xl
         vpmull2.p64    $j1,$Hhl,$j1
        vext.8          $IN,$t0,$t0,#8

         veor           $Yl,$Yl,$j3
         veor           $Yh,$Yh,$I1
         veor           $Ym,$Ym,$j1

        vpmull.p64      $Xl,$H3,$IN             @ H^3·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H3,$IN
        vpmull.p64      $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym
        b               .Ldone4x

.align  4
.Ltwo:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$I0-$j1},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
         vrev64.8       $j1,$j1
         vrev64.8       $I0,$I0
#endif

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $I1,$j1,$j1,#8
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

         vpmull.p64     $Yl,$H,$I1              @ H·Ii+1
         veor           $j1,$j1,$I1

        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

         vpmull2.p64    $Yh,$H,$I1
         vpmull.p64     $Ym,$Hhl,$j1

        vpmull.p64      $Xl,$H2,$IN             @ H^2·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H2,$IN
        vpmull2.p64     $Xm,$Hhl,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym
        b               .Ldone4x

.align  4
.Lone:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$I0},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
         vrev64.8       $I0,$I0
#endif

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

        vpmull.p64      $Xl,$H,$IN
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H,$IN
        vpmull.p64      $Xm,$Hhl,$t0

.Ldone4x:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___

if ($flavour =~ /64/) {                 ######## 64-bit code
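    # Post-process $code for AArch64: rewrite the legacy 32-bit-style
    # mnemonics, register names and type suffixes used above, and turn
    # the vmov q#lo/q#hi pseudo-ops into "ins" element moves (unvmov).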
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
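    # unvpmullp64() hand-encodes vmull.p64/"vpmull2.p64" through the
    # INST() macro from the preamble, apparently because neither the
    # crypto-extension mnemonics nor .inst can be relied upon across
    # the older 32-bit assemblers this targets (see the note inside).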
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001  if ($mnemonic =~ "2");
            # ARMv7 instructions are always encoded little-endian;
            # the correct fix would be the .inst directive, but older
            # assemblers don't implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2     $1,#0/o                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;

        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print "     it      $2\n";
        }

        print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT"; # enforce flush