#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# July 2014
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# Current performance in cycles per processed byte:
#
#               PMULL[2]        32-bit NEON(*)
# Apple A7      0.92            5.62
# Cortex-A53    1.01            8.39
# Cortex-A57    1.17            7.61
# Denver        0.71            6.02
# Mongoose      1.10            8.06
# Kryo          1.16            8.00
#
# (*)   presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

.text
___
$code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
$code.=<<___                            if ($flavour !~ /64/);
.fpu    neon
.code   32
#undef  __thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:        128-bit H - secret parameter E(K,0^128)
# output:       precomputed table filled with degrees of twisted H;
#               H is twisted to handle the reverse bitness of GHASH;
#               only a few of the 16 slots of Htable[16] are used;
#               data is opaque to the outside world (which allows the
#               code to be optimized independently);
#
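# A hedged sketch of the table layout produced below, inferred from the
# stores in this function (it is not a public API contract):
#
#       Htable[0] = twisted H   - "H<<<=1" with the constant 0xc2....01
#                                 folded in when the top bit of H is set,
#                                 kept in the rotated (halves-swapped)
#                                 form used throughout this file;
#       Htable[1] = the xor-ed 64-bit halves of twisted H and of twisted
#                   H^2 packed into one slot (Karatsuba pre-processing);
#       Htable[2] = twisted H^2;
#
# gcm_gmult_v8 and gcm_ghash_v8 below rely on exactly this layout.
#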
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]

        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2

        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]

        ret
.size   gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:        Xi - current hash value;
#               Htable - table precomputed in gcm_init_v8;
# output:       Xi - next hash value;
#
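# In GHASH terms this routine performs a single multiplication in
# GF(2^128), restated here only for orientation:
#
#       Xi <- (Xi * H) mod P,   P = x^128 + x^7 + x^2 + x + 1
#
# i.e. one Karatsuba multiplication by twisted H followed by the
# two-phase reduction, exactly as coded below.
#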
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:        table precomputed in gcm_init_v8;
#               current hash value Xi;
#               pointer to input data;
#               length of input data in bytes, which must be divisible
#               by the block size;
# output:       next hash value Xi;
#
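# Equivalently, restated for orientation (no additional behaviour is
# implied): for each 16-byte input block I[i],
#
#       Xi <- (Xi ^ I[i]) * H mod P
#
# The main loop folds two blocks per iteration using the aggregated
# formula spelled out before .Loop_mod2x_v8 below, so only one reduction
# is paid for every pair of blocks.
#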
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ loaded value would have
                                                @ to be rotated in order to
                                                @ make it appear as in
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as loop is modulo-scheduled
                                                @ $inc is zeroed just in time
                                                @ to preclude overstepping
                                                @ inp[len], which means that
                                                @ last block[s] are actually
                                                @ loaded twice, but last
                                                @ copy is not processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
        #######
        # Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
        #       [(H*Ii+1) + (H*Xi+1)] mod P =
        #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
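        # The loop below is software-pipelined: while the current pair of
        # blocks is being multiplied and reduced, I[i+2] and I[i+3] are
        # already loaded and Karatsuba pre-processed for the next
        # iteration; $inc is cleared just in time (cclr) so those
        # look-ahead loads never step past inp[len], as noted at the top
        # of gcm_ghash_v8.
        #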
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?

         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]

        veor            $Xh,$Xh,$Xhn
         cclr           $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
         vrev64.8       $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

#ifndef __ARMEB__
         vrev64.8       $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
         vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
         veor           $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
         vpmull2.p64    $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

if ($flavour =~ /64/) {                 ######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
    }
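    # For instance, "vmov q2#lo,q1#hi" (as emitted above for $Xh/$Xm) is
    # rewritten by unvmov() into "ins v2.d[0],v1.d[1]".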
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001  if ($mnemonic =~ "2");
            # Since ARMv7 instructions are always encoded little-endian,
            # the opcode is emitted here byte by byte; the correct
            # solution would be the .inst directive, but older
            # assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
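    # For instance, "vpmull2.p64 q6,q8,q7" ($Xhn,$H,$In above) should come
    # out of unvpmullp64() as ".byte 0x8f,0xce,0xa1,0xf2 @ pmull2 q6,q8,q7"
    # -- the opcode hand-assembled for toolchains that predate the mnemonic
    # (byte values derived from the arithmetic above, worth re-checking).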

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT; # enforce flush