modes/asm/ghashv8-armx.pl: up to 90% performance improvement.
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# Current performance in cycles per processed byte:
#
#               PMULL[2]        32-bit NEON(*)
# Apple A7      0.92            5.62
# Cortex-A53    1.01            8.39
# Cortex-A57    1.17            7.61
#
# (*)   presented for reference/comparison purposes;

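# Reference-only sketch (added for illustration, not used by the code
# generator below; the helper name is hypothetical): carry-less
# multiplication of two 128-bit polynomials over GF(2) followed by
# reduction modulo x^128+x^7+x^2+x+1, i.e. the field arithmetic that the
# PMULL/PMULL2-based code computes. Byte order and the bit-reflected
# representation that the "twisted H" trick below takes care of are
# deliberately ignored here.
use Math::BigInt;

sub gf128_mul_ref {
	my ($a,$b) = map { ref($_) ? $_->copy() : Math::BigInt->new($_) } @_;
	my $r = Math::BigInt->new(0);
	for my $i (0..127) {			# schoolbook carry-less multiply
		$r->bxor($a->copy()->blsft($i))
			if $b->copy()->brsft($i)->band(1)->is_one();
	}
	for my $i (reverse 128..255) {		# fold bits 255..128 back down
		next unless $r->copy()->brsft($i)->band(1)->is_one();
		$r->bxor(Math::BigInt->new(1)->blsft($i));	  # clear bit i
		$r->bxor(Math::BigInt->new(0x87)->blsft($i-128)); # x^128 = x^7+x^2+x+1
	}
	return $r;
}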
$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

.text
___
$code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
$code.=".fpu    neon\n.code     32\n"   if ($flavour !~ /64/);

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:        128-bit H - the secret parameter E(K,0^128)
# output:       precomputed table filled with degrees of twisted H;
#               H is twisted to handle the reverse bitness of GHASH;
#               only a few of the 16 slots of Htable[16] are used;
#               the data is opaque to the outside world (which allows
#               the code to be optimized independently);
#
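# Concretely (as written by the stores below): Htable[0] holds the twisted
# H, Htable[2] holds the twisted H^2, and Htable[1] holds the Karatsuba
# "pre-processed" values, i.e. the XORs of the low and high 64-bit halves
# of H and of H^2, packed into one 128-bit slot. The remaining slots are
# not touched by this version.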
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]

        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2

        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]

        ret
.size   gcm_init_v8,.-gcm_init_v8
___
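# Illustration of the "Karatsuba" annotations above (reference-only,
# not used by the generator; helper names are hypothetical): a 128x128-bit
# carry-less multiply is assembled from three 64x64-bit multiplies, which
# is what the PMULL/PMULL2/PMULL triples in gcm_init_v8, gcm_gmult_v8 and
# gcm_ghash_v8 compute.
sub clmul64_ref {			# 64x64 -> 128-bit carry-less multiply
	my ($a,$b) = map { Math::BigInt->new($_) } @_;
	my $r = Math::BigInt->new(0);
	for my $i (0..63) {
		$r->bxor($a->copy()->blsft($i))
			if $b->copy()->brsft($i)->band(1)->is_one();
	}
	return $r;
}

sub karatsuba128_ref {			# assumes a 64-bit perl for the ^ below
	my ($ah,$al,$bh,$bl) = @_;	# 64-bit halves of the two operands
	my $hi  = clmul64_ref($ah,$bh);			# PMULL2: high halves
	my $lo  = clmul64_ref($al,$bl);			# PMULL:  low halves
	my $mid = clmul64_ref($ah^$al,$bh^$bl);		# PMULL of XORed halves
	$mid->bxor($hi)->bxor($lo);			# Karatsuba post-processing
	return $hi->blsft(64)->bxor($mid)->blsft(64)->bxor($lo);
}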
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:        Xi - current hash value;
#               Htable - table precomputed in gcm_init_v8;
# output:       Xi - next hash value Xi;
#
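# In terms of the reference sketch above, and ignoring byte reversal and
# the twisted representation, this computes Xi = gf128_mul_ref(Xi, H),
# i.e. a single multiplication of the current hash value by H.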
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:        table precomputed in gcm_init_v8;
#               current hash value Xi;
#               pointer to input data;
#               length of input data in bytes, which must be divisible
#               by the block size;
# output:       next hash value Xi;
#
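# Reference-only sketch (hypothetical helper, not used by the generator):
# the serial GHASH recurrence that gcm_ghash_v8 implements, Xi = (Xi xor
# Ii)*H mod P for each 16-byte block Ii, with byte reversal and the
# twisted representation omitted; see gf128_mul_ref above.
sub ghash_ref {
	my ($Xi,$H,@blocks) = @_;	# one Math::BigInt value per 16-byte block
	$Xi = Math::BigInt->new($Xi);
	$Xi = gf128_mul_ref($Xi->bxor($_),$H) for (@blocks);
	return $Xi;
}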
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ loaded value would have
                                                @ to be rotated in order to
                                                @ make it appear as in
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as loop is modulo-scheduled
                                                @ $inc is zeroed just in time
                                                @ to preclude overstepping
                                                @ inp[len], which means that
                                                @ last block[s] are actually
                                                @ loaded twice, but last
                                                @ copy is not processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
        #######
        # Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
        #        [(H*Ii+1) + (H*Xi+1)] mod P =
        #        [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
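# Reference-only restatement of the identity above (hypothetical helper,
# not used by the generator): two blocks are folded per iteration using H
# and H^2. The assembly accumulates the two 256-bit products and performs
# a single reduction; reducing each product separately, as below, gives
# the same result because the reduction is linear over XOR.
sub ghash_2x_ref {
	my ($Xi,$H,$H2,@blocks) = @_;	# Math::BigInt, even number of blocks
	while (@blocks >= 2) {
		my ($I0,$I1) = splice(@blocks,0,2);
		# Xi+2 = [H*Ii+1 + H^2*(Ii+Xi)] mod P
		$Xi = gf128_mul_ref($H2,$Xi->copy()->bxor($I0))
			->bxor(gf128_mul_ref($H,$I1));
	}
	return $Xi;
}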
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?

         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]

        veor            $Xh,$Xh,$Xhn
         cclr           $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
         vrev64.8       $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

#ifndef __ARMEB__
         vrev64.8       $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
         vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
         veor           $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
         vpmull2.p64    $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

if ($flavour =~ /64/) {                 ######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
    }
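    # The substitutions below rewrite the unified 32-bit NEON syntax used in
    # $code into AArch64 syntax, e.g. (illustrative, derived from the rules
    # below):
    #   "vmov.i8    q11,#0xe1"   ->  "movi v19.16b,#0xe1"
    #   "vpmull.p64 q0,q12,q3"   ->  "pmull v0.1q,v20.1d,v3.1d"
    # q0-q7 map to v0-v7 and q8-q14 map to v16-v22, which keeps the 64-bit
    # code out of the callee-saved v8-v15 range (the 32-bit code saves
    # d8-d15 explicitly instead).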
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001  if ($mnemonic =~ "2");
            # ARMv7 instructions are always encoded little-endian, hence the
            # explicit byte order below; the correct solution would be the
            # .inst directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
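    # For instance, "vpmull.p64 q0,q12,q3" works out via the encoding above
    # to the word 0xf2a80e86 and is emitted as
    #   .byte 0x86,0x0e,0xa8,0xf2    @ pmull q0,q12,q3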

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT; # enforce flush