#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the larger AArch64 register bank to "accommodate" 4x aggregated
# reduction...
#
# Current performance in cycles per processed byte:
#
#               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7                      0.92            5.62
# Cortex-A53                    1.01            8.39
# Cortex-A57                    1.17            7.61
# Denver                        0.71            6.02
# Mongoose                      1.10            8.06
# Kryo                          1.16            8.00
#
# (*)   presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

.text
___
$code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
$code.=<<___                            if ($flavour !~ /64/);
.fpu    neon
.code   32
#undef  __thumb2__
___

################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:        128-bit H - secret parameter E(K,0^128)
# output:       precomputed table filled with powers of twisted H;
#               H is twisted to handle reverse bitness of GHASH;
#               only a few of the 16 slots of Htable[16] are used;
#               data is opaque to the outside world (which allows the
#               code to be optimized independently);
#
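# A note on the layout produced below (a summary for reference, inferred
# from the vst1.64 stores): Htable[0] holds twisted H, Htable[1] the packed
# Karatsuba pre-processed value, Htable[2] holds H^2; the 64-bit flavour
# additionally fills Htable[3..5] with H^3, a second packed Karatsuba value
# and H^4 for the 4x code path.
#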
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]

        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2

        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
___
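# H^3 and H^4 are only needed by the 4x subroutine, which is generated for
# the 64-bit flavour only, so the extra Htable entries are skipped on 32-bit.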
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
        @ calculate H^3 and H^4
        vpmull.p64      $Xl,$H, $H2
         vpmull.p64     $Yl,$H2,$H2
        vpmull2.p64     $Xh,$H, $H2
         vpmull2.p64    $Yh,$H2,$H2
        vpmull.p64      $Xm,$t0,$t1
         vpmull.p64     $Ym,$t1,$t1

        vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
         vext.8         $t1,$Yl,$Yh,#8
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t0
         veor           $t3,$Yl,$Yh
         veor           $Ym,$Ym,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
         veor           $Ym,$Ym,$t3
         vpmull.p64     $t3,$Yl,$xC2

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
         vmov           $Yh#lo,$Ym#hi
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vmov           $Ym#hi,$Yl#lo
        veor            $Xl,$Xm,$t2
         veor           $Yl,$Ym,$t3

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
         vext.8         $t3,$Yl,$Yl,#8
        vpmull.p64      $Xl,$Xl,$xC2
         vpmull.p64     $Yl,$Yl,$xC2
        veor            $t2,$t2,$Xh
         veor           $t3,$t3,$Yh
        veor            $H, $Xl,$t2             @ H^3
         veor           $H2,$Yl,$t3             @ H^4

        vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
         vext.8         $t1,$H2,$H2,#8
        veor            $t0,$t0,$H
         veor           $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
___
}
$code.=<<___;
        ret
.size   gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:        Xi - current hash value;
#               Htable - table precomputed in gcm_init_v8;
# output:       Xi - next hash value;
#
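# A background note for reference: in GHASH terms this computes
# Xi = Xi·H mod P, with P the GCM polynomial x^128+x^7+x^2+x+1, using one
# Karatsuba-style 128x128-bit multiplication (three pmulls) followed by a
# two-phase reduction driven by the bit-reflected 0xc2.. constant composed
# below.
#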
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:        table precomputed in gcm_init_v8;
#               current hash value Xi;
#               pointer to input data;
#               length of input data in bytes, which must be divisible
#               by the block size;
# output:       next hash value Xi;
#
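# Note for reference: on the 64-bit flavour, lengths that are a whole
# multiple of 64 bytes are diverted to the 4x subroutine gcm_ghash_v8_4x
# below; all other lengths are handled by the 2x/1x code that follows.
#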
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___    if ($flavour =~ /64/);
        bic             $inc,$len,#63
        cmp             $len,$inc
        b.eq            .Lgcm_ghash_v8_4x
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ the loaded value has to
                                                @ be rotated in order to
                                                @ make it appear as in the
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as the post-
                                                @ increment for the input
                                                @ pointer; as the loop is
                                                @ modulo-scheduled, $inc is
                                                @ zeroed just in time to
                                                @ preclude overstepping
                                                @ inp[len], which means that
                                                @ the last block[s] are
                                                @ actually loaded twice, but
                                                @ the last copy is not
                                                @ processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
        #######
        # Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
        #       [(H*Ii+1) + (H*Xi+1)] mod P =
        #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
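        # The 4x subroutine further below applies the same aggregation idea
        # one step further:
        # Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
        #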
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?

         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]

        veor            $Xh,$Xh,$Xhn
         cclr           $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
         vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
         vrev64.8       $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

#ifndef __ARMEB__
         vrev64.8       $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
         vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
         veor           $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
         vpmull2.p64    $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi

        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction

        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi

___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
___

if ($flavour =~ /64/) {                         # 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

$code.=<<___;
.type   gcm_ghash_v8_4x,%function
.align  4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
        vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif

        vld1.64         {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $j3,$j3
        vrev64.8        $I0,$I0
#endif
        vext.8          $I3,$j3,$j3,#8
        vext.8          $I2,$j2,$j2,#8
        vext.8          $I1,$j1,$j1,#8

        vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
        veor            $j3,$j3,$I3
        vpmull2.p64     $Yh,$H,$I3
        vpmull.p64      $Ym,$Hhl,$j3

        vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
        veor            $j2,$j2,$I2
        vpmull2.p64     $I2,$H2,$I2
        vpmull2.p64     $j2,$Hhl,$j2

        veor            $Yl,$Yl,$t0
        veor            $Yh,$Yh,$I2
        veor            $Ym,$Ym,$j2

        vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
        veor            $j1,$j1,$I1
        vpmull2.p64     $I1,$H3,$I1
        vpmull.p64      $j1,$H34,$j1

        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        veor            $Ym,$Ym,$j1

        subs            $len,$len,#64
        b.eq            .Ltail4x

        b               .Loop4x

.align  4
.Loop4x:
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

         vld1.64        {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
         vrev64.8       $j1,$j1
         vrev64.8       $j2,$j2
         vrev64.8       $j3,$j3
         vrev64.8       $I0,$I0
#endif
         vext.8         $I3,$j3,$j3,#8
         vext.8         $I2,$j2,$j2,#8
         vext.8         $I1,$j1,$j1,#8

         vpmull.p64     $Yl,$H,$I3              @ H·Ii+3
         veor           $j3,$j3,$I3
         vpmull2.p64    $Yh,$H,$I3
         vpmull.p64     $Ym,$Hhl,$j3

         vpmull.p64     $t0,$H2,$I2             @ H^2·Ii+2
         veor           $j2,$j2,$I2
         vpmull2.p64    $I2,$H2,$I2
         vpmull2.p64    $j2,$Hhl,$j2

         veor           $Yl,$Yl,$t0
         veor           $Yh,$Yh,$I2
         veor           $Ym,$Ym,$j2

         vpmull.p64     $j3,$H3,$I1             @ H^3·Ii+1
         veor           $j1,$j1,$I1
         vpmull2.p64    $I1,$H3,$I1
         vpmull.p64     $j1,$H34,$j1

         veor           $Yl,$Yl,$j3
         veor           $Yh,$Yh,$I1
         veor           $Ym,$Ym,$j1

        subs            $len,$len,#64
        b.ne            .Loop4x

.Ltail4x:
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8

        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vpmull2.p64     $Xm,$H34,$t0

        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym

        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2

        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2

        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8

#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vst1.64         {$Xl},[$Xi]             @ write out Xi

        ret
.size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___

}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

if ($flavour =~ /64/) {                 ######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
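    # e.g. "vmov q0#lo,q8#hi" becomes "ins v0.d[0],v16.d[1]"; q8 and above
    # are remapped to v16 and above (presumably to keep clear of the
    # callee-saved v8-v15).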
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
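        # e.g. "vpmull.p64 q0,q12,q3" comes out of the substitutions above
        # as "pmull v0.1q,v20.1d,v3.1d" (a worked example for reference).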

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001  if ($mnemonic =~ "2");
            # emit raw bytes, since ARMv7 instructions are always encoded
            # little-endian; the correct solution is to use the .inst
            # directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
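    # e.g. unvpmullp64("pmull","q0,q1,q2") computes $word=0xf2a20e04 and
    # emits ".byte 0x04,0x0e,0xa2,0xf2" (a worked example for reference).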

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;
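        # e.g. after the register rewrite above, "cclr r12,eq" comes out as
        # "moveq      r12,#0" (a worked example for reference).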

        print $_,"\n";
    }
}

close STDOUT; # enforce flush