crypto/modes/asm/ghash-armv4.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # April 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+32 bytes shared table]. There is no
  15 # experimental performance data available yet. The only approximation
  16 # that can be made at this point is based on code size. Inner loop is
  17 # 32 instructions long and on single-issue core should execute in <40
  18 # cycles. Having verified that gcc 3.4 didn't unroll corresponding
  19 # loop, this assembler loop body was found to be ~3x smaller than
  20 # compiler-generated one...
  21 #
  22 # July 2010
  23 #
  24 # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
  25 # Cortex A8 core and ~25 cycles per processed byte (which was observed
  26 # to be ~3 times faster than gcc-generated code:-)
  27 #
  28 # February 2011
  29 #
  30 # Profiler-assisted and platform-specific optimization resulted in 7%
  31 # improvement on Cortex A8 core and ~23.5 cycles per byte.
  32 #
  33 # March 2011
  34 #
  35 # Add NEON implementation featuring polynomial multiplication, i.e. no
  36 # lookup tables involved. On Cortex A8 it was measured to process one
  37 # byte in 15 cycles or 55% faster than integer-only code.
  38 #
  39 # April 2014
  40 #
  41 # Switch to multiplication algorithm suggested in paper referred
  42 # below and combine it with reduction algorithm from x86 module.
  43 # Performance improvement over previous version varies from 65% on
  44 # Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
  45 # processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
  46 # Snapdragon S4 - in 9.33.
  47 #
  48 # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
  49 # Polynomial Multiplication on ARM Processors using the NEON Engine.
  50 #
  51 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
  52
  53 # ====================================================================
  54 # Note about "528B" variant. In ARM case it makes lesser sense to
  55 # implement it for following reasons:
  56 #
  57 # - performance improvement won't be anywhere near 50%, because 128-
  58 #   bit shift operation is neatly fused with 128-bit xor here, and
  59 #   "538B" variant would eliminate only 4-5 instructions out of 32
  60 #   in the inner loop (meaning that estimated improvement is ~15%);
  61 # - ARM-based systems are often embedded ones and extra memory
  62 #   consumption might be unappreciated (for so little improvement);
  63 #
  64 # Byte order [in]dependence. =========================================
  65 #
  66 # Caller is expected to maintain specific *dword* order in Htable,
  67 # namely with *least* significant dword of 128-bit value at *lower*
  68 # address. This differs completely from C code and has everything to
  69 # do with ldm instruction and order in which dwords are "consumed" by
  70 # algorithm. *Byte* order within these dwords in turn is whatever
  71 # *native* byte order on current platform. See gcm128.c for working
  72 # example...
  73
  74 $flavour = shift;
  75 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  76 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
  77
  78 if ($flavour && $flavour ne "void") {
  79     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  80     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  81     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  82     die "can't locate arm-xlate.pl";
  83
  84     open STDOUT,"| \"$^X\" $xlate $flavour $output";
  85 } else {
  86     open STDOUT,">$output";
  87 }
  88
  89 $Xi="r0";       # argument block
  90 $Htbl="r1";
  91 $inp="r2";
  92 $len="r3";
  93
  94 $Zll="r4";      # variables
  95 $Zlh="r5";
  96 $Zhl="r6";
  97 $Zhh="r7";
  98 $Tll="r8";
  99 $Tlh="r9";
 100 $Thl="r10";
 101 $Thh="r11";
 102 $nlo="r12";
 103 ################# r13 is stack pointer
 104 $nhi="r14";
 105 ################# r15 is program counter
 106
 107 $rem_4bit=$inp; # used in gcm_gmult_4bit
 108 $cnt=$len;
 109
 110 sub Zsmash() {
 111   my $i=12;
 112   my @args=@_;
 113   for ($Zll,$Zlh,$Zhl,$Zhh) {
 114     $code.=<<___;
 115 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
 116         rev     $_,$_
 117         str     $_,[$Xi,#$i]
 118 #elif defined(__ARMEB__)
 119         str     $_,[$Xi,#$i]
 120 #else
 121         mov     $Tlh,$_,lsr#8
 122         strb    $_,[$Xi,#$i+3]
 123         mov     $Thl,$_,lsr#16
 124         strb    $Tlh,[$Xi,#$i+2]
 125         mov     $Thh,$_,lsr#24
 126         strb    $Thl,[$Xi,#$i+1]
 127         strb    $Thh,[$Xi,#$i]
 128 #endif
 129 ___
 130     $code.="\t".shift(@args)."\n";
 131     $i-=4;
 132   }
 133 }
 134
 135 $code=<<___;
 136 #include "arm_arch.h"
 137
 138 .text
 139 #if defined(__thumb2__) && !defined(__APPLE__)
 140 .syntax unified
 141 .thumb
 142 #else
 143 .code   32
 144 #endif
 145
 146 #ifdef  __clang__
 147 #define ldrplb  ldrbpl
 148 #define ldrneb  ldrbne
 149 #endif
 150
 151 .type   rem_4bit,%object
 152 .align  5
 153 rem_4bit:
 154 .short  0x0000,0x1C20,0x3840,0x2460
 155 .short  0x7080,0x6CA0,0x48C0,0x54E0
 156 .short  0xE100,0xFD20,0xD940,0xC560
 157 .short  0x9180,0x8DA0,0xA9C0,0xB5E0
 158 .size   rem_4bit,.-rem_4bit
 159
 160 .type   rem_4bit_get,%function
 161 rem_4bit_get:
 162 #if defined(__thumb2__)
 163         adr     $rem_4bit,rem_4bit
 164 #else
 165         sub     $rem_4bit,pc,#8+32      @ &rem_4bit
 166 #endif
 167         b       .Lrem_4bit_got
 168         nop
 169         nop
 170 .size   rem_4bit_get,.-rem_4bit_get
 171
 172 .global gcm_ghash_4bit
 173 .type   gcm_ghash_4bit,%function
 174 .align  4
 175 gcm_ghash_4bit:
 176 #if defined(__thumb2__)
 177         adr     r12,rem_4bit
 178 #else
 179         sub     r12,pc,#8+48            @ &rem_4bit
 180 #endif
 181         add     $len,$inp,$len          @ $len to point at the end
 182         stmdb   sp!,{r3-r11,lr}         @ save $len/end too
 183
 184         ldmia   r12,{r4-r11}            @ copy rem_4bit ...
 185         stmdb   sp!,{r4-r11}            @ ... to stack
 186
 187         ldrb    $nlo,[$inp,#15]
 188         ldrb    $nhi,[$Xi,#15]
 189 .Louter:
 190         eor     $nlo,$nlo,$nhi
 191         and     $nhi,$nlo,#0xf0
 192         and     $nlo,$nlo,#0x0f
 193         mov     $cnt,#14
 194
 195         add     $Zhh,$Htbl,$nlo,lsl#4
 196         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 197         add     $Thh,$Htbl,$nhi
 198         ldrb    $nlo,[$inp,#14]
 199
 200         and     $nhi,$Zll,#0xf          @ rem
 201         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 202         add     $nhi,$nhi,$nhi
 203         eor     $Zll,$Tll,$Zll,lsr#4
 204         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
 205         eor     $Zll,$Zll,$Zlh,lsl#28
 206         ldrb    $nhi,[$Xi,#14]
 207         eor     $Zlh,$Tlh,$Zlh,lsr#4
 208         eor     $Zlh,$Zlh,$Zhl,lsl#28
 209         eor     $Zhl,$Thl,$Zhl,lsr#4
 210         eor     $Zhl,$Zhl,$Zhh,lsl#28
 211         eor     $Zhh,$Thh,$Zhh,lsr#4
 212         eor     $nlo,$nlo,$nhi
 213         and     $nhi,$nlo,#0xf0
 214         and     $nlo,$nlo,#0x0f
 215         eor     $Zhh,$Zhh,$Tll,lsl#16
 216
 217 .Linner:
 218         add     $Thh,$Htbl,$nlo,lsl#4
 219         and     $nlo,$Zll,#0xf          @ rem
 220         subs    $cnt,$cnt,#1
 221         add     $nlo,$nlo,$nlo
 222         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 223         eor     $Zll,$Tll,$Zll,lsr#4
 224         eor     $Zll,$Zll,$Zlh,lsl#28
 225         eor     $Zlh,$Tlh,$Zlh,lsr#4
 226         eor     $Zlh,$Zlh,$Zhl,lsl#28
 227         ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
 228         eor     $Zhl,$Thl,$Zhl,lsr#4
 229 #ifdef  __thumb2__
 230         it      pl
 231 #endif
 232         ldrplb  $nlo,[$inp,$cnt]
 233         eor     $Zhl,$Zhl,$Zhh,lsl#28
 234         eor     $Zhh,$Thh,$Zhh,lsr#4
 235
 236         add     $Thh,$Htbl,$nhi
 237         and     $nhi,$Zll,#0xf          @ rem
 238         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 239         add     $nhi,$nhi,$nhi
 240         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 241         eor     $Zll,$Tll,$Zll,lsr#4
 242 #ifdef  __thumb2__
 243         it      pl
 244 #endif
 245         ldrplb  $Tll,[$Xi,$cnt]
 246         eor     $Zll,$Zll,$Zlh,lsl#28
 247         eor     $Zlh,$Tlh,$Zlh,lsr#4
 248         ldrh    $Tlh,[sp,$nhi]
 249         eor     $Zlh,$Zlh,$Zhl,lsl#28
 250         eor     $Zhl,$Thl,$Zhl,lsr#4
 251         eor     $Zhl,$Zhl,$Zhh,lsl#28
 252 #ifdef  __thumb2__
 253         it      pl
 254 #endif
 255         eorpl   $nlo,$nlo,$Tll
 256         eor     $Zhh,$Thh,$Zhh,lsr#4
 257 #ifdef  __thumb2__
 258         itt     pl
 259 #endif
 260         andpl   $nhi,$nlo,#0xf0
 261         andpl   $nlo,$nlo,#0x0f
 262         eor     $Zhh,$Zhh,$Tlh,lsl#16   @ ^= rem_4bit[rem]
 263         bpl     .Linner
 264
 265         ldr     $len,[sp,#32]           @ re-load $len/end
 266         add     $inp,$inp,#16
 267         mov     $nhi,$Zll
 268 ___
 269         &Zsmash("cmp\t$inp,$len","\n".
 270                                  "#ifdef __thumb2__\n".
 271                                  "      it      ne\n".
 272                                  "#endif\n".
 273                                  "      ldrneb  $nlo,[$inp,#15]");
 274 $code.=<<___;
 275         bne     .Louter
 276
 277         add     sp,sp,#36
 278 #if __ARM_ARCH__>=5
 279         ldmia   sp!,{r4-r11,pc}
 280 #else
 281         ldmia   sp!,{r4-r11,lr}
 282         tst     lr,#1
 283         moveq   pc,lr                   @ be binary compatible with V4, yet
 284         bx      lr                      @ interoperable with Thumb ISA:-)
 285 #endif
 286 .size   gcm_ghash_4bit,.-gcm_ghash_4bit
 287
 288 .global gcm_gmult_4bit
 289 .type   gcm_gmult_4bit,%function
 290 gcm_gmult_4bit:
 291         stmdb   sp!,{r4-r11,lr}
 292         ldrb    $nlo,[$Xi,#15]
 293         b       rem_4bit_get
 294 .Lrem_4bit_got:
 295         and     $nhi,$nlo,#0xf0
 296         and     $nlo,$nlo,#0x0f
 297         mov     $cnt,#14
 298
 299         add     $Zhh,$Htbl,$nlo,lsl#4
 300         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 301         ldrb    $nlo,[$Xi,#14]
 302
 303         add     $Thh,$Htbl,$nhi
 304         and     $nhi,$Zll,#0xf          @ rem
 305         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 306         add     $nhi,$nhi,$nhi
 307         eor     $Zll,$Tll,$Zll,lsr#4
 308         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 309         eor     $Zll,$Zll,$Zlh,lsl#28
 310         eor     $Zlh,$Tlh,$Zlh,lsr#4
 311         eor     $Zlh,$Zlh,$Zhl,lsl#28
 312         eor     $Zhl,$Thl,$Zhl,lsr#4
 313         eor     $Zhl,$Zhl,$Zhh,lsl#28
 314         eor     $Zhh,$Thh,$Zhh,lsr#4
 315         and     $nhi,$nlo,#0xf0
 316         eor     $Zhh,$Zhh,$Tll,lsl#16
 317         and     $nlo,$nlo,#0x0f
 318
 319 .Loop:
 320         add     $Thh,$Htbl,$nlo,lsl#4
 321         and     $nlo,$Zll,#0xf          @ rem
 322         subs    $cnt,$cnt,#1
 323         add     $nlo,$nlo,$nlo
 324         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 325         eor     $Zll,$Tll,$Zll,lsr#4
 326         eor     $Zll,$Zll,$Zlh,lsl#28
 327         eor     $Zlh,$Tlh,$Zlh,lsr#4
 328         eor     $Zlh,$Zlh,$Zhl,lsl#28
 329         ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
 330         eor     $Zhl,$Thl,$Zhl,lsr#4
 331 #ifdef  __thumb2__
 332         it      pl
 333 #endif
 334         ldrplb  $nlo,[$Xi,$cnt]
 335         eor     $Zhl,$Zhl,$Zhh,lsl#28
 336         eor     $Zhh,$Thh,$Zhh,lsr#4
 337
 338         add     $Thh,$Htbl,$nhi
 339         and     $nhi,$Zll,#0xf          @ rem
 340         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 341         add     $nhi,$nhi,$nhi
 342         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 343         eor     $Zll,$Tll,$Zll,lsr#4
 344         eor     $Zll,$Zll,$Zlh,lsl#28
 345         eor     $Zlh,$Tlh,$Zlh,lsr#4
 346         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 347         eor     $Zlh,$Zlh,$Zhl,lsl#28
 348         eor     $Zhl,$Thl,$Zhl,lsr#4
 349         eor     $Zhl,$Zhl,$Zhh,lsl#28
 350         eor     $Zhh,$Thh,$Zhh,lsr#4
 351 #ifdef  __thumb2__
 352         itt     pl
 353 #endif
 354         andpl   $nhi,$nlo,#0xf0
 355         andpl   $nlo,$nlo,#0x0f
 356         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 357         bpl     .Loop
 358 ___
 359         &Zsmash();
 360 $code.=<<___;
 361 #if __ARM_ARCH__>=5
 362         ldmia   sp!,{r4-r11,pc}
 363 #else
 364         ldmia   sp!,{r4-r11,lr}
 365         tst     lr,#1
 366         moveq   pc,lr                   @ be binary compatible with V4, yet
 367         bx      lr                      @ interoperable with Thumb ISA:-)
 368 #endif
 369 .size   gcm_gmult_4bit,.-gcm_gmult_4bit
 370 ___
 371 {
 372 my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
 373 my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
 374 my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
 375
 376 sub clmul64x64 {
 377 my ($r,$a,$b)=@_;
 378 $code.=<<___;
 379         vext.8          $t0#lo, $a, $a, #1      @ A1
 380         vmull.p8        $t0, $t0#lo, $b         @ F = A1*B
 381         vext.8          $r#lo, $b, $b, #1       @ B1
 382         vmull.p8        $r, $a, $r#lo           @ E = A*B1
 383         vext.8          $t1#lo, $a, $a, #2      @ A2
 384         vmull.p8        $t1, $t1#lo, $b         @ H = A2*B
 385         vext.8          $t3#lo, $b, $b, #2      @ B2
 386         vmull.p8        $t3, $a, $t3#lo         @ G = A*B2
 387         vext.8          $t2#lo, $a, $a, #3      @ A3
 388         veor            $t0, $t0, $r            @ L = E + F
 389         vmull.p8        $t2, $t2#lo, $b         @ J = A3*B
 390         vext.8          $r#lo, $b, $b, #3       @ B3
 391         veor            $t1, $t1, $t3           @ M = G + H
 392         vmull.p8        $r, $a, $r#lo           @ I = A*B3
 393         veor            $t0#lo, $t0#lo, $t0#hi  @ t0 = (L) (P0 + P1) << 8
 394         vand            $t0#hi, $t0#hi, $k48
 395         vext.8          $t3#lo, $b, $b, #4      @ B4
 396         veor            $t1#lo, $t1#lo, $t1#hi  @ t1 = (M) (P2 + P3) << 16
 397         vand            $t1#hi, $t1#hi, $k32
 398         vmull.p8        $t3, $a, $t3#lo         @ K = A*B4
 399         veor            $t2, $t2, $r            @ N = I + J
 400         veor            $t0#lo, $t0#lo, $t0#hi
 401         veor            $t1#lo, $t1#lo, $t1#hi
 402         veor            $t2#lo, $t2#lo, $t2#hi  @ t2 = (N) (P4 + P5) << 24
 403         vand            $t2#hi, $t2#hi, $k16
 404         vext.8          $t0, $t0, $t0, #15
 405         veor            $t3#lo, $t3#lo, $t3#hi  @ t3 = (K) (P6 + P7) << 32
 406         vmov.i64        $t3#hi, #0
 407         vext.8          $t1, $t1, $t1, #14
 408         veor            $t2#lo, $t2#lo, $t2#hi
 409         vmull.p8        $r, $a, $b              @ D = A*B
 410         vext.8          $t3, $t3, $t3, #12
 411         vext.8          $t2, $t2, $t2, #13
 412         veor            $t0, $t0, $t1
 413         veor            $t2, $t2, $t3
 414         veor            $r, $r, $t0
 415         veor            $r, $r, $t2
 416 ___
 417 }
 418
 419 $code.=<<___;
 420 #if __ARM_MAX_ARCH__>=7
 421 .arch   armv7-a
 422 .fpu    neon
 423
 424 .global gcm_init_neon
 425 .type   gcm_init_neon,%function
 426 .align  4
 427 gcm_init_neon:
 428         vld1.64         $IN#hi,[r1]!            @ load H
 429         vmov.i8         $t0,#0xe1
 430         vld1.64         $IN#lo,[r1]
 431         vshl.i64        $t0#hi,#57
 432         vshr.u64        $t0#lo,#63              @ t0=0xc2....01
 433         vdup.8          $t1,$IN#hi[7]
 434         vshr.u64        $Hlo,$IN#lo,#63
 435         vshr.s8         $t1,#7                  @ broadcast carry bit
 436         vshl.i64        $IN,$IN,#1
 437         vand            $t0,$t0,$t1
 438         vorr            $IN#hi,$Hlo             @ H<<<=1
 439         veor            $IN,$IN,$t0             @ twisted H
 440         vstmia          r0,{$IN}
 441
 442         ret                                     @ bx lr
 443 .size   gcm_init_neon,.-gcm_init_neon
 444
 445 .global gcm_gmult_neon
 446 .type   gcm_gmult_neon,%function
 447 .align  4
 448 gcm_gmult_neon:
 449         vld1.64         $IN#hi,[$Xi]!           @ load Xi
 450         vld1.64         $IN#lo,[$Xi]!
 451         vmov.i64        $k48,#0x0000ffffffffffff
 452         vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
 453         vmov.i64        $k32,#0x00000000ffffffff
 454 #ifdef __ARMEL__
 455         vrev64.8        $IN,$IN
 456 #endif
 457         vmov.i64        $k16,#0x000000000000ffff
 458         veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
 459         mov             $len,#16
 460         b               .Lgmult_neon
 461 .size   gcm_gmult_neon,.-gcm_gmult_neon
 462
 463 .global gcm_ghash_neon
 464 .type   gcm_ghash_neon,%function
 465 .align  4
 466 gcm_ghash_neon:
 467         vld1.64         $Xl#hi,[$Xi]!           @ load Xi
 468         vld1.64         $Xl#lo,[$Xi]!
 469         vmov.i64        $k48,#0x0000ffffffffffff
 470         vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
 471         vmov.i64        $k32,#0x00000000ffffffff
 472 #ifdef __ARMEL__
 473         vrev64.8        $Xl,$Xl
 474 #endif
 475         vmov.i64        $k16,#0x000000000000ffff
 476         veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
 477
 478 .Loop_neon:
 479         vld1.64         $IN#hi,[$inp]!          @ load inp
 480         vld1.64         $IN#lo,[$inp]!
 481 #ifdef __ARMEL__
 482         vrev64.8        $IN,$IN
 483 #endif
 484         veor            $IN,$Xl                 @ inp^=Xi
 485 .Lgmult_neon:
 486 ___
 487         &clmul64x64     ($Xl,$Hlo,"$IN#lo");    # H.lo·Xi.lo
 488 $code.=<<___;
 489         veor            $IN#lo,$IN#lo,$IN#hi    @ Karatsuba pre-processing
 490 ___
 491         &clmul64x64     ($Xm,$Hhl,"$IN#lo");    # (H.lo+H.hi)·(Xi.lo+Xi.hi)
 492         &clmul64x64     ($Xh,$Hhi,"$IN#hi");    # H.hi·Xi.hi
 493 $code.=<<___;
 494         veor            $Xm,$Xm,$Xl             @ Karatsuba post-processing
 495         veor            $Xm,$Xm,$Xh
 496         veor            $Xl#hi,$Xl#hi,$Xm#lo
 497         veor            $Xh#lo,$Xh#lo,$Xm#hi    @ Xh|Xl - 256-bit result
 498
 499         @ equivalent of reduction_avx from ghash-x86_64.pl
 500         vshl.i64        $t1,$Xl,#57             @ 1st phase
 501         vshl.i64        $t2,$Xl,#62
 502         veor            $t2,$t2,$t1             @
 503         vshl.i64        $t1,$Xl,#63
 504         veor            $t2, $t2, $t1           @
 505         veor            $Xl#hi,$Xl#hi,$t2#lo    @
 506         veor            $Xh#lo,$Xh#lo,$t2#hi
 507
 508         vshr.u64        $t2,$Xl,#1              @ 2nd phase
 509         veor            $Xh,$Xh,$Xl
 510         veor            $Xl,$Xl,$t2             @
 511         vshr.u64        $t2,$t2,#6
 512         vshr.u64        $Xl,$Xl,#1              @
 513         veor            $Xl,$Xl,$Xh             @
 514         veor            $Xl,$Xl,$t2             @
 515
 516         subs            $len,#16
 517         bne             .Loop_neon
 518
 519 #ifdef __ARMEL__
 520         vrev64.8        $Xl,$Xl
 521 #endif
 522         sub             $Xi,#16
 523         vst1.64         $Xl#hi,[$Xi]!           @ write out Xi
 524         vst1.64         $Xl#lo,[$Xi]
 525
 526         ret                                     @ bx lr
 527 .size   gcm_ghash_neon,.-gcm_ghash_neon
 528 #endif
 529 ___
 530 }
 531 $code.=<<___;
 532 .asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 533 .align  2
 534 ___
 535
 536 foreach (split("\n",$code)) {
 537         s/\`([^\`]*)\`/eval $1/geo;
 538
 539         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
 540         s/\bret\b/bx    lr/go           or
 541         s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 542
 543         print $_,"\n";
 544 }
 545 close STDOUT; # enforce flush