crypto/sha/asm/keccak1600-armv8.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for ARMv8.
  17 #
  18 # June 2017.
  19 #
  20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
  21 # sense to attempt a SIMD/NEON implementation for the following reason.
  22 # 64-bit lanes of vector registers can't be addressed as easily as in
  23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
  24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
  25 # same processor. Even though it takes more scalar xor's and andn's,
  26 # it gets compensated by availability of rotate. Not to forget that
  27 # most processors achieve higher issue rate with scalar instructions.
  28 #
  29 ######################################################################
  30 # Numbers are cycles per processed byte.
  31 #
  32 #               r=1088(*)
  33 #
  34 # Cortex-A53    13
  35 # Cortex-A57    12
  36 # X-Gene        14
  37 # Mongoose      10
  38 # Denver        7.8
  39 # Apple A7      7.2
  40 #
  41 # (*)   Corresponds to SHA3-256. No improvement coefficients are listed
  42 #       because they vary too much from compiler to compiler. Newer
  43 #       compiler does much better and improvement varies from 5% on
  44 #       Cortex-A57 to 25% on Cortex-A53. While in comparison to older
  45 #       compiler this code is at least 2x faster...
  46
  47 $flavour = shift;
  48 $output  = shift;
  49
  50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  51 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  52 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  53 die "can't locate arm-xlate.pl";
  54
  55 open OUT,"| \"$^X\" $xlate $flavour $output";
  56 *STDOUT=*OUT;
  57
  58 my @A = map { my $base = $_;    # five consecutive x-registers per state row
  59               [ map { sprintf("x%d", $base + $_) } (0 .. 4) ] } (0, 5, 10, 15, 20);
  60 $A[3][3] = "x25";               # row 3 would otherwise claim x18, which is reserved
  61
  62 my @C = map { sprintf("x%d", $_) } (26 .. 28, 30);      # temporaries
  63
  64 my @rhotates = ([ qw(  0  1 62 28 27 ) ],               # rotation amount per lane,
  65                 [ qw( 36 44  6 55 20 ) ],               # indexed like @A
  66                 [ qw(  3 10 43 25 39 ) ],
  67                 [ qw( 41 45 15 21  8 ) ],
  68                 [ qw( 18  2 61 56 14 ) ]);
  69
     # Emits the iotas[] round-constant table followed by KeccakF1600_int,
     # the round loop that KeccakF1600 and SHA3_absorb reach with "bl".
     #
     # iotas: 8 zero quads (64 bytes) of padding plus 24 constants (192
     # bytes) placed after ".align 8". 64 + 192 = 256, so -- taking .align 8
     # to mean 2^8-byte alignment -- the table ends on a 256-byte boundary.
     # The loop fetches one constant per round with post-increment and stops
     # when the low 8 bits of the advanced pointer are zero (tst ...,#255).
     #
     # KeccakF1600_int: operates on the state held in the @A registers and
     # leaves the result there; the callers do the loads and stores. It uses
     # the 32 bytes at [sp] as scratch:
     #   [sp,#0]   A[0][4] and A[1][4], offloaded during Theta
     #   [sp,#16]  current pointer into iotas
     #   [sp,#24]  x30, because x30 also serves as the temporary $C[3]
  70 $code.=<<___;
  71 .text
  72
  73 .align 8        // strategic alignment and padding that allows to use
  74                 // address value as loop termination condition...
  75         .quad   0,0,0,0,0,0,0,0
  76 .type   iotas,%object
  77 iotas:
  78         .quad   0x0000000000000001
  79         .quad   0x0000000000008082
  80         .quad   0x800000000000808a
  81         .quad   0x8000000080008000
  82         .quad   0x000000000000808b
  83         .quad   0x0000000080000001
  84         .quad   0x8000000080008081
  85         .quad   0x8000000000008009
  86         .quad   0x000000000000008a
  87         .quad   0x0000000000000088
  88         .quad   0x0000000080008009
  89         .quad   0x000000008000000a
  90         .quad   0x000000008000808b
  91         .quad   0x800000000000008b
  92         .quad   0x8000000000008089
  93         .quad   0x8000000000008003
  94         .quad   0x8000000000008002
  95         .quad   0x8000000000000080
  96         .quad   0x000000000000800a
  97         .quad   0x800000008000000a
  98         .quad   0x8000000080008081
  99         .quad   0x8000000000008080
 100         .quad   0x0000000080000001
 101         .quad   0x8000000080008008
 102 .size   iotas,.-iotas
 103
 104 .type   KeccakF1600_int,%function
 105 .align  5
 106 KeccakF1600_int:
 107         adr     $C[2],iotas
 108         stp     $C[2],x30,[sp,#16]              // 32 bytes on top are mine
 109         b       .Loop
 110 .align  4
 111 .Loop:
 112         ////////////////////////////////////////// Theta
 113         eor     $C[0],$A[0][0],$A[1][0]
 114         stp     $A[0][4],$A[1][4],[sp,#0]       // offload pair...
 115         eor     $C[1],$A[0][1],$A[1][1]
 116         eor     $C[2],$A[0][2],$A[1][2]
 117         eor     $C[3],$A[0][3],$A[1][3]
 118 ___
             # A[0][4] and A[1][4] were just stored at [sp,#0], so their
             # registers are lent out as two extra temporaries, C[4] and C[5],
             # until the ldp further down reloads the lane values.
 119         $C[4]=$A[0][4];
 120         $C[5]=$A[1][4];
 121 $code.=<<___;
 122         eor     $C[4],$A[0][4],$A[1][4]
 123         eor     $C[0],$C[0],$A[2][0]
 124         eor     $C[1],$C[1],$A[2][1]
 125         eor     $C[2],$C[2],$A[2][2]
 126         eor     $C[3],$C[3],$A[2][3]
 127         eor     $C[4],$C[4],$A[2][4]
 128         eor     $C[0],$C[0],$A[3][0]
 129         eor     $C[1],$C[1],$A[3][1]
 130         eor     $C[2],$C[2],$A[3][2]
 131         eor     $C[3],$C[3],$A[3][3]
 132         eor     $C[4],$C[4],$A[3][4]
 133         eor     $C[0],$C[0],$A[4][0]
 134         eor     $C[2],$C[2],$A[4][2]
 135         eor     $C[1],$C[1],$A[4][1]
 136         eor     $C[3],$C[3],$A[4][3]
 137         eor     $C[4],$C[4],$A[4][4]
 138
 139         eor     $C[5],$C[0],$C[2],ror#63
 140
 141         eor     $A[0][1],$A[0][1],$C[5]
 142         eor     $A[1][1],$A[1][1],$C[5]
 143         eor     $A[2][1],$A[2][1],$C[5]
 144         eor     $A[3][1],$A[3][1],$C[5]
 145         eor     $A[4][1],$A[4][1],$C[5]
 146
 147         eor     $C[5],$C[1],$C[3],ror#63
 148         eor     $C[2],$C[2],$C[4],ror#63
 149         eor     $C[3],$C[3],$C[0],ror#63
 150         eor     $C[4],$C[4],$C[1],ror#63
 151
 152         eor     $C[1],   $A[0][2],$C[5]         // mov  $C[1],$A[0][2]
 153         eor     $A[1][2],$A[1][2],$C[5]
 154         eor     $A[2][2],$A[2][2],$C[5]
 155         eor     $A[3][2],$A[3][2],$C[5]
 156         eor     $A[4][2],$A[4][2],$C[5]
 157
 158         eor     $A[0][0],$A[0][0],$C[4]
 159         eor     $A[1][0],$A[1][0],$C[4]
 160         eor     $A[2][0],$A[2][0],$C[4]
 161         eor     $A[3][0],$A[3][0],$C[4]
 162         eor     $A[4][0],$A[4][0],$C[4]
 163 ___
             # The borrowed registers go back to A[0][4]/A[1][4]: drop the
             # aliases so no later template line can name C[4] or C[5]. The
             # next emitted instruction reloads both lanes from [sp,#0].
 164         $C[4]=undef;
 165         $C[5]=undef;
 166 $code.=<<___;
 167         ldp     $A[0][4],$A[1][4],[sp,#0]       // re-load offloaded data
 168         eor     $C[0],   $A[0][3],$C[2]         // mov  $C[0],$A[0][3]
 169         eor     $A[1][3],$A[1][3],$C[2]
 170         eor     $A[2][3],$A[2][3],$C[2]
 171         eor     $A[3][3],$A[3][3],$C[2]
 172         eor     $A[4][3],$A[4][3],$C[2]
 173
 174         eor     $C[2],   $A[0][4],$C[3]         // mov  $C[2],$A[0][4]
 175         eor     $A[1][4],$A[1][4],$C[3]
 176         eor     $A[2][4],$A[2][4],$C[3]
 177         eor     $A[3][4],$A[3][4],$C[3]
 178         eor     $A[4][4],$A[4][4],$C[3]
 179
 180         ////////////////////////////////////////// Rho+Pi
 181         mov     $C[3],$A[0][1]
 182         ror     $A[0][1],$A[1][1],#64-$rhotates[1][1]
 183         //mov   $C[1],$A[0][2]
 184         ror     $A[0][2],$A[2][2],#64-$rhotates[2][2]
 185         //mov   $C[0],$A[0][3]
 186         ror     $A[0][3],$A[3][3],#64-$rhotates[3][3]
 187         //mov   $C[2],$A[0][4]
 188         ror     $A[0][4],$A[4][4],#64-$rhotates[4][4]
 189
 190         ror     $A[1][1],$A[1][4],#64-$rhotates[1][4]
 191         ror     $A[2][2],$A[2][3],#64-$rhotates[2][3]
 192         ror     $A[3][3],$A[3][2],#64-$rhotates[3][2]
 193         ror     $A[4][4],$A[4][1],#64-$rhotates[4][1]
 194
 195         ror     $A[1][4],$A[4][2],#64-$rhotates[4][2]
 196         ror     $A[2][3],$A[3][4],#64-$rhotates[3][4]
 197         ror     $A[3][2],$A[2][1],#64-$rhotates[2][1]
 198         ror     $A[4][1],$A[1][3],#64-$rhotates[1][3]
 199
 200         ror     $A[4][2],$A[2][4],#64-$rhotates[2][4]
 201         ror     $A[3][4],$A[4][3],#64-$rhotates[4][3]
 202         ror     $A[2][1],$A[1][2],#64-$rhotates[1][2]
 203         ror     $A[1][3],$A[3][1],#64-$rhotates[3][1]
 204
 205         ror     $A[2][4],$A[4][0],#64-$rhotates[4][0]
 206         ror     $A[4][3],$A[3][0],#64-$rhotates[3][0]
 207         ror     $A[1][2],$A[2][0],#64-$rhotates[2][0]
 208         ror     $A[3][1],$A[1][0],#64-$rhotates[1][0]
 209
 210         ror     $A[1][0],$C[0],#64-$rhotates[0][3]
 211         ror     $A[2][0],$C[3],#64-$rhotates[0][1]
 212         ror     $A[3][0],$C[2],#64-$rhotates[0][4]
 213         ror     $A[4][0],$C[1],#64-$rhotates[0][2]
 214
 215         ////////////////////////////////////////// Chi+Iota
 216         bic     $C[0],$A[0][2],$A[0][1]
 217         bic     $C[1],$A[0][3],$A[0][2]
 218         bic     $C[2],$A[0][0],$A[0][4]
 219         bic     $C[3],$A[0][1],$A[0][0]
 220         eor     $A[0][0],$A[0][0],$C[0]
 221         bic     $C[0],$A[0][4],$A[0][3]
 222         eor     $A[0][1],$A[0][1],$C[1]
 223          ldr    $C[1],[sp,#16]
 224         eor     $A[0][3],$A[0][3],$C[2]
 225         eor     $A[0][4],$A[0][4],$C[3]
 226         eor     $A[0][2],$A[0][2],$C[0]
 227          ldr    $C[3],[$C[1]],#8                // Iota[i++]
 228
 229         bic     $C[0],$A[1][2],$A[1][1]
 230          tst    $C[1],#255                      // are we done?
 231          str    $C[1],[sp,#16]
 232         bic     $C[1],$A[1][3],$A[1][2]
 233         bic     $C[2],$A[1][0],$A[1][4]
 234          eor    $A[0][0],$A[0][0],$C[3]         // A[0][0] ^= Iota
 235         bic     $C[3],$A[1][1],$A[1][0]
 236         eor     $A[1][0],$A[1][0],$C[0]
 237         bic     $C[0],$A[1][4],$A[1][3]
 238         eor     $A[1][1],$A[1][1],$C[1]
 239         eor     $A[1][3],$A[1][3],$C[2]
 240         eor     $A[1][4],$A[1][4],$C[3]
 241         eor     $A[1][2],$A[1][2],$C[0]
 242
 243         bic     $C[0],$A[2][2],$A[2][1]
 244         bic     $C[1],$A[2][3],$A[2][2]
 245         bic     $C[2],$A[2][0],$A[2][4]
 246         bic     $C[3],$A[2][1],$A[2][0]
 247         eor     $A[2][0],$A[2][0],$C[0]
 248         bic     $C[0],$A[2][4],$A[2][3]
 249         eor     $A[2][1],$A[2][1],$C[1]
 250         eor     $A[2][3],$A[2][3],$C[2]
 251         eor     $A[2][4],$A[2][4],$C[3]
 252         eor     $A[2][2],$A[2][2],$C[0]
 253
 254         bic     $C[0],$A[3][2],$A[3][1]
 255         bic     $C[1],$A[3][3],$A[3][2]
 256         bic     $C[2],$A[3][0],$A[3][4]
 257         bic     $C[3],$A[3][1],$A[3][0]
 258         eor     $A[3][0],$A[3][0],$C[0]
 259         bic     $C[0],$A[3][4],$A[3][3]
 260         eor     $A[3][1],$A[3][1],$C[1]
 261         eor     $A[3][3],$A[3][3],$C[2]
 262         eor     $A[3][4],$A[3][4],$C[3]
 263         eor     $A[3][2],$A[3][2],$C[0]
 264
 265         bic     $C[0],$A[4][2],$A[4][1]
 266         bic     $C[1],$A[4][3],$A[4][2]
 267         bic     $C[2],$A[4][0],$A[4][4]
 268         bic     $C[3],$A[4][1],$A[4][0]
 269         eor     $A[4][0],$A[4][0],$C[0]
 270         bic     $C[0],$A[4][4],$A[4][3]
 271         eor     $A[4][1],$A[4][1],$C[1]
 272         eor     $A[4][3],$A[4][3],$C[2]
 273         eor     $A[4][4],$A[4][4],$C[3]
 274         eor     $A[4][2],$A[4][2],$C[0]
 275
 276         bne     .Loop
 277
 278         ldr     x30,[sp,#24]
 279         ret
 280 .size   KeccakF1600_int,.-KeccakF1600_int
 281
 282 .type   KeccakF1600,%function
 283 .align  5
 284 KeccakF1600:
 285         stp     x29,x30,[sp,#-128]!             // frame: fp/lr, then x19-x28
 286         mov     x29,sp
 287         stp     x27,x28,[sp,#80]
 288         stp     x25,x26,[sp,#64]
 289         stp     x23,x24,[sp,#48]
 290         stp     x21,x22,[sp,#32]
 291         stp     x19,x20,[sp,#16]
 292         sub     sp,sp,#48                       // 32 bytes for KeccakF1600_int + 16 of ours
 293
 294         mov     $C[0],x0                        // x0 is about to be loaded with A[0][0]
 295         str     x0,[sp,#32]                     // keep the state pointer for the write-back
 296         ldp     $A[0][0],$A[0][1],[$C[0],#16*0]
 297         ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
 298         ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
 299         ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
 300         ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
 301         ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
 302         ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
 303         ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
 304         ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
 305         ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
 306         ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
 307         ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
 308         ldr     $A[4][4],[$C[0],#16*12]
 309
 310         bl      KeccakF1600_int                 // permute the 25 lanes in registers
 311
 312         ldr     $C[0],[sp,#32]                  // state pointer saved above
 313         stp     $A[0][0],$A[0][1],[$C[0],#16*0]
 314         stp     $A[0][2],$A[0][3],[$C[0],#16*1]
 315         stp     $A[0][4],$A[1][0],[$C[0],#16*2]
 316         stp     $A[1][1],$A[1][2],[$C[0],#16*3]
 317         stp     $A[1][3],$A[1][4],[$C[0],#16*4]
 318         stp     $A[2][0],$A[2][1],[$C[0],#16*5]
 319         stp     $A[2][2],$A[2][3],[$C[0],#16*6]
 320         stp     $A[2][4],$A[3][0],[$C[0],#16*7]
 321         stp     $A[3][1],$A[3][2],[$C[0],#16*8]
 322         stp     $A[3][3],$A[3][4],[$C[0],#16*9]
 323         stp     $A[4][0],$A[4][1],[$C[0],#16*10]
 324         stp     $A[4][2],$A[4][3],[$C[0],#16*11]
 325         str     $A[4][4],[$C[0],#16*12]
 326
 327         add     sp,sp,#48                       // sp is level with x29 again
 328         ldp     x27,x28,[x29,#80]
 329         ldp     x25,x26,[x29,#64]
 330         ldp     x23,x24,[x29,#48]
 331         ldp     x21,x22,[x29,#32]
 332         ldp     x19,x20,[x29,#16]
 333         ldp     x29,x30,[sp],#128
 334         ret
 335 .size   KeccakF1600,.-KeccakF1600
 336
 337 .globl  SHA3_absorb
 338 .type   SHA3_absorb,%function
 339 .align  5
 340 SHA3_absorb:
 341         stp     x29,x30,[sp,#-128]!
 342         add     x29,sp,#0
 343         stp     x19,x20,[sp,#16]
 344         stp     x21,x22,[sp,#32]
 345         stp     x23,x24,[sp,#48]
 346         stp     x25,x26,[sp,#64]
 347         stp     x27,x28,[sp,#80]
 348         sub     sp,sp,#64
 349
 350         stp     x0,x1,[sp,#32]                  // offload arguments
 351         stp     x2,x3,[sp,#48]
 352
 353         mov     $C[0],x0                        // uint64_t A[5][5]
 354         mov     $C[1],x1                        // const void *inp
 355         mov     $C[2],x2                        // size_t len
 356         mov     $C[3],x3                        // size_t bsz
 357         ldp     $A[0][0],$A[0][1],[$C[0],#16*0]
 358         ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
 359         ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
 360         ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
 361         ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
 362         ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
 363         ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
 364         ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
 365         ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
 366         ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
 367         ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
 368         ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
 369         ldr     $A[4][4],[$C[0],#16*12]
 370         b       .Loop_absorb
 371
 372 .align  4
 373 .Loop_absorb:
 374         subs    $C[0],$C[2],$C[3]               // len - bsz
 375         blo     .Labsorbed
 376
 377         str     $C[0],[sp,#48]                  // save len - bsz
 378         ldr     $C[0],[$C[1]],#8                // *inp++
 379 #ifdef  __AARCH64EB__
 380         rev     $C[0],$C[0]
 381 #endif
 382         eor     $A[0][0],$A[0][0],$C[0]
 383         cmp     $C[3],#8*2
 384         blo     .Lprocess_block
 385         ldr     $C[0],[$C[1]],#8                // *inp++
 386 #ifdef  __AARCH64EB__
 387         rev     $C[0],$C[0]
 388 #endif
 389         eor     $A[0][1],$A[0][1],$C[0]
 390         beq     .Lprocess_block
 391         ldr     $C[0],[$C[1]],#8                // *inp++
 392 #ifdef  __AARCH64EB__
 393         rev     $C[0],$C[0]
 394 #endif
 395         eor     $A[0][2],$A[0][2],$C[0]
 396         cmp     $C[3],#8*4
 397         blo     .Lprocess_block
 398         ldr     $C[0],[$C[1]],#8                // *inp++
 399 #ifdef  __AARCH64EB__
 400         rev     $C[0],$C[0]
 401 #endif
 402         eor     $A[0][3],$A[0][3],$C[0]
 403         beq     .Lprocess_block
 404         ldr     $C[0],[$C[1]],#8                // *inp++
 405 #ifdef  __AARCH64EB__
 406         rev     $C[0],$C[0]
 407 #endif
 408         eor     $A[0][4],$A[0][4],$C[0]
 409         cmp     $C[3],#8*6
 410         blo     .Lprocess_block
 411         ldr     $C[0],[$C[1]],#8                // *inp++
 412 #ifdef  __AARCH64EB__
 413         rev     $C[0],$C[0]
 414 #endif
 415         eor     $A[1][0],$A[1][0],$C[0]
 416         beq     .Lprocess_block
 417         ldr     $C[0],[$C[1]],#8                // *inp++
 418 #ifdef  __AARCH64EB__
 419         rev     $C[0],$C[0]
 420 #endif
 421         eor     $A[1][1],$A[1][1],$C[0]
 422         cmp     $C[3],#8*8
 423         blo     .Lprocess_block
 424         ldr     $C[0],[$C[1]],#8                // *inp++
 425 #ifdef  __AARCH64EB__
 426         rev     $C[0],$C[0]
 427 #endif
 428         eor     $A[1][2],$A[1][2],$C[0]
 429         beq     .Lprocess_block
 430         ldr     $C[0],[$C[1]],#8                // *inp++
 431 #ifdef  __AARCH64EB__
 432         rev     $C[0],$C[0]
 433 #endif
 434         eor     $A[1][3],$A[1][3],$C[0]
 435         cmp     $C[3],#8*10
 436         blo     .Lprocess_block
 437         ldr     $C[0],[$C[1]],#8                // *inp++
 438 #ifdef  __AARCH64EB__
 439         rev     $C[0],$C[0]
 440 #endif
 441         eor     $A[1][4],$A[1][4],$C[0]
 442         beq     .Lprocess_block
 443         ldr     $C[0],[$C[1]],#8                // *inp++
 444 #ifdef  __AARCH64EB__
 445         rev     $C[0],$C[0]
 446 #endif
 447         eor     $A[2][0],$A[2][0],$C[0]
 448         cmp     $C[3],#8*12
 449         blo     .Lprocess_block
 450         ldr     $C[0],[$C[1]],#8                // *inp++
 451 #ifdef  __AARCH64EB__
 452         rev     $C[0],$C[0]
 453 #endif
 454         eor     $A[2][1],$A[2][1],$C[0]
 455         beq     .Lprocess_block
 456         ldr     $C[0],[$C[1]],#8                // *inp++
 457 #ifdef  __AARCH64EB__
 458         rev     $C[0],$C[0]
 459 #endif
 460         eor     $A[2][2],$A[2][2],$C[0]
 461         cmp     $C[3],#8*14
 462         blo     .Lprocess_block
 463         ldr     $C[0],[$C[1]],#8                // *inp++
 464 #ifdef  __AARCH64EB__
 465         rev     $C[0],$C[0]
 466 #endif
 467         eor     $A[2][3],$A[2][3],$C[0]
 468         beq     .Lprocess_block
 469         ldr     $C[0],[$C[1]],#8                // *inp++
 470 #ifdef  __AARCH64EB__
 471         rev     $C[0],$C[0]
 472 #endif
 473         eor     $A[2][4],$A[2][4],$C[0]
 474         cmp     $C[3],#8*16
 475         blo     .Lprocess_block
 476         ldr     $C[0],[$C[1]],#8                // *inp++
 477 #ifdef  __AARCH64EB__
 478         rev     $C[0],$C[0]
 479 #endif
 480         eor     $A[3][0],$A[3][0],$C[0]
 481         beq     .Lprocess_block
 482         ldr     $C[0],[$C[1]],#8                // *inp++
 483 #ifdef  __AARCH64EB__
 484         rev     $C[0],$C[0]
 485 #endif
 486         eor     $A[3][1],$A[3][1],$C[0]
 487         cmp     $C[3],#8*18
 488         blo     .Lprocess_block
 489         ldr     $C[0],[$C[1]],#8                // *inp++
 490 #ifdef  __AARCH64EB__
 491         rev     $C[0],$C[0]
 492 #endif
 493         eor     $A[3][2],$A[3][2],$C[0]
 494         beq     .Lprocess_block
 495         ldr     $C[0],[$C[1]],#8                // *inp++
 496 #ifdef  __AARCH64EB__
 497         rev     $C[0],$C[0]
 498 #endif
 499         eor     $A[3][3],$A[3][3],$C[0]
 500         cmp     $C[3],#8*20
 501         blo     .Lprocess_block
 502         ldr     $C[0],[$C[1]],#8                // *inp++
 503 #ifdef  __AARCH64EB__
 504         rev     $C[0],$C[0]
 505 #endif
 506         eor     $A[3][4],$A[3][4],$C[0]
 507         beq     .Lprocess_block
 508         ldr     $C[0],[$C[1]],#8                // *inp++
 509 #ifdef  __AARCH64EB__
 510         rev     $C[0],$C[0]
 511 #endif
 512         eor     $A[4][0],$A[4][0],$C[0]
 513         cmp     $C[3],#8*22
 514         blo     .Lprocess_block
 515         ldr     $C[0],[$C[1]],#8                // *inp++
 516 #ifdef  __AARCH64EB__
 517         rev     $C[0],$C[0]
 518 #endif
 519         eor     $A[4][1],$A[4][1],$C[0]
 520         beq     .Lprocess_block
 521         ldr     $C[0],[$C[1]],#8                // *inp++
 522 #ifdef  __AARCH64EB__
 523         rev     $C[0],$C[0]
 524 #endif
 525         eor     $A[4][2],$A[4][2],$C[0]
 526         cmp     $C[3],#8*24
 527         blo     .Lprocess_block
 528         ldr     $C[0],[$C[1]],#8                // *inp++
 529 #ifdef  __AARCH64EB__
 530         rev     $C[0],$C[0]
 531 #endif
 532         eor     $A[4][3],$A[4][3],$C[0]
 533         beq     .Lprocess_block
 534         ldr     $C[0],[$C[1]],#8                // *inp++
 535 #ifdef  __AARCH64EB__
 536         rev     $C[0],$C[0]
 537 #endif
 538         eor     $A[4][4],$A[4][4],$C[0]
 539
 540 .Lprocess_block:
 541         str     $C[1],[sp,#40]                  // save inp
 542
 543         bl      KeccakF1600_int
 544
 545         ldr     $C[1],[sp,#40]                  // restore arguments
 546         ldp     $C[2],$C[3],[sp,#48]
 547         b       .Loop_absorb
 548
 549 .align  4
 550 .Labsorbed:
 551         ldr     $C[1],[sp,#32]
 552         stp     $A[0][0],$A[0][1],[$C[1],#16*0]
 553         stp     $A[0][2],$A[0][3],[$C[1],#16*1]
 554         stp     $A[0][4],$A[1][0],[$C[1],#16*2]
 555         stp     $A[1][1],$A[1][2],[$C[1],#16*3]
 556         stp     $A[1][3],$A[1][4],[$C[1],#16*4]
 557         stp     $A[2][0],$A[2][1],[$C[1],#16*5]
 558         stp     $A[2][2],$A[2][3],[$C[1],#16*6]
 559         stp     $A[2][4],$A[3][0],[$C[1],#16*7]
 560         stp     $A[3][1],$A[3][2],[$C[1],#16*8]
 561         stp     $A[3][3],$A[3][4],[$C[1],#16*9]
 562         stp     $A[4][0],$A[4][1],[$C[1],#16*10]
 563         stp     $A[4][2],$A[4][3],[$C[1],#16*11]
 564         str     $A[4][4],[$C[1],#16*12]
 565
 566         mov     x0,$C[0]                        // return value
 567         ldp     x19,x20,[x29,#16]
 568         add     sp,sp,#64
 569         ldp     x21,x22,[x29,#32]
 570         ldp     x23,x24,[x29,#48]
 571         ldp     x25,x26,[x29,#64]
 572         ldp     x27,x28,[x29,#80]
 573         ldp     x29,x30,[sp],#128
 574         ret
 575 .size   SHA3_absorb,.-SHA3_absorb
 576 ___
 577 {
     # SHA3_squeeze: x0 = state, x1 = output buffer, x2 = number of bytes to
     # produce, x3 = block size. The arguments are parked in x19-x22 because
     # KeccakF1600 is called whenever a whole block has been copied out.
     # Lanes are written least-significant byte first; a final partial lane
     # is stored byte by byte. A zero-length request returns at once; before
     # that guard existed it fell into the byte tail and stored 7 bytes.
     # NOTE(review): the block countdown moves in 8-byte steps, so the block
     # size is presumably a multiple of 8 -- confirm against callers.
 578 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
 579 $code.=<<___;
 580 .globl  SHA3_squeeze
 581 .type   SHA3_squeeze,%function
 582 .align  5
 583 SHA3_squeeze:
 584         stp     x29,x30,[sp,#-48]!
 585         add     x29,sp,#0
 586         stp     x19,x20,[sp,#16]
 587         stp     x21,x22,[sp,#32]
 588
 589         mov     $A_flat,x0                      // put aside arguments
 590         mov     $out,x1
 591         mov     $len,x2
 592         mov     $bsz,x3
             cbz     $len,.Lsqueeze_done             // nothing requested, store nothing
 593
 594 .Loop_squeeze:
 595         ldr     x4,[x0],#8                      // next lane of the state
 596         cmp     $len,#8
 597         blo     .Lsqueeze_tail                  // fewer than 8 bytes wanted
 598 #ifdef  __AARCH64EB__
 599         rev     x4,x4
 600 #endif
 601         str     x4,[$out],#8
 602         subs    $len,$len,#8
 603         beq     .Lsqueeze_done
 604
 605         subs    x3,x3,#8                        // bytes left in this block
 606         bhi     .Loop_squeeze
 607
 608         mov     x0,$A_flat                      // block exhausted: permute again
 609         bl      KeccakF1600
 610         mov     x0,$A_flat                      // rewind lane pointer and block counter
 611         mov     x3,$bsz
 612         b       .Loop_squeeze
 613
 614 .align  4
 615 .Lsqueeze_tail:                                 // 1..7 bytes, low byte of x4 first
 616         strb    w4,[$out],#1
 617         lsr     x4,x4,#8
 618         subs    $len,$len,#1
 619         beq     .Lsqueeze_done
 620         strb    w4,[$out],#1
 621         lsr     x4,x4,#8
 622         subs    $len,$len,#1
 623         beq     .Lsqueeze_done
 624         strb    w4,[$out],#1
 625         lsr     x4,x4,#8
 626         subs    $len,$len,#1
 627         beq     .Lsqueeze_done
 628         strb    w4,[$out],#1
 629         lsr     x4,x4,#8
 630         subs    $len,$len,#1
 631         beq     .Lsqueeze_done
 632         strb    w4,[$out],#1
 633         lsr     x4,x4,#8
 634         subs    $len,$len,#1
 635         beq     .Lsqueeze_done
 636         strb    w4,[$out],#1
 637         lsr     x4,x4,#8
 638         subs    $len,$len,#1
 639         beq     .Lsqueeze_done
 640         strb    w4,[$out],#1
 641
 642 .Lsqueeze_done:
 643         ldp     x19,x20,[sp,#16]
 644         ldp     x21,x22,[sp,#32]
 645         ldp     x29,x30,[sp],#48
 646         ret
 647 .size   SHA3_squeeze,.-SHA3_squeeze
 648 .asciz  "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 649 ___
 650 }
 651
 652 print $code;
 653 close STDOUT;