crypto/sha/asm/keccak1600-armv8.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for ARMv8.
  17 #
  18 # June 2017.
  19 #
  20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
  21 # sense to attempt a SIMD/NEON implementation for the following reason.
  22 # 64-bit lanes of vector registers can't be addressed as easily as in
  23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
  24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
  25 # same processor. Even though it takes more scalar xor's and andn's,
  26 # it gets compensated by availability of rotate. Not to forget that
  27 # most processors achieve higher issue rate with scalar instructions.
  28 #
  29 ######################################################################
  30 # Numbers are cycles per processed byte.
  31 #
  32 #               r=1088(*)
  33 #
  34 # Cortex-A53    13
  35 # Cortex-A57    12
  36 # X-Gene        14
  37 # Mongoose      10
  38 # Kryo          12
  39 # Denver        7.8
  40 # Apple A7      7.2
  41 #
  42 # (*)   Corresponds to SHA3-256. No improvement coefficients are listed
  43 #       because they vary too much from compiler to compiler. Newer
  44 #       compiler does much better and improvement varies from 5% on
  45 #       Cortex-A57 to 25% on Cortex-A53. While in comparison to older
  46 #       compiler this code is at least 2x faster...
  47
  # Command line: <flavour> <output>.  Both values are passed on unchanged to
  # arm-xlate.pl on the command line built below.
  48 $flavour = shift;
  49 $output  = shift;
  50
  # Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
  # relative to this script's directory.
  51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  52 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  53 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  54 die "can't locate arm-xlate.pl";
  55
  # From here on everything printed to STDOUT is piped through arm-xlate.pl.
  # Abort if the pipe cannot be created; otherwise the generated code would be
  # lost without any diagnostic.
  56 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
  57 *STDOUT=*OUT;
  58
  # @A: register names holding the 5x5 state.  Row r starts at x(5*r) and
  # runs over five consecutive registers, with one exception patched below.
  59 my @A = map { my $first = 5*$_;
  60               [ map { sprintf("x%d", $first+$_) } 0..4 ] } 0..4;
  61 $A[3][3] = "x25";               # x18 is reserved
  62
  # @C: the four scratch registers used by the round templates.
  63 my @C = qw(x26 x27 x28 x30);
  64
  # @rhotates: per-lane rotation amounts; the Rho+Pi templates emit them as
  # "#64-$rhotates[i][j]" operands of ror.
  65 my @rhotates = ( [  0,  1, 62, 28, 27 ],
  66                  [ 36, 44,  6, 55, 20 ],
  67                  [  3, 10, 43, 25, 39 ],
  68                  [ 41, 45, 15, 21,  8 ],
  69                  [ 18,  2, 61, 56, 14 ] );
  70
  # Start of the generated assembly: the `iotas` table, the entry of
  # KeccakF1600_int and the first part of its Theta step.
  #
  # iotas holds the 24 values that .Loop XORs into A[0][0], one per pass
  # ("Iota[i++]").  Eight zero quads (64 bytes) precede the 24 entries
  # (192 bytes) after the `.align 8`, so -- assuming `.align 8` means
  # 256-byte alignment here (TODO confirm for every assembler flavour) -- the
  # address just past the last entry has its low 8 bits clear.  .Loop relies
  # on that: it post-increments the table pointer and leaves the loop once
  # `tst ...,#255` finds those bits all zero.
  #
  # KeccakF1600_int expects the 25 lanes in the registers named by @A and
  # uses 32 bytes at [sp,#0]: #0/#8 for an offloaded lane pair, #16 for the
  # iotas pointer and #24 for x30 (x30 is also $C[3], so it has to be saved).
  71 $code.=<<___;
  72 .text
  73
  74 .align 8        // strategic alignment and padding that allows to use
  75                 // address value as loop termination condition...
  76         .quad   0,0,0,0,0,0,0,0
  77 .type   iotas,%object
  78 iotas:
  79         .quad   0x0000000000000001
  80         .quad   0x0000000000008082
  81         .quad   0x800000000000808a
  82         .quad   0x8000000080008000
  83         .quad   0x000000000000808b
  84         .quad   0x0000000080000001
  85         .quad   0x8000000080008081
  86         .quad   0x8000000000008009
  87         .quad   0x000000000000008a
  88         .quad   0x0000000000000088
  89         .quad   0x0000000080008009
  90         .quad   0x000000008000000a
  91         .quad   0x000000008000808b
  92         .quad   0x800000000000008b
  93         .quad   0x8000000000008089
  94         .quad   0x8000000000008003
  95         .quad   0x8000000000008002
  96         .quad   0x8000000000000080
  97         .quad   0x000000000000800a
  98         .quad   0x800000008000000a
  99         .quad   0x8000000080008081
 100         .quad   0x8000000000008080
 101         .quad   0x0000000080000001
 102         .quad   0x8000000080008008
 103 .size   iotas,.-iotas
 104
 105 .type   KeccakF1600_int,%function
 106 .align  5
 107 KeccakF1600_int:
 108         adr     $C[2],iotas
 109         stp     $C[2],x30,[sp,#16]              // 32 bytes on top are mine
 110         b       .Loop
 111 .align  4
 112 .Loop:
 113         ////////////////////////////////////////// Theta
 114         eor     $C[0],$A[0][0],$A[1][0]
 115         stp     $A[0][4],$A[1][4],[sp,#0]       // offload pair...
 116         eor     $C[1],$A[0][1],$A[1][1]
 117         eor     $C[2],$A[0][2],$A[1][2]
 118         eor     $C[3],$A[0][3],$A[1][3]
 119 ___
         # $C[4] and $C[5] are not additional registers: they alias the
         # registers of A[0][4] and A[1][4], whose contents the stp above has
         # just saved at [sp,#0], so the next template may overwrite them.
 120         $C[4]=$A[0][4];
 121         $C[5]=$A[1][4];
 122 $code.=<<___;
 123         eor     $C[4],$A[0][4],$A[1][4]
 124         eor     $C[0],$C[0],$A[2][0]
 125         eor     $C[1],$C[1],$A[2][1]
 126         eor     $C[2],$C[2],$A[2][2]
 127         eor     $C[3],$C[3],$A[2][3]
 128         eor     $C[4],$C[4],$A[2][4]
 129         eor     $C[0],$C[0],$A[3][0]
 130         eor     $C[1],$C[1],$A[3][1]
 131         eor     $C[2],$C[2],$A[3][2]
 132         eor     $C[3],$C[3],$A[3][3]
 133         eor     $C[4],$C[4],$A[3][4]
 134         eor     $C[0],$C[0],$A[4][0]
 135         eor     $C[2],$C[2],$A[4][2]
 136         eor     $C[1],$C[1],$A[4][1]
 137         eor     $C[3],$C[3],$A[4][3]
 138         eor     $C[4],$C[4],$A[4][4]
 139
 140         eor     $C[5],$C[0],$C[2],ror#63
 141
 142         eor     $A[0][1],$A[0][1],$C[5]
 143         eor     $A[1][1],$A[1][1],$C[5]
 144         eor     $A[2][1],$A[2][1],$C[5]
 145         eor     $A[3][1],$A[3][1],$C[5]
 146         eor     $A[4][1],$A[4][1],$C[5]
 147
 148         eor     $C[5],$C[1],$C[3],ror#63
 149         eor     $C[2],$C[2],$C[4],ror#63
 150         eor     $C[3],$C[3],$C[0],ror#63
 151         eor     $C[4],$C[4],$C[1],ror#63
 152
 153         eor     $C[1],   $A[0][2],$C[5]         // mov  $C[1],$A[0][2]
 154         eor     $A[1][2],$A[1][2],$C[5]
 155         eor     $A[2][2],$A[2][2],$C[5]
 156         eor     $A[3][2],$A[3][2],$C[5]
 157         eor     $A[4][2],$A[4][2],$C[5]
 158
 159         eor     $A[0][0],$A[0][0],$C[4]
 160         eor     $A[1][0],$A[1][0],$C[4]
 161         eor     $A[2][0],$A[2][0],$C[4]
 162         eor     $A[3][0],$A[3][0],$C[4]
 163         eor     $A[4][0],$A[4][0],$C[4]
 164 ___
         # Retire the aliases; the template that follows starts by re-loading
         # A[0][4]/A[1][4] from [sp,#0] and only uses $C[0..3] afterwards.
 165         $C[4]=undef;
 166         $C[5]=undef;
 # Remainder of KeccakF1600_int (rest of Theta, Rho+Pi, Chi+Iota and the loop
 # test), the whole of KeccakF1600, and SHA3_absorb up to the point where the
 # unrolled per-lane absorb code is generated by the Perl loop that follows.
 #
 # - KeccakF1600_int reloads the iotas pointer from [sp,#16], post-increments
 #   it while fetching the round value and loops until `tst ...,#255` sets Z.
 #   It returns via `ldr x30,[sp,#24]; ret` because x30 doubles as $C[3].
 # - KeccakF1600 and SHA3_absorb both leave the 32 bytes at [sp,#0] to
 #   KeccakF1600_int and keep their own saved arguments at [sp,#32] and up.
 # - SHA3_absorb's arguments, per the comments on the moves below:
 #   x0 = uint64_t A[5][5], x1 = const void *inp, x2 = size_t len,
 #   x3 = size_t bsz.
 167 $code.=<<___;
 168         ldp     $A[0][4],$A[1][4],[sp,#0]       // re-load offloaded data
 169         eor     $C[0],   $A[0][3],$C[2]         // mov  $C[0],$A[0][3]
 170         eor     $A[1][3],$A[1][3],$C[2]
 171         eor     $A[2][3],$A[2][3],$C[2]
 172         eor     $A[3][3],$A[3][3],$C[2]
 173         eor     $A[4][3],$A[4][3],$C[2]
 174
 175         eor     $C[2],   $A[0][4],$C[3]         // mov  $C[2],$A[0][4]
 176         eor     $A[1][4],$A[1][4],$C[3]
 177         eor     $A[2][4],$A[2][4],$C[3]
 178         eor     $A[3][4],$A[3][4],$C[3]
 179         eor     $A[4][4],$A[4][4],$C[3]
 180
 181         ////////////////////////////////////////// Rho+Pi
 182         mov     $C[3],$A[0][1]
 183         ror     $A[0][1],$A[1][1],#64-$rhotates[1][1]
 184         //mov   $C[1],$A[0][2]
 185         ror     $A[0][2],$A[2][2],#64-$rhotates[2][2]
 186         //mov   $C[0],$A[0][3]
 187         ror     $A[0][3],$A[3][3],#64-$rhotates[3][3]
 188         //mov   $C[2],$A[0][4]
 189         ror     $A[0][4],$A[4][4],#64-$rhotates[4][4]
 190
 191         ror     $A[1][1],$A[1][4],#64-$rhotates[1][4]
 192         ror     $A[2][2],$A[2][3],#64-$rhotates[2][3]
 193         ror     $A[3][3],$A[3][2],#64-$rhotates[3][2]
 194         ror     $A[4][4],$A[4][1],#64-$rhotates[4][1]
 195
 196         ror     $A[1][4],$A[4][2],#64-$rhotates[4][2]
 197         ror     $A[2][3],$A[3][4],#64-$rhotates[3][4]
 198         ror     $A[3][2],$A[2][1],#64-$rhotates[2][1]
 199         ror     $A[4][1],$A[1][3],#64-$rhotates[1][3]
 200
 201         ror     $A[4][2],$A[2][4],#64-$rhotates[2][4]
 202         ror     $A[3][4],$A[4][3],#64-$rhotates[4][3]
 203         ror     $A[2][1],$A[1][2],#64-$rhotates[1][2]
 204         ror     $A[1][3],$A[3][1],#64-$rhotates[3][1]
 205
 206         ror     $A[2][4],$A[4][0],#64-$rhotates[4][0]
 207         ror     $A[4][3],$A[3][0],#64-$rhotates[3][0]
 208         ror     $A[1][2],$A[2][0],#64-$rhotates[2][0]
 209         ror     $A[3][1],$A[1][0],#64-$rhotates[1][0]
 210
 211         ror     $A[1][0],$C[0],#64-$rhotates[0][3]
 212         ror     $A[2][0],$C[3],#64-$rhotates[0][1]
 213         ror     $A[3][0],$C[2],#64-$rhotates[0][4]
 214         ror     $A[4][0],$C[1],#64-$rhotates[0][2]
 215
 216         ////////////////////////////////////////// Chi+Iota
 217         bic     $C[0],$A[0][2],$A[0][1]
 218         bic     $C[1],$A[0][3],$A[0][2]
 219         bic     $C[2],$A[0][0],$A[0][4]
 220         bic     $C[3],$A[0][1],$A[0][0]
 221         eor     $A[0][0],$A[0][0],$C[0]
 222         bic     $C[0],$A[0][4],$A[0][3]
 223         eor     $A[0][1],$A[0][1],$C[1]
 224          ldr    $C[1],[sp,#16]
 225         eor     $A[0][3],$A[0][3],$C[2]
 226         eor     $A[0][4],$A[0][4],$C[3]
 227         eor     $A[0][2],$A[0][2],$C[0]
 228          ldr    $C[3],[$C[1]],#8                // Iota[i++]
 229
 230         bic     $C[0],$A[1][2],$A[1][1]
 231          tst    $C[1],#255                      // are we done?
 232          str    $C[1],[sp,#16]
 233         bic     $C[1],$A[1][3],$A[1][2]
 234         bic     $C[2],$A[1][0],$A[1][4]
 235          eor    $A[0][0],$A[0][0],$C[3]         // A[0][0] ^= Iota
 236         bic     $C[3],$A[1][1],$A[1][0]
 237         eor     $A[1][0],$A[1][0],$C[0]
 238         bic     $C[0],$A[1][4],$A[1][3]
 239         eor     $A[1][1],$A[1][1],$C[1]
 240         eor     $A[1][3],$A[1][3],$C[2]
 241         eor     $A[1][4],$A[1][4],$C[3]
 242         eor     $A[1][2],$A[1][2],$C[0]
 243
 244         bic     $C[0],$A[2][2],$A[2][1]
 245         bic     $C[1],$A[2][3],$A[2][2]
 246         bic     $C[2],$A[2][0],$A[2][4]
 247         bic     $C[3],$A[2][1],$A[2][0]
 248         eor     $A[2][0],$A[2][0],$C[0]
 249         bic     $C[0],$A[2][4],$A[2][3]
 250         eor     $A[2][1],$A[2][1],$C[1]
 251         eor     $A[2][3],$A[2][3],$C[2]
 252         eor     $A[2][4],$A[2][4],$C[3]
 253         eor     $A[2][2],$A[2][2],$C[0]
 254
 255         bic     $C[0],$A[3][2],$A[3][1]
 256         bic     $C[1],$A[3][3],$A[3][2]
 257         bic     $C[2],$A[3][0],$A[3][4]
 258         bic     $C[3],$A[3][1],$A[3][0]
 259         eor     $A[3][0],$A[3][0],$C[0]
 260         bic     $C[0],$A[3][4],$A[3][3]
 261         eor     $A[3][1],$A[3][1],$C[1]
 262         eor     $A[3][3],$A[3][3],$C[2]
 263         eor     $A[3][4],$A[3][4],$C[3]
 264         eor     $A[3][2],$A[3][2],$C[0]
 265
 266         bic     $C[0],$A[4][2],$A[4][1]
 267         bic     $C[1],$A[4][3],$A[4][2]
 268         bic     $C[2],$A[4][0],$A[4][4]
 269         bic     $C[3],$A[4][1],$A[4][0]
 270         eor     $A[4][0],$A[4][0],$C[0]
 271         bic     $C[0],$A[4][4],$A[4][3]
 272         eor     $A[4][1],$A[4][1],$C[1]
 273         eor     $A[4][3],$A[4][3],$C[2]
 274         eor     $A[4][4],$A[4][4],$C[3]
 275         eor     $A[4][2],$A[4][2],$C[0]
 276
 277         bne     .Loop
 278
 279         ldr     x30,[sp,#24]
 280         ret
 281 .size   KeccakF1600_int,.-KeccakF1600_int
 282
 283 .type   KeccakF1600,%function
 284 .align  5
 285 KeccakF1600:
 286         stp     x29,x30,[sp,#-128]!
 287         add     x29,sp,#0
 288         stp     x19,x20,[sp,#16]
 289         stp     x21,x22,[sp,#32]
 290         stp     x23,x24,[sp,#48]
 291         stp     x25,x26,[sp,#64]
 292         stp     x27,x28,[sp,#80]
 293         sub     sp,sp,#48
 294
 295         str     x0,[sp,#32]                     // offload argument
 296         mov     $C[0],x0
 297         ldp     $A[0][0],$A[0][1],[x0,#16*0]
 298         ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
 299         ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
 300         ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
 301         ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
 302         ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
 303         ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
 304         ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
 305         ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
 306         ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
 307         ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
 308         ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
 309         ldr     $A[4][4],[$C[0],#16*12]
 310
 311         bl      KeccakF1600_int
 312
 313         ldr     $C[0],[sp,#32]
 314         stp     $A[0][0],$A[0][1],[$C[0],#16*0]
 315         stp     $A[0][2],$A[0][3],[$C[0],#16*1]
 316         stp     $A[0][4],$A[1][0],[$C[0],#16*2]
 317         stp     $A[1][1],$A[1][2],[$C[0],#16*3]
 318         stp     $A[1][3],$A[1][4],[$C[0],#16*4]
 319         stp     $A[2][0],$A[2][1],[$C[0],#16*5]
 320         stp     $A[2][2],$A[2][3],[$C[0],#16*6]
 321         stp     $A[2][4],$A[3][0],[$C[0],#16*7]
 322         stp     $A[3][1],$A[3][2],[$C[0],#16*8]
 323         stp     $A[3][3],$A[3][4],[$C[0],#16*9]
 324         stp     $A[4][0],$A[4][1],[$C[0],#16*10]
 325         stp     $A[4][2],$A[4][3],[$C[0],#16*11]
 326         str     $A[4][4],[$C[0],#16*12]
 327
 328         ldp     x19,x20,[x29,#16]
 329         add     sp,sp,#48
 330         ldp     x21,x22,[x29,#32]
 331         ldp     x23,x24,[x29,#48]
 332         ldp     x25,x26,[x29,#64]
 333         ldp     x27,x28,[x29,#80]
 334         ldp     x29,x30,[sp],#128
 335         ret
 336 .size   KeccakF1600,.-KeccakF1600
 337
 338 .globl  SHA3_absorb
 339 .type   SHA3_absorb,%function
 340 .align  5
 341 SHA3_absorb:
 342         stp     x29,x30,[sp,#-128]!
 343         add     x29,sp,#0
 344         stp     x19,x20,[sp,#16]
 345         stp     x21,x22,[sp,#32]
 346         stp     x23,x24,[sp,#48]
 347         stp     x25,x26,[sp,#64]
 348         stp     x27,x28,[sp,#80]
 349         sub     sp,sp,#64
 350
 351         stp     x0,x1,[sp,#32]                  // offload arguments
 352         stp     x2,x3,[sp,#48]
 353
 354         mov     $C[0],x0                        // uint64_t A[5][5]
 355         mov     $C[1],x1                        // const void *inp
 356         mov     $C[2],x2                        // size_t len
 357         mov     $C[3],x3                        // size_t bsz
 358         ldp     $A[0][0],$A[0][1],[$C[0],#16*0]
 359         ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
 360         ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
 361         ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
 362         ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
 363         ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
 364         ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
 365         ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
 366         ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
 367         ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
 368         ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
 369         ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
 370         ldr     $A[4][4],[$C[0],#16*12]
 371         b       .Loop_absorb
 372
 373 .align  4
 374 .Loop_absorb:
 375         subs    $C[0],$C[2],$C[3]               // len - bsz
 376         blo     .Labsorbed
 377
 378         str     $C[0],[sp,#48]                  // save len - bsz
 379 ___
 # Unrolled absorb of input lanes 0..23, two lanes per iteration.  After the
 # even lane the emitted code compares bsz ($C[3]) with 8*(lane+2): "blo"
 # leaves when the block ended with the even lane, and "beq" (same flags)
 # leaves when it ended with the odd lane.
 380 foreach my $even (map { 2*$_ } 0..11) {
 381 my ($lo, $hi) = map($A[int($_/5)][$_%5], $even, $even+1);
 382 $code .= <<___;
 383         ldr     $C[0],[$C[1]],#8                // *inp++
 384 #ifdef  __AARCH64EB__
 385         rev     $C[0],$C[0]
 386 #endif
 387         eor     $lo,$lo,$C[0]
 388         cmp     $C[3],#8*($even+2)
 389         blo     .Lprocess_block
 390         ldr     $C[0],[$C[1]],#8                // *inp++
 391 #ifdef  __AARCH64EB__
 392         rev     $C[0],$C[0]
 393 #endif
 394         eor     $hi,$hi,$C[0]
 395         beq     .Lprocess_block
 396 ___
 397 }
 # Tail of SHA3_absorb: XOR the last input lane into A[4][4], then at
 # .Lprocess_block save inp, run KeccakF1600_int and reload inp plus the
 # saved (len - bsz, bsz) pair before looping.  Shorter blocks reach
 # .Lprocess_block directly from the unrolled code generated before this
 # template.  .Labsorbed is taken when len < bsz: it writes the 25 lanes back
 # through the pointer saved at [sp,#32] and returns the remaining len in x0.
 398 $code.=<<___;
 399         ldr     $C[0],[$C[1]],#8                // *inp++
 400 #ifdef  __AARCH64EB__
 401         rev     $C[0],$C[0]
 402 #endif
 403         eor     $A[4][4],$A[4][4],$C[0]
 404
 405 .Lprocess_block:
 406         str     $C[1],[sp,#40]                  // save inp
 407
 408         bl      KeccakF1600_int
 409
 410         ldr     $C[1],[sp,#40]                  // restore arguments
 411         ldp     $C[2],$C[3],[sp,#48]
 412         b       .Loop_absorb
 413
 414 .align  4
 415 .Labsorbed:
 416         ldr     $C[1],[sp,#32]
 417         stp     $A[0][0],$A[0][1],[$C[1],#16*0]
 418         stp     $A[0][2],$A[0][3],[$C[1],#16*1]
 419         stp     $A[0][4],$A[1][0],[$C[1],#16*2]
 420         stp     $A[1][1],$A[1][2],[$C[1],#16*3]
 421         stp     $A[1][3],$A[1][4],[$C[1],#16*4]
 422         stp     $A[2][0],$A[2][1],[$C[1],#16*5]
 423         stp     $A[2][2],$A[2][3],[$C[1],#16*6]
 424         stp     $A[2][4],$A[3][0],[$C[1],#16*7]
 425         stp     $A[3][1],$A[3][2],[$C[1],#16*8]
 426         stp     $A[3][3],$A[3][4],[$C[1],#16*9]
 427         stp     $A[4][0],$A[4][1],[$C[1],#16*10]
 428         stp     $A[4][2],$A[4][3],[$C[1],#16*11]
 429         str     $A[4][4],[$C[1],#16*12]
 430
 431         mov     x0,$C[2]                        // return value
 432         ldp     x19,x20,[x29,#16]
 433         add     sp,sp,#64
 434         ldp     x21,x22,[x29,#32]
 435         ldp     x23,x24,[x29,#48]
 436         ldp     x25,x26,[x29,#64]
 437         ldp     x27,x28,[x29,#80]
 438         ldp     x29,x30,[sp],#128
 439         ret
 440 .size   SHA3_absorb,.-SHA3_absorb
 441 ___
 # SHA3_squeeze: copies len bytes of output from the state to out, calling
 # KeccakF1600 each time bsz bytes have been consumed.  Arguments arrive in
 # x0..x3 and are named after the register aliases declared below
 # (state pointer, out, len, bsz).
 #
 # Whole 8-byte lanes are stored with str (byte-swapped first on big-endian
 # builds); a final partial lane is written by .Lsqueeze_tail one byte at a
 # time, least-significant byte first.
 #
 # NOTE(review): with len == 0 on entry the code still branches to
 # .Lsqueeze_tail and stores bytes -- presumably callers never pass 0;
 # confirm against the C callers.
 442 {
 # x19..x22 keep the arguments across `bl KeccakF1600`, which saves and
 # restores x19-x28 but overwrites x0 and other low registers.
 443 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
 444 $code.=<<___;
 445 .globl  SHA3_squeeze
 446 .type   SHA3_squeeze,%function
 447 .align  5
 448 SHA3_squeeze:
 449         stp     x29,x30,[sp,#-48]!
 450         add     x29,sp,#0
 451         stp     x19,x20,[sp,#16]
 452         stp     x21,x22,[sp,#32]
 453
 454         mov     $A_flat,x0                      // put aside arguments
 455         mov     $out,x1
 456         mov     $len,x2
 457         mov     $bsz,x3
 458
 459 .Loop_squeeze:
 460         ldr     x4,[x0],#8
 461         cmp     $len,#8
 462         blo     .Lsqueeze_tail
 463 #ifdef  __AARCH64EB__
 464         rev     x4,x4
 465 #endif
 466         str     x4,[$out],#8
 467         subs    $len,$len,#8
 468         beq     .Lsqueeze_done
 469
 470         subs    x3,x3,#8
 471         bhi     .Loop_squeeze
 472
 473         mov     x0,$A_flat
 474         bl      KeccakF1600
 475         mov     x0,$A_flat
 476         mov     x3,$bsz
 477         b       .Loop_squeeze
 478
 479 .align  4
 480 .Lsqueeze_tail:
 481         strb    w4,[$out],#1
 482         lsr     x4,x4,#8
 483         subs    $len,$len,#1
 484         beq     .Lsqueeze_done
 485         strb    w4,[$out],#1
 486         lsr     x4,x4,#8
 487         subs    $len,$len,#1
 488         beq     .Lsqueeze_done
 489         strb    w4,[$out],#1
 490         lsr     x4,x4,#8
 491         subs    $len,$len,#1
 492         beq     .Lsqueeze_done
 493         strb    w4,[$out],#1
 494         lsr     x4,x4,#8
 495         subs    $len,$len,#1
 496         beq     .Lsqueeze_done
 497         strb    w4,[$out],#1
 498         lsr     x4,x4,#8
 499         subs    $len,$len,#1
 500         beq     .Lsqueeze_done
 501         strb    w4,[$out],#1
 502         lsr     x4,x4,#8
 503         subs    $len,$len,#1
 504         beq     .Lsqueeze_done
 505         strb    w4,[$out],#1
 506
 507 .Lsqueeze_done:
 508         ldp     x19,x20,[sp,#16]
 509         ldp     x21,x22,[sp,#32]
 510         ldp     x29,x30,[sp],#48
 511         ret
 512 .size   SHA3_squeeze,.-SHA3_squeeze
 513 .asciz  "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 514 ___
 515 }
 516
 # Send the accumulated assembly text down the pipe to arm-xlate.pl.
 # STDOUT is that pipe, so close() flushes buffered output and waits for the
 # child; it returns false on a write error or a non-zero child exit.
 # Checking it keeps a failed translation from passing as success.
 517 print $code;
 518 close STDOUT or die "error closing STDOUT: $! (child status $?)";