#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
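#
# To illustrate (a hedged sketch, not used by the generator; split64 is
# an illustrative helper assuming a perl built with 64-bit integers):
# HI and LO are the byte offsets of the high and low 32-bit halves of a
# 64-bit value stored in native byte order, so a little-endian target
# gets LO=0/HI=4 and a big-endian target gets HI=0/LO=4, as defined in
# the preamble emitted below.
#
#	sub split64 {
#	    my $x = shift;				# 64-bit value
#	    return (($x>>32)&0xffffffff, $x&0xffffffff);# (hi,lo) words
#	}
#	# little-endian memory holds the lo word at offset 0 and the hi
#	# word at offset 4; big-endian is the reverse.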

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
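# The values above are byte offsets of the 64-bit working variables
# a..h within the scratch frame carved out on the stack; the message
# schedule X[] follows at $Xoff.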

sub BODY_00_15() {
my $magic = shift;
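# $magic is the least significant byte of the last K[i] in the range a
# given loop covers; the and/teq pair below compares it against the
# current K[i]&0xff and, on a match, orreq sets bit 0 of $Ktbl as an
# end-of-range flag, so no separate round counter is needed.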
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
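# The Sigma/sigma comment lines above spell out how each 64-bit rotate
# is decomposed into shifts of the two 32-bit halves (for disjoint bit
# ranges OR and XOR coincide, so eor serves as orr). A minimal
# reference model of that decomposition, not used by the generator and
# assuming a perl built with 64-bit integers (0 < $n < 64):
#
#	sub rotr64 {
#	    my ($hi,$lo,$n) = @_;		# value is ($hi<<32)|$lo
#	    my $x = (($hi&0xffffffff)<<32)|($lo&0xffffffff);
#	    my $r = (($x>>$n)|($x<<(64-$n))) & 0xffffffffffffffff;
#	    return (($r>>32)&0xffffffff, $r&0xffffffff);  # (hi,lo)
#	}
#	# e.g. for n=14: result.lo == (lo>>14)|(hi<<18) and
#	#                result.hi == (hi>>14)|(lo<<18),
#	# matching the "LO/HI" comment lines emitted above.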
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
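@ WORD64 lays each 64-bit constant out as two 32-bit words in native
@ byte order (lo word first on little-endian, hi word first otherwise),
@ matching the HI/LO offsets used above to address 64-bit halves.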

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
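@ The word above is the link-time offset from sha512_block_data_order
@ to OPENSSL_armcap_P; adding it to the runtime entry address kept in
@ r3 below yields the capability word without a dynamic relocation.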

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
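# The magic bytes passed to BODY_00_15 are the least significant bytes
# of the last constants in each range: K[15]=0xc19bf174cf692694 gives
# 0x94 for .L00_15, and K[79]=0x6c44198c4a475817 gives 0x17 for
# .L16_79 below.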
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
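# The commented-out "vadd.i64 $h,$Maj" is intentional: the final h+=Maj
# of each round is deferred to the next round's "h+=Maj from the past"
# add (and performed once more after the last round), which keeps it
# off the critical dependency path.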

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
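# Even rounds update the message schedule on 128-bit q registers, so a
# single pass computes two X[] entries at once; the vshr.u64 d24..d26
# lines marked "from NEON_00_15" start the next round early to keep
# the pipeline fed.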

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	ret				@ bx lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

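# Post-processing of the generated text: backticked expressions are
# evaluated first; "bx lr" is then replaced with its literal
# instruction encoding so the integer-only path still assembles with
# -march=armv4, and only after that is the ARMv7-only "ret" spelled as
# "bx lr".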
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT; # enforce flush