3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
31 # Byte order [in]dependence. =========================================
33 # Originally caller was expected to maintain specific *dword* order in
34 # h[0-7], namely with most significant dword at *lower* address, which
35 # was reflected in the two parameters below as 0 and 4. Now caller is
36 # expected to maintain native byte order for whole 64-bit values.
39 # ====================================================================
41 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}	# scan argv for something that looks like an output file name
42 open STDOUT,">",$output or die "can't open $output: $!";	# 3-arg open, checked: all generated code goes to this file
44 $ctx="r0"; # parameter block
58 ############ r13 is stack pointer
60 ############ r15 is program counter
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
79 str $Tlo,[sp,#$Xoff+0]
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18	@ lo: hi<<18 (ROTR 14 spill)
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18	@ hi: lo<<18 (ROTR 14 spill)
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18	@ lo: lo>>18 (ROTR 18)
87 eor $t1,$t1,$Ehi,lsr#18	@ hi: hi>>18 (ROTR 18)
88 eor $t0,$t0,$Ehi,lsl#14	@ lo: hi<<14 (ROTR 18 spill)
89 eor $t1,$t1,$Elo,lsl#14	@ hi: lo<<14 (ROTR 18 spill)
90 eor $t0,$t0,$Ehi,lsr#9	@ lo: hi>>9  (ROTR 41)
91 eor $t1,$t1,$Elo,lsr#9	@ hi: lo>>9  (ROTR 41)
92 eor $t0,$t0,$Elo,lsl#23	@ lo: lo<<23 (ROTR 41 spill)
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
104 str $Elo,[sp,#$Eoff+0]
106 str $Ehi,[sp,#$Eoff+4]
108 str $Alo,[sp,#$Aoff+0]
110 str $Ahi,[sp,#$Aoff+4]
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
122 adc $Thi,$Thi,$t3 @ T += K[i]
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
128 ldr $t3,[sp,#$Coff+0] @ c.lo
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
135 eor $t0,$t0,$Ahi,lsl#4	@ lo: hi<<4  (ROTR 28 spill)
136 eor $t1,$t1,$Alo,lsl#4	@ hi: lo<<4  (ROTR 28 spill)
137 eor $t0,$t0,$Ahi,lsr#2	@ lo: hi>>2  (ROTR 34)
138 eor $t1,$t1,$Alo,lsr#2	@ hi: lo>>2  (ROTR 34)
139 eor $t0,$t0,$Alo,lsl#30	@ lo: lo<<30 (ROTR 34 spill)
140 eor $t1,$t1,$Ahi,lsl#30	@ hi: hi<<30 (ROTR 34 spill)
141 eor $t0,$t0,$Ahi,lsr#7	@ lo: hi>>7  (ROTR 39)
142 eor $t1,$t1,$Alo,lsr#7	@ hi: lo>>7  (ROTR 39)
143 eor $t0,$t0,$Alo,lsl#25	@ lo: lo<<25 (ROTR 39 spill)
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
149 ldr $t1,[sp,#$Boff+4] @ b.hi
151 ldr $t2,[sp,#$Coff+4] @ c.hi
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
160 adc $Ahi,$Ahi,$Thi @ h += T
166 #include "arm_arch.h"
170 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
174 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
182 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)	@ K512[0..79]: SHA-512 round constants (FIPS 180-4), two 64-bit values per WORD64
183 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)	@ last pair: K512[78..79]
224 .word OPENSSL_armcap_P-sha512_block_data_order
227 .global sha512_block_data_order
228 .type sha512_block_data_order,%function
229 sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
280 orr $Tlo,$Tlo,$t0,lsl#8
282 orr $Tlo,$Tlo,$t1,lsl#16
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31	@ lo: hi<<31 (ROTR 1 spill)
313 eor $Thi,$Thi,$t0,lsl#31	@ hi: lo<<31 (ROTR 1 spill)
314 eor $Tlo,$Tlo,$t0,lsr#8	@ lo: lo>>8  (ROTR 8)
315 eor $Thi,$Thi,$t1,lsr#8	@ hi: hi>>8  (ROTR 8)
316 eor $Tlo,$Tlo,$t1,lsl#24	@ lo: hi<<24 (ROTR 8 spill)
317 eor $Thi,$Thi,$t0,lsl#24	@ hi: lo<<24 (ROTR 8 spill)
318 eor $Tlo,$Tlo,$t0,lsr#7	@ lo: lo>>7  (SHR 7)
319 eor $Thi,$Thi,$t1,lsr#7	@ hi: hi>>7  (SHR 7)
320 eor $Tlo,$Tlo,$t1,lsl#25	@ lo: hi<<25 (SHR 7 spill)
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
327 eor $t0,$t0,$t3,lsl#13	@ lo: hi<<13 (ROTR 19 spill)
328 eor $t1,$t1,$t2,lsl#13	@ hi: lo<<13 (ROTR 19 spill)
329 eor $t0,$t0,$t3,lsr#29	@ lo: hi>>29 (ROTR 61)
330 eor $t1,$t1,$t2,lsr#29	@ hi: lo>>29 (ROTR 61)
331 eor $t0,$t0,$t2,lsl#3	@ lo: lo<<3  (ROTR 61 spill)
332 eor $t1,$t1,$t3,lsl#3	@ hi: hi<<3  (ROTR 61 spill)
333 eor $t0,$t0,$t2,lsr#6	@ lo: lo>>6  (SHR 6)
334 eor $t1,$t1,$t3,lsr#6	@ hi: hi>>6  (SHR 6)
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] @ lo word of X[] stack slot 16-9 (interleaved load)
336 eor $t0,$t0,$t3,lsl#26	@ lo: hi<<26 (SHR 6 spill)
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
363 str $t0, [$ctx,#$Aoff+$lo]
365 str $t1, [$ctx,#$Aoff+$hi]
367 str $t2, [$ctx,#$Boff+$lo]
369 str $t3, [$ctx,#$Boff+$hi]
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
380 str $t0, [$ctx,#$Coff+$lo]
382 str $t1, [$ctx,#$Coff+$hi]
384 str $t2, [$ctx,#$Doff+$lo]
386 str $t3, [$ctx,#$Doff+$hi]
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
395 str $Elo,[$ctx,#$Eoff+$lo]
397 str $Ehi,[$ctx,#$Eoff+$hi]
399 str $t2, [$ctx,#$Foff+$lo]
401 str $t3, [$ctx,#$Foff+$hi]
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
412 str $t0, [$ctx,#$Goff+$lo]
414 str $t1, [$ctx,#$Goff+$hi]
416 str $t2, [$ctx,#$Hoff+$lo]
418 str $t3, [$ctx,#$Hoff+$hi]
426 add sp,sp,#8*9 @ destroy frame
428 ldmia sp!,{r4-r12,pc}
430 ldmia sp!,{r4-r12,lr}
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
438 my @Sigma0=(28,34,39);
439 my @Sigma1=(14,18,41);
440 my @sigma0=(1, 8, 7);
441 my @sigma1=(19,61,6);
444 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
446 my @X=map("d$_",(0..15));
447 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
451 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
454 $code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
467 vsli.64 $t2,$e,#`64-@Sigma1[2]`
468 #if $i<16 && defined(__ARMEL__)
469 vrev64.8 @X[$i],@X[$i]
471 vbsl $Ch,$f,$g @ Ch(e,f,g)
473 vshr.u64 $t0,$a,#@Sigma0[0]
474 veor $t2,$t1 @ Sigma1(e)
475 vshr.u64 $t1,$a,#@Sigma0[1]
477 vshr.u64 $t2,$a,#@Sigma0[2]
479 vsli.64 $t0,$a,#`64-@Sigma0[0]`
480 vadd.i64 $T1,@X[$i%16]
481 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
486 veor $h,$t2 @ Sigma0(a)
487 vbsl $Maj,$c,$b @ Maj(a,b,c)
497 if ($i&1) { &NEON_00_15($i,@_); return; }
499 # 2x-vectorized, therefore runs every 2nd round
500 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
501 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
502 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
503 my $e=@_[4]; # $e from NEON_00_15
506 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
507 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
508 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
509 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
510 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
511 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
513 vshr.u64 $t0,$s0,#@sigma0[0]
514 veor $s1,$t1 @ sigma1(X[i+14])
515 vshr.u64 $t1,$s0,#@sigma0[1]
516 vadd.i64 @X[$i%8],$s1
517 vshr.u64 $s1,$s0,#@sigma0[2]
518 vsli.64 $t0,$s0,#`64-@sigma0[0]`
519 vsli.64 $t1,$s0,#`64-@sigma0[1]`
520 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
522 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
523 vadd.i64 @X[$i%8],$s0
524 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
525 veor $s1,$t1 @ sigma0(X[i+1])
526 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
527 vadd.i64 @X[$i%8],$s1
529 &NEON_00_15(2*$i,@_);
538 dmb @ errata #451034 on early Cortex A8
539 vstmdb sp!,{d8-d15} @ ABI specification says so
540 sub $Ktbl,r3,#672 @ K512
541 vldmia $ctx,{$A-$H} @ load context
544 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
550 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554 vldmia $ctx,{d24-d31} @ load context to temp
555 vadd.i64 q8,q12 @ vectorized accumulate
559 vstmia $ctx,{$A-$H} @ save context
561 sub $Ktbl,#640 @ rewind K512
564 vldmia sp!,{d8-d15} @ epilogue
570 .size sha512_block_data_order,.-sha512_block_data_order
571 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
573 .comm OPENSSL_armcap_P,4,4
576 $code =~ s/\`([^\`]*)\`/eval $1/gem;
577 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
579 close STDOUT or die "error closing STDOUT: $!"; # enforce flush; buffered write errors surface at close