crypto/bn/asm/armv4-gf2m.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # May 2011
  18 #
  19 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  20 # used in bn_gf2m.c. It's kind of low-hanging mechanical port from
  21 # C for the time being... Except that it has two code paths: pure
  22 # integer code suitable for any ARMv4 and later CPU and NEON code
  23 # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
  24 # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
  25 # faster than compiler-generated code. For ECDH and ECDSA verify (but
  26 # not for ECDSA sign) it means 25%-45% improvement depending on key
  27 # length, more for longer keys. Even though NEON 1x1 multiplication
  28 # runs in even fewer cycles, ~30, improvement is measurable only on
  29 # longer keys. One has to optimize code elsewhere to get NEON glow...
  30 #
  31 # April 2014
  32 #
  33 # Double bn_GF2m_mul_2x2 performance by using algorithm from paper
  34 # referred below, which improves ECDH and ECDSA verify benchmarks
  35 # by 18-40%.
  36 #
  37 # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
  38 # Polynomial Multiplication on ARM Processors using the NEON Engine.
  39 #
  40 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
  41
  42 # $output is the last argument if it looks like a file (it has an extension)
  43 # $flavour is the first argument if it doesn't look like a file
  44 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  45 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  46
     # Any flavour other than "void" sends everything printed to STDOUT through
     # arm-xlate.pl, which is looked up next to this script and then under
     # ../../perlasm/. Otherwise the text goes straight to $output when one was
     # given, or stays on the inherited STDOUT.
  47 if ($flavour && $flavour ne "void") {
  48     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  49     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  50     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  51     die "can't locate arm-xlate.pl";
  52
         # Report the OS error ($!); $1 here would only repeat the directory
         # captured by the regex above.
  53     open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
  54         or die "can't call $xlate: $!";
  55 } else {
         # Fail loudly if the output file cannot be created instead of silently
         # writing to the original STDOUT.
  56     $output and (open STDOUT,">$output" or die "can't open $output: $!");
  57 }
  58
  59 $code=<<___;	# file preamble: arm_arch.h, Thumb-2 or ARM encoding selection, then .text
  60 #include "arm_arch.h"
  61
  62 #if defined(__thumb2__)
  63 .syntax unified
  64 .thumb
  65 #else
  66 .code   32
  67 #endif
  68
  69 .text
  70 ___
  71 ################
  72 # private interface to mul_1x1_ialu
  73 #
     # mul_1x1_ialu multiplies the 32-bit polynomials in $a (r1) and $b (r0)
     # and leaves the 64-bit carry-less product in $hi:$lo (r4:r5).
     #
     # It does not follow the normal calling convention; the caller sets up:
     #   sp    - base of a 32-byte scratch area that receives tab[0..7]
     #   $mask - r12 preloaded with 7<<2, the byte-offset mask used to index tab[]
     # $a and $b are only read, r4-r9 are overwritten, and return is via lr.
     #
     # Method: tab[i] holds the carry-less multiple i*a1, where
     # a1 = a & 0x3fffffff. $b is consumed three bits at a time; each window
     # selects one tab[] entry, which is shifted into position and xor-ed into
     # $hi:$lo. The two top bits of $a that were masked off are folded in by
     # the conditional "eorne" pairs, which add $b<<30 and $b<<31.
  74 $a="r1";
  75 $b="r0";
  76
     # The table-building temporaries ($a0..$a14) and the accumulation registers
     # ($hi,$lo,$t0,$t1,$i0,$i1) are two sets of names for the same six
     # registers r4-r9. map() also yields a seventh name, r12, which neither
     # list takes; r12 is named separately as $mask below.
  77 ($a0,$a1,$a2,$a12,$a4,$a14)=
  78 ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
  79
  80 $mask="r12";
  81
  82 $code.=<<___;
  83 .type   mul_1x1_ialu,%function
  84 .align  5
  85 mul_1x1_ialu:
  86         mov     $a0,#0
  87         bic     $a1,$a,#3<<30           @ a1=a&0x3fffffff
  88         str     $a0,[sp,#0]             @ tab[0]=0
  89         add     $a2,$a1,$a1             @ a2=a1<<1
  90         str     $a1,[sp,#4]             @ tab[1]=a1
  91         eor     $a12,$a1,$a2            @ a1^a2
  92         str     $a2,[sp,#8]             @ tab[2]=a2
  93         mov     $a4,$a1,lsl#2           @ a4=a1<<2
  94         str     $a12,[sp,#12]           @ tab[3]=a1^a2
  95         eor     $a14,$a1,$a4            @ a1^a4
  96         str     $a4,[sp,#16]            @ tab[4]=a4
  97         eor     $a0,$a2,$a4             @ a2^a4
  98         str     $a14,[sp,#20]           @ tab[5]=a1^a4
  99         eor     $a12,$a12,$a4           @ a1^a2^a4
 100         str     $a0,[sp,#24]            @ tab[6]=a2^a4
 101         and     $i0,$mask,$b,lsl#2
 102         str     $a12,[sp,#28]           @ tab[7]=a1^a2^a4
 103
 104         and     $i1,$mask,$b,lsr#1
 105         ldr     $lo,[sp,$i0]            @ tab[b       & 0x7]
 106         and     $i0,$mask,$b,lsr#4
 107         ldr     $t1,[sp,$i1]            @ tab[b >>  3 & 0x7]
 108         and     $i1,$mask,$b,lsr#7
 109         ldr     $t0,[sp,$i0]            @ tab[b >>  6 & 0x7]
 110         eor     $lo,$lo,$t1,lsl#3       @ stall
 111         mov     $hi,$t1,lsr#29
 112         ldr     $t1,[sp,$i1]            @ tab[b >>  9 & 0x7]
 113
 114         and     $i0,$mask,$b,lsr#10
 115         eor     $lo,$lo,$t0,lsl#6
 116         eor     $hi,$hi,$t0,lsr#26
 117         ldr     $t0,[sp,$i0]            @ tab[b >> 12 & 0x7]
 118
 119         and     $i1,$mask,$b,lsr#13
 120         eor     $lo,$lo,$t1,lsl#9
 121         eor     $hi,$hi,$t1,lsr#23
 122         ldr     $t1,[sp,$i1]            @ tab[b >> 15 & 0x7]
 123
 124         and     $i0,$mask,$b,lsr#16
 125         eor     $lo,$lo,$t0,lsl#12
 126         eor     $hi,$hi,$t0,lsr#20
 127         ldr     $t0,[sp,$i0]            @ tab[b >> 18 & 0x7]
 128
 129         and     $i1,$mask,$b,lsr#19
 130         eor     $lo,$lo,$t1,lsl#15
 131         eor     $hi,$hi,$t1,lsr#17
 132         ldr     $t1,[sp,$i1]            @ tab[b >> 21 & 0x7]
 133
 134         and     $i0,$mask,$b,lsr#22
 135         eor     $lo,$lo,$t0,lsl#18
 136         eor     $hi,$hi,$t0,lsr#14
 137         ldr     $t0,[sp,$i0]            @ tab[b >> 24 & 0x7]
 138
 139         and     $i1,$mask,$b,lsr#25
 140         eor     $lo,$lo,$t1,lsl#21
 141         eor     $hi,$hi,$t1,lsr#11
 142         ldr     $t1,[sp,$i1]            @ tab[b >> 27 & 0x7]
 143
 144         tst     $a,#1<<30
 145         and     $i0,$mask,$b,lsr#28
 146         eor     $lo,$lo,$t0,lsl#24
 147         eor     $hi,$hi,$t0,lsr#8
 148         ldr     $t0,[sp,$i0]            @ tab[b >> 30      ]
 149
 150 #ifdef  __thumb2__
 151         itt     ne
 152 #endif
 153         eorne   $lo,$lo,$b,lsl#30
 154         eorne   $hi,$hi,$b,lsr#2
 155         tst     $a,#1<<31
 156         eor     $lo,$lo,$t1,lsl#27
 157         eor     $hi,$hi,$t1,lsr#5
 158 #ifdef  __thumb2__
 159         itt     ne
 160 #endif
 161         eorne   $lo,$lo,$b,lsl#31
 162         eorne   $hi,$hi,$b,lsr#1
 163         eor     $lo,$lo,$t0,lsl#30
 164         eor     $hi,$hi,$t0,lsr#2
 165
 166         mov     pc,lr
 167 .size   mul_1x1_ialu,.-mul_1x1_ialu
 168 ___
 169 ################
 170 # void  bn_GF2m_mul_2x2(BN_ULONG *r,
 171 #       BN_ULONG a1,BN_ULONG a0,
 172 #       BN_ULONG b1,BN_ULONG b0);       # r[3..0]=a1a0·b1b0
     #
     # Arguments arrive as r0=r, r1=a1, r2=a0, r3=b1, with b0 read from the
     # stack. When built with __ARM_MAX_ARCH__>=7 the entry code tests
     # OPENSSL_armcap_P for ARMV7_NEON and branches to .LNEON if the bit is
     # set; otherwise the integer path below runs.
     #
     # Integer path: three calls to mul_1x1_ialu (a1·b1, a0·b0 and
     # (a1+a0)·(b1+b0)) are combined Karatsuba-style. sp is moved down to a
     # 32-byte aligned block that holds tab[8] for mul_1x1_ialu and, at
     # offset 32, the previous sp, which is reloaded before the epilogue.
 173 {
 174 $code.=<<___;
 175 .global bn_GF2m_mul_2x2
 176 .type   bn_GF2m_mul_2x2,%function
 177 .align  5
 178 bn_GF2m_mul_2x2:
 179 #if __ARM_MAX_ARCH__>=7
 180         stmdb   sp!,{r10,lr}
 181         ldr     r12,.LOPENSSL_armcap
 182 # if !defined(_WIN32)
 183         adr     r10,.LOPENSSL_armcap
 184         ldr     r12,[r12,r10]
 185 # endif
 186 # if defined(__APPLE__) || defined(_WIN32)
 187         ldr     r12,[r12]
 188 # endif
 189         tst     r12,#ARMV7_NEON
 190         itt     ne
 191         ldrne   r10,[sp],#8
 192         bne     .LNEON
 193         stmdb   sp!,{r4-r9}
 194 #else
 195         stmdb   sp!,{r4-r10,lr}
 196 #endif
 197 ___
     # r0 is about to be reused as $b (the second multiplicand of
     # mul_1x1_ialu), so the result pointer is moved to r10.
 198 $ret="r10";     # reassigned 1st argument
 199 $code.=<<___;
 200         mov     $ret,r0                 @ reassign 1st argument
 201         mov     $b,r3                   @ $b=b1
 202         sub     r7,sp,#36
 203         mov     r8,sp
 204         and     r7,r7,#-32
 205         ldr     r3,[sp,#32]             @ load b0
 206         mov     $mask,#7<<2
 207         mov     sp,r7                   @ allocate tab[8]
 208         str     r8,[r7,#32]
 209
 210         bl      mul_1x1_ialu            @ a1·b1
 211         str     $lo,[$ret,#8]
 212         str     $hi,[$ret,#12]
 213
 214         eor     $b,$b,r3                @ flip b0 and b1
 215          eor    $a,$a,r2                @ flip a0 and a1
 216         eor     r3,r3,$b
 217          eor    r2,r2,$a
 218         eor     $b,$b,r3
 219          eor    $a,$a,r2
 220         bl      mul_1x1_ialu            @ a0·b0
 221         str     $lo,[$ret]
 222         str     $hi,[$ret,#4]
 223
 224         eor     $a,$a,r2
 225         eor     $b,$b,r3
 226         bl      mul_1x1_ialu            @ (a1+a0)·(b1+b0)
 227 ___
     # r6-r9 receive r[0..3] (the a0·b0 and a1·b1 products stored above) so
     # that the middle product left in $hi:$lo can be folded into r[1] and r[2].
 228 @r=map("r$_",(6..9));
 229 $code.=<<___;
 230         ldmia   $ret,{@r[0]-@r[3]}
 231         eor     $lo,$lo,$hi
 232         ldr     sp,[sp,#32]             @ destroy tab[8]
 233         eor     $hi,$hi,@r[1]
 234         eor     $lo,$lo,@r[0]
 235         eor     $hi,$hi,@r[2]
 236         eor     $lo,$lo,@r[3]
 237         eor     $hi,$hi,@r[3]
 238         str     $hi,[$ret,#8]
 239         eor     $lo,$lo,$hi
 240         str     $lo,[$ret,#4]
 241
 242 #if __ARM_ARCH__>=5
 243         ldmia   sp!,{r4-r10,pc}
 244 #else
 245         ldmia   sp!,{r4-r10,lr}
 246         tst     lr,#1
 247         moveq   pc,lr                   @ be binary compatible with V4, yet
 248         bx      lr                      @ interoperable with Thumb ISA:-)
 249 #endif
 250 ___
 251 }
 252 {
     # NEON path of bn_GF2m_mul_2x2, entered at .LNEON with r0=r, r1=a1, r2=a0,
     # r3=b1 and b0 at [sp]. It packs a1:a0 into $a and b1:b0 into $b, builds
     # the polynomial product from vmull.p8 partial products (the algorithm
     # from the paper cited in the header of this file) and stores the 128-bit
     # result at [r0].
     #
     # Register names: $r=q0 and $t0..$t3=q1,q2,q3,q8 (the remaining q names
     # produced by map() are not assigned); $a=d26, $b=d27 and
     # $k48/$k32/$k16=d28/d29/d30 (d31 is not assigned).
     # "qN#lo" and "qN#hi" are shorthand for the two d halves of qN; the
     # post-processing loop at the end of the file rewrites them to d(2N) and
     # d(2N+1), and rewrites "ret" to "bx lr".
 253 my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
 254 my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
 255
 256 $code.=<<___;
 257 #if __ARM_MAX_ARCH__>=7
 258 .arch   armv7-a
 259 .fpu    neon
 260
 261 .align  5
 262 .LNEON:
 263         ldr             r12, [sp]               @ 5th argument
 264         vmov            $a, r2, r1
 265         vmov            $b, r12, r3
 266         vmov.i64        $k48, #0x0000ffffffffffff
 267         vmov.i64        $k32, #0x00000000ffffffff
 268         vmov.i64        $k16, #0x000000000000ffff
 269
 270         vext.8          $t0#lo, $a, $a, #1      @ A1
 271         vmull.p8        $t0, $t0#lo, $b         @ F = A1*B
 272         vext.8          $r#lo, $b, $b, #1       @ B1
 273         vmull.p8        $r, $a, $r#lo           @ E = A*B1
 274         vext.8          $t1#lo, $a, $a, #2      @ A2
 275         vmull.p8        $t1, $t1#lo, $b         @ H = A2*B
 276         vext.8          $t3#lo, $b, $b, #2      @ B2
 277         vmull.p8        $t3, $a, $t3#lo         @ G = A*B2
 278         vext.8          $t2#lo, $a, $a, #3      @ A3
 279         veor            $t0, $t0, $r            @ L = E + F
 280         vmull.p8        $t2, $t2#lo, $b         @ J = A3*B
 281         vext.8          $r#lo, $b, $b, #3       @ B3
 282         veor            $t1, $t1, $t3           @ M = G + H
 283         vmull.p8        $r, $a, $r#lo           @ I = A*B3
 284         veor            $t0#lo, $t0#lo, $t0#hi  @ t0 = (L) (P0 + P1) << 8
 285         vand            $t0#hi, $t0#hi, $k48
 286         vext.8          $t3#lo, $b, $b, #4      @ B4
 287         veor            $t1#lo, $t1#lo, $t1#hi  @ t1 = (M) (P2 + P3) << 16
 288         vand            $t1#hi, $t1#hi, $k32
 289         vmull.p8        $t3, $a, $t3#lo         @ K = A*B4
 290         veor            $t2, $t2, $r            @ N = I + J
 291         veor            $t0#lo, $t0#lo, $t0#hi
 292         veor            $t1#lo, $t1#lo, $t1#hi
 293         veor            $t2#lo, $t2#lo, $t2#hi  @ t2 = (N) (P4 + P5) << 24
 294         vand            $t2#hi, $t2#hi, $k16
 295         vext.8          $t0, $t0, $t0, #15
 296         veor            $t3#lo, $t3#lo, $t3#hi  @ t3 = (K) (P6 + P7) << 32
 297         vmov.i64        $t3#hi, #0
 298         vext.8          $t1, $t1, $t1, #14
 299         veor            $t2#lo, $t2#lo, $t2#hi
 300         vmull.p8        $r, $a, $b              @ D = A*B
 301         vext.8          $t3, $t3, $t3, #12
 302         vext.8          $t2, $t2, $t2, #13
 303         veor            $t0, $t0, $t1
 304         veor            $t2, $t2, $t3
 305         veor            $r, $r, $t0
 306         veor            $r, $r, $t2
 307
 308         vst1.32         {$r}, [r0]
 309         ret             @ bx lr
 310 #endif
 311 ___
 312 }
 313 $code.=<<___;	# trailer: symbol size, armcap address literal (absolute on _WIN32, else relative to itself), ident string, .comm
 314 .size   bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 315 #if __ARM_MAX_ARCH__>=7
 316 .align  5
 317 .LOPENSSL_armcap:
 318 # ifdef _WIN32
 319 .word   OPENSSL_armcap_P
 320 # else
 321 .word   OPENSSL_armcap_P-.
 322 # endif
 323 #endif
 324 .asciz  "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 325 .align  5
 326
 327 #if __ARM_MAX_ARCH__>=7
 328 .comm   OPENSSL_armcap_P,4,4
 329 #endif
 330 ___
 331
 332 foreach (split("\n",$code)) {
             # Post-process the accumulated assembly one line at a time and
             # print it to STDOUT.
             #
             # Any `...` span is evaluated as Perl and replaced by its result.
 333         s/\`([^\`]*)\`/eval $1/geo;
 334
             # The three rewrites are chained with "or", so at most one applies
             # to a given line:
             #   qN#lo / qN#hi  ->  d(2N) / d(2N+1)
             #   ret            ->  bx lr
             #   bx lr          ->  the raw opcode word
 335         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
 336         s/\bret\b/bx    lr/go           or
 337         s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 338
 339         print $_,"\n";
 340 }
     # STDOUT may be a pipe into arm-xlate.pl or a freshly opened file. A failed
     # close means the output was not completely written or translated, so stop
     # with an error rather than leave a truncated file behind.
 341 close STDOUT or die "error closing STDOUT: $!";   # enforce flush