crypto/modes/asm/ghash-armv4.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # April 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+32 bytes shared table]. There is no
  15 # experimental performance data available yet. The only approximation
  16 # that can be made at this point is based on code size. Inner loop is
  17 # 32 instructions long and on single-issue core should execute in <40
  18 # cycles. Having verified that gcc 3.4 didn't unroll corresponding
  19 # loop, this assembler loop body was found to be ~3x smaller than
  20 # compiler-generated one...
  21 #
  22 # Byte order [in]dependence. =========================================
  23 #
  24 # Caller is expected to maintain specific *dword* order in Htable,
  25 # namely with *least* significant dword of 128-bit value at *lower*
  26 # address. This differs completely from C code and has everything to
  27 # do with ldm instruction and order in which dwords are "consumed" by
   28 # algorithm. *Byte* order within these dwords in turn is whatever
   29 # the *native* byte order is on the current platform. See gcm128.c
   30 # for a working example...
  31
   32 $Xi="r0";       # argument block
      #
      # Every variable in this section holds the *name* of an ARM register;
      # the names are interpolated into the assembly text collected in $code.
      # $Xi points at the 16-byte hash value, which is read and written
      # byte by byte.
   33 $Htbl="r1";     # base of the per-key table, indexed in 16-byte entries
   34 $inp="r2";      # input pointer (gcm_ghash_4bit)
   35 $len="r3";      # input length; gcm_ghash_4bit turns it into an end pointer
   36 $Zll="r4";      # variables
      # $Zll,$Zlh,$Zhl,$Zhh: the four 32-bit words of the accumulator Z,
      # loaded with one ldmia and shifted right 4 bits per step.
   37 $Zlh="r5";
   38 $Zhl="r6";
   39 $Zhh="r7";
   40 $Tll="r8";      # $Tll..$Thh: Htbl entry being XORed into Z; $Tll also
      # receives the rem_4bit halfword, and $Tlh,$Thl,$Thh double as scratch
      # registers in Zsmash.
   41 $Tlh="r9";
   42 $Thl="r10";
   43 $Thh="r11";
   44 $nlo="r12";     # low nibble of current byte (& 0x0f); reused for rem index
   45 ################# r13 is stack pointer
   46 $nhi="r14";     # high nibble (& 0xf0, left unshifted so it is already a
      # 16-byte table offset); reused for rem index. r14 is the link
      # register, which both functions push before using it.
   47 ################# r15 is program counter
   48
      # Aliases: gcm_gmult_4bit keeps the address of rem_4bit in r2, and both
      # functions count bytes in r3 (gcm_ghash_4bit saves its end pointer on
      # the stack and reloads it after the inner loop).
   49 $rem_4bit=$inp; # used in gcm_gmult_4bit
   50 $cnt=$len;
  51
  52 $output=shift;
  53 open STDOUT,">$output";
  54
  55 sub Zsmash() {
  56   my $i=12;
  57   my @args=@_;
  58   for ($Zll,$Zlh,$Zhl,$Zhh) {
  59     # can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
  60     $code.=<<___;
  61         mov     $Tlh,$_,lsr#8
  62         strb    $_,[$Xi,#$i+3]
  63         mov     $Thl,$_,lsr#16
  64         strb    $Tlh,[$Xi,#$i+2]
  65         mov     $Thh,$_,lsr#24
  66         strb    $Thl,[$Xi,#$i+1]
  67         strb    $Thh,[$Xi,#$i]
  68 ___
  69     $code.="\t".shift(@args)."\n";
  70     $i-=4;
  71   }
  72 }
   73
      # The assembly text is accumulated in $code. This first part emits:
      #  - rem_4bit: 16 halfword constants (32 bytes), indexed by the 4 bits
      #    about to be shifted out of Z; the selected value is XORed into the
      #    upper 16 bits of $Zhh;
      #  - rem_4bit_get: helper that derives &rem_4bit from pc, leaves it in
      #    $rem_4bit and branches to .Lrem_4bit_got inside gcm_gmult_4bit;
      #  - gcm_ghash_4bit up to the point where Xi is written back. It finds
      #    rem_4bit 48 bytes before its own entry point (the 32-byte table
      #    plus the four 4-byte instructions of rem_4bit_get), copies the
      #    table to the stack, then consumes the input 16 bytes per .Louter
      #    pass and one byte (two nibbles) per .Loop iteration.
      # Stack layout inside gcm_ghash_4bit: [sp,#0..31] copy of rem_4bit,
      # [sp,#32] saved r3 (end-of-input pointer), followed by r4-r11 and lr.
   74 $code=<<___;
   75 .text
   76 .code   32
   77
   78 .type   rem_4bit,%object
   79 .align  5
   80 rem_4bit:
   81 .short  0x0000,0x1C20,0x3840,0x2460
   82 .short  0x7080,0x6CA0,0x48C0,0x54E0
   83 .short  0xE100,0xFD20,0xD940,0xC560
   84 .short  0x9180,0x8DA0,0xA9C0,0xB5E0
   85 .size   rem_4bit,.-rem_4bit
   86
   87 .type   rem_4bit_get,%function
   88 rem_4bit_get:
   89         sub     $rem_4bit,pc,#8
   90         sub     $rem_4bit,$rem_4bit,#32 @ &rem_4bit
   91         b       .Lrem_4bit_got
   92         nop
   93 .size   rem_4bit_get,.-rem_4bit_get
   94
   95 .global gcm_ghash_4bit
   96 .type   gcm_ghash_4bit,%function
   97 gcm_ghash_4bit:
   98         sub     r12,pc,#8
   99         add     $len,$inp,$len          @ $len to point at the end
  100         stmdb   sp!,{r3-r11,lr}         @ save $len/end too
  101         sub     r12,r12,#48             @ &rem_4bit
  102
  103         ldmia   r12,{r4-r11}            @ copy rem_4bit ...
  104         stmdb   sp!,{r4-r11}            @ ... to stack
  105
  106         ldrb    $nlo,[$inp,#15]
  107         ldrb    $nhi,[$Xi,#15]
  108 .Louter:
  109         eor     $nlo,$nlo,$nhi
  110         and     $nhi,$nlo,#0xf0
  111         and     $nlo,$nlo,#0x0f
  112         mov     $cnt,#14
  113
  114         add     $Zhh,$Htbl,$nlo,lsl#4
  115         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
  116         ldrb    $nlo,[$inp,#14]
  117
  118         add     $Thh,$Htbl,$nhi
  119         and     $nhi,$Zll,#0xf          @ rem
  120         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
  121         mov     $nhi,$nhi,lsl#1
  122         eor     $Zll,$Tll,$Zll,lsr#4
  123         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
  124         eor     $Zll,$Zll,$Zlh,lsl#28
  125         ldrb    $nhi,[$Xi,#14]
  126         eor     $Zlh,$Tlh,$Zlh,lsr#4
  127         eor     $Zlh,$Zlh,$Zhl,lsl#28
  128         eor     $Zhl,$Thl,$Zhl,lsr#4
  129         eor     $Zhl,$Zhl,$Zhh,lsl#28
  130         eor     $Zhh,$Thh,$Zhh,lsr#4
  131         eor     $nlo,$nlo,$nhi
  132         eor     $Zhh,$Zhh,$Tll,lsl#16
  133         and     $nhi,$nlo,#0xf0
  134         and     $nlo,$nlo,#0x0f
  135
  136 .Loop:
  137         add     $Thh,$Htbl,$nlo,lsl#4
  138         subs    $cnt,$cnt,#1
  139         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
  140         and     $nlo,$Zll,#0xf          @ rem
  141         add     $nlo,$nlo,$nlo
  142         eor     $Zll,$Tll,$Zll,lsr#4
  143         ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
  144         eor     $Zll,$Zll,$Zlh,lsl#28
  145         eor     $Zlh,$Tlh,$Zlh,lsr#4
  146         eor     $Zlh,$Zlh,$Zhl,lsl#28
  147         eor     $Zhl,$Thl,$Zhl,lsr#4
  148         eor     $Zhl,$Zhl,$Zhh,lsl#28
  149         eor     $Zhh,$Thh,$Zhh,lsr#4
  150         ldrplb  $nlo,[$inp,$cnt]
  151
  152         add     $Thh,$Htbl,$nhi
  153         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
  154         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
  155         and     $nhi,$Zll,#0xf          @ rem
  156         add     $nhi,$nhi,$nhi
  157         eor     $Zll,$Tll,$Zll,lsr#4
  158         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
  159         eor     $Zll,$Zll,$Zlh,lsl#28
  160         ldrplb  $nhi,[$Xi,$cnt]
  161         eor     $Zlh,$Tlh,$Zlh,lsr#4
  162         eor     $Zlh,$Zlh,$Zhl,lsl#28
  163         eor     $Zhl,$Thl,$Zhl,lsr#4
  164         eor     $Zhl,$Zhl,$Zhh,lsl#28
  165         eor     $Zhh,$Thh,$Zhh,lsr#4
  166         eorpl   $nlo,$nlo,$nhi
  167         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
  168         andpl   $nhi,$nlo,#0xf0
  169         andpl   $nlo,$nlo,#0x0f
  170         bpl     .Loop
  171
  172         ldr     $len,[sp,#32]           @ re-load $len/end
  173         add     $inp,$inp,#16
  174         mov     $nhi,$Zll
  175 ___
  176         &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
      # ^ Writes Z back to Xi for gcm_ghash_4bit. The end-of-input compare
      # and the conditional load of the next block's last byte are
      # interleaved with the stores; the mov/strb instructions Zsmash emits
      # do not set flags, so the bne that follows still sees the result of
      # that cmp. $nhi was set to $Zll just before, so its low byte equals
      # the freshly stored Xi[15], which .Louter XORs with the input byte.
      #
      # Next part: the tail of gcm_ghash_4bit and the body of gcm_gmult_4bit.
      #  - "add sp,sp,#36" drops the 32-byte rem_4bit copy plus the saved r3
      #    before r4-r11 and lr are popped.
      #  - Return sequence: "moveq pc,lr" is taken when bit 0 of lr is clear,
      #    otherwise "bx lr" is used; the final substitution pass at the end
      #    of this script replaces "bx lr" with its .word encoding.
      #  - gcm_gmult_4bit runs the same nibble loop as gcm_ghash_4bit, but
      #    takes its bytes from Xi only and reads rem_4bit through $rem_4bit
      #    (set up by the detour through rem_4bit_get) instead of through a
      #    stack copy.
  177 $code.=<<___;
  178         bne     .Louter
  179
  180         add     sp,sp,#36
  181         ldmia   sp!,{r4-r11,lr}
  182         tst     lr,#1
  183         moveq   pc,lr                   @ be binary compatible with V4, yet
  184         bx      lr                      @ interoperable with Thumb ISA:-)
  185 .size   gcm_ghash_4bit,.-gcm_ghash_4bit
  186
  187 .global gcm_gmult_4bit
  188 .type   gcm_gmult_4bit,%function
  189 gcm_gmult_4bit:
  190         stmdb   sp!,{r4-r11,lr}
  191         ldrb    $nlo,[$Xi,#15]
  192         b       rem_4bit_get
  193 .Lrem_4bit_got:
  194         and     $nhi,$nlo,#0xf0
  195         and     $nlo,$nlo,#0x0f
  196         mov     $cnt,#14
  197
  198         add     $Zhh,$Htbl,$nlo,lsl#4
  199         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
  200         ldrb    $nlo,[$Xi,#14]
  201
  202         add     $Thh,$Htbl,$nhi
  203         and     $nhi,$Zll,#0xf          @ rem
  204         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
  205         mov     $nhi,$nhi,lsl#1
  206         eor     $Zll,$Tll,$Zll,lsr#4
  207         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
  208         eor     $Zll,$Zll,$Zlh,lsl#28
  209         eor     $Zlh,$Tlh,$Zlh,lsr#4
  210         eor     $Zlh,$Zlh,$Zhl,lsl#28
  211         eor     $Zhl,$Thl,$Zhl,lsr#4
  212         eor     $Zhl,$Zhl,$Zhh,lsl#28
  213         eor     $Zhh,$Thh,$Zhh,lsr#4
  214         and     $nhi,$nlo,#0xf0
  215         eor     $Zhh,$Zhh,$Tll,lsl#16
  216         and     $nlo,$nlo,#0x0f
  217
  218 .Loop2:
  219         add     $Thh,$Htbl,$nlo,lsl#4
  220         subs    $cnt,$cnt,#1
  221         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
  222         and     $nlo,$Zll,#0xf          @ rem
  223         add     $nlo,$nlo,$nlo
  224         eor     $Zll,$Tll,$Zll,lsr#4
  225         ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
  226         eor     $Zll,$Zll,$Zlh,lsl#28
  227         eor     $Zlh,$Tlh,$Zlh,lsr#4
  228         eor     $Zlh,$Zlh,$Zhl,lsl#28
  229         eor     $Zhl,$Thl,$Zhl,lsr#4
  230         eor     $Zhl,$Zhl,$Zhh,lsl#28
  231         eor     $Zhh,$Thh,$Zhh,lsr#4
  232         ldrplb  $nlo,[$Xi,$cnt]
  233
  234         add     $Thh,$Htbl,$nhi
  235         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
  236         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
  237         and     $nhi,$Zll,#0xf          @ rem
  238         add     $nhi,$nhi,$nhi
  239         eor     $Zll,$Tll,$Zll,lsr#4
  240         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
  241         eor     $Zll,$Zll,$Zlh,lsl#28
  242         eor     $Zlh,$Tlh,$Zlh,lsr#4
  243         eor     $Zlh,$Zlh,$Zhl,lsl#28
  244         eor     $Zhl,$Thl,$Zhl,lsr#4
  245         eor     $Zhl,$Zhl,$Zhh,lsl#28
  246         eor     $Zhh,$Thh,$Zhh,lsr#4
  247         andpl   $nhi,$nlo,#0xf0
  248         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
  249         andpl   $nlo,$nlo,#0x0f
  250         bpl     .Loop2
  251 ___
  252         &Zsmash();
      # ^ Writes Z back to Xi for gcm_gmult_4bit; there is nothing to
      # interleave here, so no instruction strings are passed.
      #
      # Final part: epilogue of gcm_gmult_4bit (pop r4-r11 and lr, then the
      # same "moveq pc,lr" / "bx lr" return sequence as gcm_ghash_4bit) and
      # the identification string. "\@" keeps Perl from interpolating an
      # array inside the heredoc; ".align 2" restores 4-byte alignment after
      # the string.
  253 $code.=<<___;
  254         ldmia   sp!,{r4-r11,lr}
  255         tst     lr,#1
  256         moveq   pc,lr                   @ be binary compatible with V4, yet
  257         bx      lr                      @ interoperable with Thumb ISA:-)
  258 .size   gcm_gmult_4bit,.-gcm_gmult_4bit
  259 .asciz  "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
  260 .align  2
  261 ___
 262
 263 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 264 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 265 print $code;
 266 close STDOUT; # enforce flush