Optimize AES-ECB mode in OpenSSL for both aarch64 and aarch32

author XiaokangQian <xiaokang.qian@arm.com>

Thu, 7 Nov 2019 02:36:45 +0000 (02:36 +0000)

committer Richard Levitte <levitte@openssl.org>

Wed, 11 Dec 2019 17:56:11 +0000 (18:56 +0100)
author XiaokangQian <xiaokang.qian@arm.com>
Thu, 7 Nov 2019 02:36:45 +0000 (02:36 +0000)
committer Richard Levitte <levitte@openssl.org>
Wed, 11 Dec 2019 17:56:11 +0000 (18:56 +0100)
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl

index c132d1d7fe44e130f0482e9a05f252120d4abba2..d0ae34bc48fd0ff75d2d44f1632db8cac3bfed30 100755 (executable)
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -384,6 +384,836 @@ ___
  &gen_block("en");
  &gen_block("de");
  }}}
+
+# Performance in cycles per byte.
+# Measured for AES-ECB with different key sizes.
+# The values before and after the optimization are shown below
+# (before/after):
+#
+#              AES-128-ECB             AES-192-ECB             AES-256-ECB
+# Cortex-A57   1.85/0.82               2.16/0.96               2.47/1.10
+# Cortex-A72   1.64/0.85               1.82/0.99               2.13/1.14
+
+# Optimization is implemented by loop unrolling and interleaving.
+# Commonly, we choose the unrolling factor as 5; if the input
+# data size is smaller than 5 blocks, but not smaller than 3 blocks,
+# we choose 3 as the unrolling factor.
+# If the input data size dsize >= 5*16 bytes, then take 5 blocks
+# as one iteration; on every loop the remaining size lsize -= 5*16.
+# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration;
+# on every loop lsize -= 3*16.
+# If lsize < 3*16 bytes, treat them as the tail and interleave the
+# AES instructions of the two blocks.
+# There is one special case: if the original input data size dsize
+# = 16 bytes, we treat it separately to improve the
+# performance: one independent code block without LR, FP load and
+# store, just like what the original ECB implementation does.
+
+{{{
+my ($inp,$out,$len,$key)=map("x$_",(0..3)); # x0..x3, in that order
+my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); # the code below also names x5 and x6 directly
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); # q0..q7, in that order
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); # aliases of q0, q4 and q5
+
+### q7                 last round key
+### q10-q15, q7        last 7 round keys
+### q8-q9              preloaded round keys, except the last 7 keys, for big sizes
+### q5, q6, q8-q9      preloaded round keys, except the last 7 keys, for the 16-byte-only case
+
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); # q10, q11, q9; this $tmp2 shadows the outer $tmp2 (q6)
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4); # likewise assigned only when $flavour matches /64/
+if ($flavour =~ /64/) {
+    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); # q16..q23 in order; $tmp2 is not reassigned and stays q9
+}
+
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type  ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       subs    $len,$len,#16
+       // Original input data size bigger than 16, jump to big size processing.
+       b.ne    .Lecb_big_size
+       vld1.8  {$dat0},[$inp]
+       cmp     $enc,#0                                 // en- or decrypting?
+       ldr     $rounds,[$key,#240]
+       vld1.32 {q5-q6},[$key],#32                      // load key schedule...
+
+       b.eq .Lecb_small_dec
+       aese    $dat0,q5
+       aesmc   $dat0,$dat0
+       vld1.32 {q8-q9},[$key],#32                      // load key schedule...
+       aese    $dat0,q6
+       aesmc   $dat0,$dat0
+       subs    $rounds,$rounds,#10                     // if rounds==10, jump to aes-128-ecb processing
+       b.eq    .Lecb_128_enc
+.Lecb_round_loop:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       vld1.32 {q8},[$key],#16                         // load key schedule...
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       vld1.32 {q9},[$key],#16                         // load key schedule...
+       subs    $rounds,$rounds,#2                      // bias
+       b.gt    .Lecb_round_loop
+.Lecb_128_enc:
+       vld1.32 {q10-q11},[$key],#32            // load key schedule...
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       vld1.32 {q12-q13},[$key],#32            // load key schedule...
+       aese    $dat0,q10
+       aesmc   $dat0,$dat0
+       aese    $dat0,q11
+       aesmc   $dat0,$dat0
+       vld1.32 {q14-q15},[$key],#32            // load key schedule...
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       vld1.32 {$rndlast},[$key]
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat0,q15
+       veor    $dat0,$dat0,$rndlast
+       vst1.8  {$dat0},[$out]
+       b       .Lecb_Final_abort
+.Lecb_small_dec:
+       aesd    $dat0,q5
+       aesimc  $dat0,$dat0
+       vld1.32 {q8-q9},[$key],#32                      // load key schedule...
+       aesd    $dat0,q6
+       aesimc  $dat0,$dat0
+       subs    $rounds,$rounds,#10                     // bias
+       b.eq    .Lecb_128_dec
+.Lecb_dec_round_loop:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       vld1.32 {q8},[$key],#16                         // load key schedule...
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       vld1.32 {q9},[$key],#16                         // load key schedule...
+       subs    $rounds,$rounds,#2                      // bias
+       b.gt    .Lecb_dec_round_loop
+.Lecb_128_dec:
+       vld1.32 {q10-q11},[$key],#32            // load key schedule...
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       vld1.32 {q12-q13},[$key],#32            // load key schedule...
+       aesd    $dat0,q10
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q11
+       aesimc  $dat0,$dat0
+       vld1.32 {q14-q15},[$key],#32            // load key schedule...
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       vld1.32 {$rndlast},[$key]
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q15
+       veor    $dat0,$dat0,$rndlast
+       vst1.8  {$dat0},[$out]
+       b       .Lecb_Final_abort
+.Lecb_big_size:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+___
+$code.=<<___   if ($flavour !~ /64/);
+       mov     ip,sp
+       stmdb   sp!,{r4-r8,lr}
+       vstmdb  sp!,{d8-d15}                    @ ABI specification says so
+       ldmia   ip,{r4-r5}                      @ load remaining args
+       subs    $len,$len,#16
+___
+$code.=<<___;
+       mov     $step,#16
+       b.lo    .Lecb_done
+       cclr    $step,eq
+
+       cmp     $enc,#0                                 // en- or decrypting?
+       ldr     $rounds,[$key,#240]
+       and     $len,$len,#-16
+       vld1.8  {$dat},[$inp],$step
+
+       vld1.32 {q8-q9},[$key]                          // load key schedule...
+       sub     $rounds,$rounds,#6
+       add     $key_,$key,x5,lsl#4                             // pointer to last 7 round keys
+       sub     $rounds,$rounds,#2
+       vld1.32 {q10-q11},[$key_],#32
+       vld1.32 {q12-q13},[$key_],#32
+       vld1.32 {q14-q15},[$key_],#32
+       vld1.32 {$rndlast},[$key_]
+
+       add     $key_,$key,#32
+       mov     $cnt,$rounds
+       b.eq    .Lecb_dec
+
+       vld1.8  {$dat1},[$inp],#16
+       subs    $len,$len,#32                           // bias
+       add     $cnt,$rounds,#2
+       vorr    $in1,$dat1,$dat1
+       vorr    $dat2,$dat1,$dat1
+       vorr    $dat1,$dat,$dat
+       b.lo    .Lecb_enc_tail
+
+       vorr    $dat1,$in1,$in1
+       vld1.8  {$dat2},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+       cmp     $len,#32
+       b.lo    .Loop3x_ecb_enc
+
+       vld1.8  {$dat3},[$inp],#16
+       vld1.8  {$dat4},[$inp],#16
+       sub     $len,$len,#32                           // bias
+       mov     $cnt,$rounds
+
+.Loop5x_ecb_enc:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat3,q8
+       aesmc   $dat3,$dat3
+       aese    $dat4,q8
+       aesmc   $dat4,$dat4
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat3,q9
+       aesmc   $dat3,$dat3
+       aese    $dat4,q9
+       aesmc   $dat4,$dat4
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop5x_ecb_enc
+
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat3,q8
+       aesmc   $dat3,$dat3
+       aese    $dat4,q8
+       aesmc   $dat4,$dat4
+       cmp     $len,#0x40                                      // because .Lecb_enc_tail4x
+       sub     $len,$len,#0x50
+
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat3,q9
+       aesmc   $dat3,$dat3
+       aese    $dat4,q9
+       aesmc   $dat4,$dat4
+       csel    x6,xzr,$len,gt                  // borrow x6, $cnt, "gt" is not typo
+       mov     $key_,$key
+
+       aese    $dat0,q10
+       aesmc   $dat0,$dat0
+       aese    $dat1,q10
+       aesmc   $dat1,$dat1
+       aese    $dat2,q10
+       aesmc   $dat2,$dat2
+       aese    $dat3,q10
+       aesmc   $dat3,$dat3
+       aese    $dat4,q10
+       aesmc   $dat4,$dat4
+       add     $inp,$inp,x6                            // $inp is adjusted in such way that
+                                                       // at exit from the loop $dat1-$dat4
+                                                       // are loaded with last "words"
+       add     x6,$len,#0x60               // because .Lecb_enc_tail4x
+
+       aese    $dat0,q11
+       aesmc   $dat0,$dat0
+       aese    $dat1,q11
+       aesmc   $dat1,$dat1
+       aese    $dat2,q11
+       aesmc   $dat2,$dat2
+       aese    $dat3,q11
+       aesmc   $dat3,$dat3
+       aese    $dat4,q11
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       aese    $dat3,q12
+       aesmc   $dat3,$dat3
+       aese    $dat4,q12
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       aese    $dat3,q13
+       aesmc   $dat3,$dat3
+       aese    $dat4,q13
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       aese    $dat3,q14
+       aesmc   $dat3,$dat3
+       aese    $dat4,q14
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q15
+       vld1.8  {$in0},[$inp],#16
+       aese    $dat1,q15
+       vld1.8  {$in1},[$inp],#16
+       aese    $dat2,q15
+       vld1.8  {$in2},[$inp],#16
+       aese    $dat3,q15
+       vld1.8  {$in3},[$inp],#16
+       aese    $dat4,q15
+       vld1.8  {$in4},[$inp],#16
+       cbz     x6,.Lecb_enc_tail4x
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       veor    $tmp0,$rndlast,$dat0
+       vorr    $dat0,$in0,$in0
+       veor    $tmp1,$rndlast,$dat1
+       vorr    $dat1,$in1,$in1
+       veor    $tmp2,$rndlast,$dat2
+       vorr    $dat2,$in2,$in2
+       veor    $tmp3,$rndlast,$dat3
+       vorr    $dat3,$in3,$in3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat4,$in4,$in4
+       vst1.8  {$tmp1},[$out],#16
+       mov     $cnt,$rounds
+       vst1.8  {$tmp2},[$out],#16
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+       b.hs    .Loop5x_ecb_enc
+
+       add     $len,$len,#0x50
+       cbz     $len,.Lecb_done
+
+       add     $cnt,$rounds,#2
+       subs    $len,$len,#0x30
+       vorr    $dat0,$in2,$in2
+       vorr    $dat1,$in3,$in3
+       vorr    $dat2,$in4,$in4
+       b.lo    .Lecb_enc_tail
+
+       b       .Loop3x_ecb_enc
+
+.align 4
+.Lecb_enc_tail4x:
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       veor    $tmp3,$rndlast,$dat3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+
+       b       .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_enc:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop3x_ecb_enc
+
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       subs    $len,$len,#0x30
+       mov.lo  x6,$len                         // x6, $cnt, is zero at this point
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       add     $inp,$inp,x6                    // $inp is adjusted in such way that
+                                               // at exit from the loop $dat1-$dat2
+                                               // are loaded with last "words"
+       mov     $key_,$key
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       vld1.8  {$in0},[$inp],#16
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       vld1.8  {$in1},[$inp],#16
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       vld1.8  {$in2},[$inp],#16
+       aese    $dat0,q15
+       aese    $dat1,q15
+       aese    $dat2,q15
+       vld1.32 {q8},[$key_],#16                // re-pre-load rndkey[0]
+       add     $cnt,$rounds,#2
+       veor    $tmp0,$rndlast,$dat0
+       veor    $tmp1,$rndlast,$dat1
+       veor    $dat2,$dat2,$rndlast
+       vld1.32 {q9},[$key_],#16                // re-pre-load rndkey[1]
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat0,$in0,$in0
+       vst1.8  {$tmp1},[$out],#16
+       vorr    $dat1,$in1,$in1
+       vst1.8  {$dat2},[$out],#16
+       vorr    $dat2,$in2,$in2
+       b.hs    .Loop3x_ecb_enc
+
+       cmn     $len,#0x30
+       b.eq    .Lecb_done
+       nop
+
+.Lecb_enc_tail:
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Lecb_enc_tail
+
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       cmn     $len,#0x20
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       aese    $dat1,q15
+       aese    $dat2,q15
+       b.eq    .Lecb_enc_one
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       b       .Lecb_done
+
+.Lecb_enc_one:
+       veor    $tmp1,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       b       .Lecb_done
+___
+
+$code.=<<___;
+.align 5
+.Lecb_dec:
+       vld1.8  {$dat1},[$inp],#16
+       subs    $len,$len,#32                   // bias
+       add     $cnt,$rounds,#2
+       vorr    $in1,$dat1,$dat1
+       vorr    $dat2,$dat1,$dat1
+       vorr    $dat1,$dat,$dat
+       b.lo    .Lecb_dec_tail
+
+       vorr    $dat1,$in1,$in1
+       vld1.8  {$dat2},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+       cmp     $len,#32
+       b.lo    .Loop3x_ecb_dec
+
+       vld1.8  {$dat3},[$inp],#16
+       vld1.8  {$dat4},[$inp],#16
+       sub     $len,$len,#32                           // bias
+       mov     $cnt,$rounds
+
+.Loop5x_ecb_dec:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q8
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q8
+       aesimc  $dat4,$dat4
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q9
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q9
+       aesimc  $dat4,$dat4
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop5x_ecb_dec
+
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q8
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q8
+       aesimc  $dat4,$dat4
+       cmp     $len,#0x40                              // because .Lecb_tail4x
+       sub     $len,$len,#0x50
+
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q9
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q9
+       aesimc  $dat4,$dat4
+       csel    x6,xzr,$len,gt          // borrow x6, $cnt, "gt" is not typo
+       mov     $key_,$key
+
+       aesd    $dat0,q10
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q10
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q10
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q10
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q10
+       aesimc  $dat4,$dat4
+       add     $inp,$inp,x6                            // $inp is adjusted in such way that
+                                                       // at exit from the loop $dat1-$dat4
+                                                       // are loaded with last "words"
+       add     x6,$len,#0x60                   // because .Lecb_tail4x
+
+       aesd    $dat0,q11
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q11
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q11
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q11
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q11
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q12
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q12
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q13
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q13
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q14
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q14
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q15
+       vld1.8  {$in0},[$inp],#16
+       aesd    $dat1,q15
+       vld1.8  {$in1},[$inp],#16
+       aesd    $dat2,q15
+       vld1.8  {$in2},[$inp],#16
+       aesd    $dat3,q15
+       vld1.8  {$in3},[$inp],#16
+       aesd    $dat4,q15
+       vld1.8  {$in4},[$inp],#16
+       cbz     x6,.Lecb_tail4x
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       veor    $tmp0,$rndlast,$dat0
+       vorr    $dat0,$in0,$in0
+       veor    $tmp1,$rndlast,$dat1
+       vorr    $dat1,$in1,$in1
+       veor    $tmp2,$rndlast,$dat2
+       vorr    $dat2,$in2,$in2
+       veor    $tmp3,$rndlast,$dat3
+       vorr    $dat3,$in3,$in3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat4,$in4,$in4
+       vst1.8  {$tmp1},[$out],#16
+       mov     $cnt,$rounds
+       vst1.8  {$tmp2},[$out],#16
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+       b.hs    .Loop5x_ecb_dec
+
+       add     $len,$len,#0x50
+       cbz     $len,.Lecb_done
+
+       add     $cnt,$rounds,#2
+       subs    $len,$len,#0x30
+       vorr    $dat0,$in2,$in2
+       vorr    $dat1,$in3,$in3
+       vorr    $dat2,$in4,$in4
+       b.lo    .Lecb_dec_tail
+
+       b       .Loop3x_ecb_dec
+
+.align 4
+.Lecb_tail4x:
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       veor    $tmp3,$rndlast,$dat3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+
+       b       .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_dec:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop3x_ecb_dec
+
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       subs    $len,$len,#0x30
+       mov.lo  x6,$len                         // x6, $cnt, is zero at this point
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       add     $inp,$inp,x6                    // $inp is adjusted in such way that
+                                               // at exit from the loop $dat1-$dat2
+                                               // are loaded with last "words"
+       mov     $key_,$key
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       vld1.8  {$in0},[$inp],#16
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       vld1.8  {$in1},[$inp],#16
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       vld1.8  {$in2},[$inp],#16
+       aesd    $dat0,q15
+       aesd    $dat1,q15
+       aesd    $dat2,q15
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       add     $cnt,$rounds,#2
+       veor    $tmp0,$rndlast,$dat0
+       veor    $tmp1,$rndlast,$dat1
+       veor    $dat2,$dat2,$rndlast
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat0,$in0,$in0
+       vst1.8  {$tmp1},[$out],#16
+       vorr    $dat1,$in1,$in1
+       vst1.8  {$dat2},[$out],#16
+       vorr    $dat2,$in2,$in2
+       b.hs    .Loop3x_ecb_dec
+
+       cmn     $len,#0x30
+       b.eq    .Lecb_done
+       nop
+
+.Lecb_dec_tail:
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Lecb_dec_tail
+
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       cmn     $len,#0x20
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q15
+       aesd    $dat2,q15
+       b.eq    .Lecb_dec_one
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       b       .Lecb_done
+
+.Lecb_dec_one:
+       veor    $tmp1,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+
+.Lecb_done:
+___
+}
+$code.=<<___   if ($flavour !~ /64/);
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r8,pc}
+___
+$code.=<<___   if ($flavour =~ /64/);
+       ldr     x29,[sp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+.Lecb_Final_abort:
+       ret
+___
+$code.=<<___;
+.size  ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
  {{{
  my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
  my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
diff --git a/include/crypto/ciphermode_platform.h b/include/crypto/ciphermode_platform.h

index b1868e1a6df2288e28181917dfa4340a152a551d..03afd719af0612dd7bd5c6ed8d5c99a78ad104a6 100644 (file)
--- a/include/crypto/ciphermode_platform.h
+++ b/include/crypto/ciphermode_platform.h
@@ -89,6 +89,7 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
  #    define HWAES_encrypt aes_v8_encrypt
  #    define HWAES_decrypt aes_v8_decrypt
  #    define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+#    define HWAES_ecb_encrypt aes_v8_ecb_encrypt
  #    define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
  #   endif
  #  endif
@@ -411,6 +412,9 @@ void HWAES_decrypt(const unsigned char *in, unsigned char *out,
  void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
                         size_t length, const AES_KEY *key,
                         unsigned char *ivec, const int enc);
+void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const AES_KEY *key,
+                       const int enc);
  void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                  size_t len, const AES_KEY *key,
                                  const unsigned char ivec[16]);
diff --git a/include/openssl/modes.h b/include/openssl/modes.h

index 1c672d2a465dfef396f955823642f3e0b2ab1a86..e19079912ba24f1876016db92efd475cbeb375b8 100644 (file)
--- a/include/openssl/modes.h
+++ b/include/openssl/modes.h
@@ -29,6 +29,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
                            size_t len, const void *key,
                            unsigned char ivec[16], int enc);
  
+typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out,
+                          size_t len, const void *key,
+                          int enc);
+
  typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
                            size_t blocks, const void *key,
                            const unsigned char ivec[16]);
diff --git a/providers/implementations/ciphers/cipher_aes_hw.c b/providers/implementations/ciphers/cipher_aes_hw.c

index bc733ee16acda714e4ef17e7602b81010b25647b..e9a7c31f98908251b1c95f9fff7d086d1717a56b 100644 (file)
--- a/providers/implementations/ciphers/cipher_aes_hw.c
+++ b/providers/implementations/ciphers/cipher_aes_hw.c
@@ -29,6 +29,10 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
  # ifdef HWAES_cbc_encrypt
              if (dat->mode == EVP_CIPH_CBC_MODE)
                  dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+# endif
+# ifdef HWAES_ecb_encrypt
+            if (dat->mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
  # endif
          } else
  #endif
@@ -64,6 +68,11 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
              dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
          else
  # endif
+# ifdef HWAES_ecb_encrypt
+        if (dat->mode == EVP_CIPH_ECB_MODE)
+            dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
+        else
+# endif
  # ifdef HWAES_ctr32_encrypt_blocks
          if (dat->mode == EVP_CIPH_CTR_MODE)
              dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
diff --git a/providers/implementations/ciphers/ciphercommon_hw.c b/providers/implementations/ciphers/ciphercommon_hw.c

index f1c466edc8faa7b5ad714221b0da7f2c5354974f..31062f1bc666ea968f5ff17ecb4e4154174ddedf 100644 (file)
--- a/providers/implementations/ciphers/ciphercommon_hw.c
+++ b/providers/implementations/ciphers/ciphercommon_hw.c
@@ -34,8 +34,13 @@ int cipher_hw_generic_ecb(PROV_CIPHER_CTX *dat, unsigned char *out,
      if (len < bl)
          return 1;
  
-    for (i = 0, len -= bl; i <= len; i += bl)
-        (*dat->block) (in + i, out + i, dat->ks);
+    if (dat->stream.ecb) {
+        (*dat->stream.ecb) (in, out, len, dat->ks, dat->enc);
+    }
+    else {
+        for (i = 0, len -= bl; i <= len; i += bl)
+            (*dat->block) (in + i, out + i, dat->ks);
+    }
  
      return 1;
  }
diff --git a/providers/implementations/include/prov/ciphercommon.h b/providers/implementations/include/prov/ciphercommon.h

index 1072b577fe41b79b25f75c84bccbd4b5e0eb5ea0..bf77a4021e85f29276fb2bc7afe654c72b2e8c3a 100644 (file)
--- a/providers/implementations/include/prov/ciphercommon.h
+++ b/providers/implementations/include/prov/ciphercommon.h
@@ -37,6 +37,7 @@ struct prov_cipher_ctx_st {
      union {
          cbc128_f cbc;
          ctr128_f ctr;
+        ecb128_f ecb;
      } stream;
  
      unsigned int mode;
author	XiaokangQian <xiaokang.qian@arm.com>
	Thu, 7 Nov 2019 02:36:45 +0000 (02:36 +0000)
committer	Richard Levitte <levitte@openssl.org>
	Wed, 11 Dec 2019 17:56:11 +0000 (18:56 +0100)
crypto/aes/asm/aesv8-armx.pl		patch \| blob \| history
include/crypto/ciphermode_platform.h		patch \| blob \| history
include/openssl/modes.h		patch \| blob \| history
providers/implementations/ciphers/cipher_aes_hw.c		patch \| blob \| history
providers/implementations/ciphers/ciphercommon_hw.c		patch \| blob \| history
providers/implementations/include/prov/ciphercommon.h		patch \| blob \| history