Optimize AES-ECB mode in OpenSSL for both aarch64 and aarch32
authorXiaokangQian <xiaokang.qian@arm.com>
Thu, 7 Nov 2019 02:36:45 +0000 (02:36 +0000)
committerRichard Levitte <levitte@openssl.org>
Wed, 11 Dec 2019 17:56:11 +0000 (18:56 +0100)
Aes-ecb mode can be optimized by inverleaving cipher operation on
several blocks and loop unrolling. Interleaving needs one ideal
unrolling factor, here we adopt the same factor with aes-cbc,
which is described as below:
    If blocks number > 5, select 5 blocks as one iteration,every
    loop, decrease the blocks number by 5.
    If 3 < left blocks < 5 select 3 blocks as one iteration, every
    loop, decrease the block number by 3.
    If left blocks < 3, treat them as tail blocks.
Detailed implementation will have a little adjustment for squeezing
code space.
With this way, for small size such as 16 bytes, the performance is
similar as before, but for big size such as 16k bytes, the performance
improves a lot, even reaches to 100%, for some arches such as A57,
the improvement  even exceeds 100%. The following table will list the
encryption performance data on aarch64, take a72 and a57 as examples.
Performance value takes the unit of cycles per byte, takes the format
as comparision of values. List them as below:

A72:
                            Before optimization     After optimization  Improve
evp-aes-128-ecb@16          17.26538237             16.82663866         2.61%
evp-aes-128-ecb@64          5.50528499              5.222637557         5.41%
evp-aes-128-ecb@256         2.632700213             1.908442892         37.95%
evp-aes-128-ecb@1024        1.876102047             1.078018868         74.03%
evp-aes-128-ecb@8192        1.6550392               0.853982929         93.80%
evp-aes-128-ecb@16384       1.636871283             0.847623957         93.11%
evp-aes-192-ecb@16          17.73104961             17.09692468         3.71%
evp-aes-192-ecb@64          5.78984398              5.418545192         6.85%
evp-aes-192-ecb@256         2.872005308             2.081815274         37.96%
evp-aes-192-ecb@1024        2.083226672             1.25095642          66.53%
evp-aes-192-ecb@8192        1.831992057             0.995916251         83.95%
evp-aes-192-ecb@16384       1.821590009             0.993820525         83.29%
evp-aes-256-ecb@16          18.0606306              17.96963317         0.51%
evp-aes-256-ecb@64          6.19651997              5.762465812         7.53%
evp-aes-256-ecb@256         3.176991394             2.24642538          41.42%
evp-aes-256-ecb@1024        2.385991919             1.396018192         70.91%
evp-aes-256-ecb@8192        2.147862636             1.142222597         88.04%
evp-aes-256-ecb@16384       2.131361787             1.135944617         87.63%

A57:
                            Before optimization     After optimization  Improve
evp-aes-128-ecb@16          18.61045121             18.36456218         1.34%
evp-aes-128-ecb@64          6.438628994             5.467959461         17.75%
evp-aes-128-ecb@256         2.957452881             1.97238604          49.94%
evp-aes-128-ecb@1024        2.117096219             1.099665054         92.52%
evp-aes-128-ecb@8192        1.868385973             0.837440804         123.11%
evp-aes-128-ecb@16384       1.853078526             0.822420027         125.32%
evp-aes-192-ecb@16          19.07021756             18.50018552         3.08%
evp-aes-192-ecb@64          6.672351486             5.696088921         17.14%
evp-aes-192-ecb@256         3.260427769             2.131449916         52.97%
evp-aes-192-ecb@1024        2.410522832             1.250529718         92.76%
evp-aes-192-ecb@8192        2.17921605              0.973225504         123.92%
evp-aes-192-ecb@16384       2.162250997             0.95919871          125.42%
evp-aes-256-ecb@16          19.3008384              19.12743654         0.91%
evp-aes-256-ecb@64          6.992950658             5.92149541          18.09%
evp-aes-256-ecb@256         3.576361743             2.287619504         56.34%
evp-aes-256-ecb@1024        2.726671027             1.381267599         97.40%
evp-aes-256-ecb@8192        2.493583657             1.110959913         124.45%
evp-aes-256-ecb@16384       2.473916816             1.099967073         124.91%

Change-Id: Iccd23d972e0d52d22dc093f4c208f69c9d5a0ca7

Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/10518)

crypto/aes/asm/aesv8-armx.pl
include/crypto/ciphermode_platform.h
include/openssl/modes.h
providers/implementations/ciphers/cipher_aes_hw.c
providers/implementations/ciphers/ciphercommon_hw.c
providers/implementations/include/prov/ciphercommon.h

index c132d1d7fe44e130f0482e9a05f252120d4abba2..d0ae34bc48fd0ff75d2d44f1632db8cac3bfed30 100755 (executable)
@@ -384,6 +384,836 @@ ___
 &gen_block("en");
 &gen_block("de");
 }}}
+
+# Performance in cycles per byte.
+# Processed with AES-ECB different key size.
+# It shows the value before and after optimization as below:
+# (before/after):
+#
+#              AES-128-ECB             AES-192-ECB             AES-256-ECB
+# Cortex-A57   1.85/0.82               2.16/0.96               2.47/1.10
+# Cortex-A72   1.64/0.85               1.82/0.99               2.13/1.14
+
+# Optimization is implemented by loop unrolling and interleaving.
+# Commonly, we choose the unrolling factor as 5, if the input
+# data size smaller than 5 blocks, but not smaller than 3 blocks,
+# choose 3 as the unrolling factor.
+# If the input data size dsize >= 5*16 bytes, then take 5 blocks
+# as one iteration, every loop the left size lsize -= 5*16.
+# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
+# every loop lsize -=3*16.
+# If lsize < 3*16 bytes, treat them as the tail, interleave the
+# two blocks AES instructions.
+# There is one special case, if the original input data size dsize
+# = 16 bytes, we will treat it seperately to improve the
+# performance: one independent code block without LR, FP load and
+# store, just looks like what the original ECB implementation does.
+
+{{{
+my ($inp,$out,$len,$key)=map("x$_",(0..3));
+my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q7 last round key
+### q10-q15    q7 Last 7 round keys
+### q8-q9      preloaded round keys except last 7 keys for big size
+### q5, q6, q8-q9      preloaded round keys except last 7 keys for only 16 byte
+
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type  ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       subs    $len,$len,#16
+       // Original input data size bigger than 16, jump to big size processing.
+       b.ne    .Lecb_big_size
+       vld1.8  {$dat0},[$inp]
+       cmp     $enc,#0                                 // en- or decrypting?
+       ldr     $rounds,[$key,#240]
+       vld1.32 {q5-q6},[$key],#32                      // load key schedule...
+
+       b.eq .Lecb_small_dec
+       aese    $dat0,q5
+       aesmc   $dat0,$dat0
+       vld1.32 {q8-q9},[$key],#32                      // load key schedule...
+       aese    $dat0,q6
+       aesmc   $dat0,$dat0
+       subs    $rounds,$rounds,#10                     // if rounds==10, jump to aes-128-ecb processing
+       b.eq    .Lecb_128_enc
+.Lecb_round_loop:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       vld1.32 {q8},[$key],#16                         // load key schedule...
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       vld1.32 {q9},[$key],#16                         // load key schedule...
+       subs    $rounds,$rounds,#2                      // bias
+       b.gt    .Lecb_round_loop
+.Lecb_128_enc:
+       vld1.32 {q10-q11},[$key],#32            // load key schedule...
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       vld1.32 {q12-q13},[$key],#32            // load key schedule...
+       aese    $dat0,q10
+       aesmc   $dat0,$dat0
+       aese    $dat0,q11
+       aesmc   $dat0,$dat0
+       vld1.32 {q14-q15},[$key],#32            // load key schedule...
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       vld1.32 {$rndlast},[$key]
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat0,q15
+       veor    $dat0,$dat0,$rndlast
+       vst1.8  {$dat0},[$out]
+       b       .Lecb_Final_abort
+.Lecb_small_dec:
+       aesd    $dat0,q5
+       aesimc  $dat0,$dat0
+       vld1.32 {q8-q9},[$key],#32                      // load key schedule...
+       aesd    $dat0,q6
+       aesimc  $dat0,$dat0
+       subs    $rounds,$rounds,#10                     // bias
+       b.eq    .Lecb_128_dec
+.Lecb_dec_round_loop:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       vld1.32 {q8},[$key],#16                         // load key schedule...
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       vld1.32 {q9},[$key],#16                         // load key schedule...
+       subs    $rounds,$rounds,#2                      // bias
+       b.gt    .Lecb_dec_round_loop
+.Lecb_128_dec:
+       vld1.32 {q10-q11},[$key],#32            // load key schedule...
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       vld1.32 {q12-q13},[$key],#32            // load key schedule...
+       aesd    $dat0,q10
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q11
+       aesimc  $dat0,$dat0
+       vld1.32 {q14-q15},[$key],#32            // load key schedule...
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       vld1.32 {$rndlast},[$key]
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat0,q15
+       veor    $dat0,$dat0,$rndlast
+       vst1.8  {$dat0},[$out]
+       b       .Lecb_Final_abort
+.Lecb_big_size:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+___
+$code.=<<___   if ($flavour !~ /64/);
+       mov     ip,sp
+       stmdb   sp!,{r4-r8,lr}
+       vstmdb  sp!,{d8-d15}                    @ ABI specification says so
+       ldmia   ip,{r4-r5}                      @ load remaining args
+       subs    $len,$len,#16
+___
+$code.=<<___;
+       mov     $step,#16
+       b.lo    .Lecb_done
+       cclr    $step,eq
+
+       cmp     $enc,#0                                 // en- or decrypting?
+       ldr     $rounds,[$key,#240]
+       and     $len,$len,#-16
+       vld1.8  {$dat},[$inp],$step
+
+       vld1.32 {q8-q9},[$key]                          // load key schedule...
+       sub     $rounds,$rounds,#6
+       add     $key_,$key,x5,lsl#4                             // pointer to last 7 round keys
+       sub     $rounds,$rounds,#2
+       vld1.32 {q10-q11},[$key_],#32
+       vld1.32 {q12-q13},[$key_],#32
+       vld1.32 {q14-q15},[$key_],#32
+       vld1.32 {$rndlast},[$key_]
+
+       add     $key_,$key,#32
+       mov     $cnt,$rounds
+       b.eq    .Lecb_dec
+
+       vld1.8  {$dat1},[$inp],#16
+       subs    $len,$len,#32                           // bias
+       add     $cnt,$rounds,#2
+       vorr    $in1,$dat1,$dat1
+       vorr    $dat2,$dat1,$dat1
+       vorr    $dat1,$dat,$dat
+       b.lo    .Lecb_enc_tail
+
+       vorr    $dat1,$in1,$in1
+       vld1.8  {$dat2},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+       cmp     $len,#32
+       b.lo    .Loop3x_ecb_enc
+
+       vld1.8  {$dat3},[$inp],#16
+       vld1.8  {$dat4},[$inp],#16
+       sub     $len,$len,#32                           // bias
+       mov     $cnt,$rounds
+
+.Loop5x_ecb_enc:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat3,q8
+       aesmc   $dat3,$dat3
+       aese    $dat4,q8
+       aesmc   $dat4,$dat4
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat3,q9
+       aesmc   $dat3,$dat3
+       aese    $dat4,q9
+       aesmc   $dat4,$dat4
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop5x_ecb_enc
+
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat3,q8
+       aesmc   $dat3,$dat3
+       aese    $dat4,q8
+       aesmc   $dat4,$dat4
+       cmp     $len,#0x40                                      // because .Lecb_enc_tail4x
+       sub     $len,$len,#0x50
+
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat3,q9
+       aesmc   $dat3,$dat3
+       aese    $dat4,q9
+       aesmc   $dat4,$dat4
+       csel    x6,xzr,$len,gt                  // borrow x6, $cnt, "gt" is not typo
+       mov     $key_,$key
+
+       aese    $dat0,q10
+       aesmc   $dat0,$dat0
+       aese    $dat1,q10
+       aesmc   $dat1,$dat1
+       aese    $dat2,q10
+       aesmc   $dat2,$dat2
+       aese    $dat3,q10
+       aesmc   $dat3,$dat3
+       aese    $dat4,q10
+       aesmc   $dat4,$dat4
+       add     $inp,$inp,x6                            // $inp is adjusted in such way that
+                                                       // at exit from the loop $dat1-$dat4
+                                                       // are loaded with last "words"
+       add     x6,$len,#0x60               // because .Lecb_enc_tail4x
+
+       aese    $dat0,q11
+       aesmc   $dat0,$dat0
+       aese    $dat1,q11
+       aesmc   $dat1,$dat1
+       aese    $dat2,q11
+       aesmc   $dat2,$dat2
+       aese    $dat3,q11
+       aesmc   $dat3,$dat3
+       aese    $dat4,q11
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       aese    $dat3,q12
+       aesmc   $dat3,$dat3
+       aese    $dat4,q12
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       aese    $dat3,q13
+       aesmc   $dat3,$dat3
+       aese    $dat4,q13
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       aese    $dat3,q14
+       aesmc   $dat3,$dat3
+       aese    $dat4,q14
+       aesmc   $dat4,$dat4
+
+       aese    $dat0,q15
+       vld1.8  {$in0},[$inp],#16
+       aese    $dat1,q15
+       vld1.8  {$in1},[$inp],#16
+       aese    $dat2,q15
+       vld1.8  {$in2},[$inp],#16
+       aese    $dat3,q15
+       vld1.8  {$in3},[$inp],#16
+       aese    $dat4,q15
+       vld1.8  {$in4},[$inp],#16
+       cbz     x6,.Lecb_enc_tail4x
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       veor    $tmp0,$rndlast,$dat0
+       vorr    $dat0,$in0,$in0
+       veor    $tmp1,$rndlast,$dat1
+       vorr    $dat1,$in1,$in1
+       veor    $tmp2,$rndlast,$dat2
+       vorr    $dat2,$in2,$in2
+       veor    $tmp3,$rndlast,$dat3
+       vorr    $dat3,$in3,$in3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat4,$in4,$in4
+       vst1.8  {$tmp1},[$out],#16
+       mov     $cnt,$rounds
+       vst1.8  {$tmp2},[$out],#16
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+       b.hs    .Loop5x_ecb_enc
+
+       add     $len,$len,#0x50
+       cbz     $len,.Lecb_done
+
+       add     $cnt,$rounds,#2
+       subs    $len,$len,#0x30
+       vorr    $dat0,$in2,$in2
+       vorr    $dat1,$in3,$in3
+       vorr    $dat2,$in4,$in4
+       b.lo    .Lecb_enc_tail
+
+       b       .Loop3x_ecb_enc
+
+.align 4
+.Lecb_enc_tail4x:
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       veor    $tmp3,$rndlast,$dat3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+
+       b       .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_enc:
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop3x_ecb_enc
+
+       aese    $dat0,q8
+       aesmc   $dat0,$dat0
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       subs    $len,$len,#0x30
+       mov.lo  x6,$len                         // x6, $cnt, is zero at this point
+       aese    $dat0,q9
+       aesmc   $dat0,$dat0
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       add     $inp,$inp,x6                    // $inp is adjusted in such way that
+                                               // at exit from the loop $dat1-$dat2
+                                               // are loaded with last "words"
+       mov     $key_,$key
+       aese    $dat0,q12
+       aesmc   $dat0,$dat0
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       vld1.8  {$in0},[$inp],#16
+       aese    $dat0,q13
+       aesmc   $dat0,$dat0
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       vld1.8  {$in1},[$inp],#16
+       aese    $dat0,q14
+       aesmc   $dat0,$dat0
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       vld1.8  {$in2},[$inp],#16
+       aese    $dat0,q15
+       aese    $dat1,q15
+       aese    $dat2,q15
+       vld1.32 {q8},[$key_],#16                // re-pre-load rndkey[0]
+       add     $cnt,$rounds,#2
+       veor    $tmp0,$rndlast,$dat0
+       veor    $tmp1,$rndlast,$dat1
+       veor    $dat2,$dat2,$rndlast
+       vld1.32 {q9},[$key_],#16                // re-pre-load rndkey[1]
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat0,$in0,$in0
+       vst1.8  {$tmp1},[$out],#16
+       vorr    $dat1,$in1,$in1
+       vst1.8  {$dat2},[$out],#16
+       vorr    $dat2,$in2,$in2
+       b.hs    .Loop3x_ecb_enc
+
+       cmn     $len,#0x30
+       b.eq    .Lecb_done
+       nop
+
+.Lecb_enc_tail:
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Lecb_enc_tail
+
+       aese    $dat1,q8
+       aesmc   $dat1,$dat1
+       aese    $dat2,q8
+       aesmc   $dat2,$dat2
+       aese    $dat1,q9
+       aesmc   $dat1,$dat1
+       aese    $dat2,q9
+       aesmc   $dat2,$dat2
+       aese    $dat1,q12
+       aesmc   $dat1,$dat1
+       aese    $dat2,q12
+       aesmc   $dat2,$dat2
+       cmn     $len,#0x20
+       aese    $dat1,q13
+       aesmc   $dat1,$dat1
+       aese    $dat2,q13
+       aesmc   $dat2,$dat2
+       aese    $dat1,q14
+       aesmc   $dat1,$dat1
+       aese    $dat2,q14
+       aesmc   $dat2,$dat2
+       aese    $dat1,q15
+       aese    $dat2,q15
+       b.eq    .Lecb_enc_one
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       b       .Lecb_done
+
+.Lecb_enc_one:
+       veor    $tmp1,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       b       .Lecb_done
+___
+
+$code.=<<___;
+.align 5
+.Lecb_dec:
+       vld1.8  {$dat1},[$inp],#16
+       subs    $len,$len,#32                   // bias
+       add     $cnt,$rounds,#2
+       vorr    $in1,$dat1,$dat1
+       vorr    $dat2,$dat1,$dat1
+       vorr    $dat1,$dat,$dat
+       b.lo    .Lecb_dec_tail
+
+       vorr    $dat1,$in1,$in1
+       vld1.8  {$dat2},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+       cmp     $len,#32
+       b.lo    .Loop3x_ecb_dec
+
+       vld1.8  {$dat3},[$inp],#16
+       vld1.8  {$dat4},[$inp],#16
+       sub     $len,$len,#32                           // bias
+       mov     $cnt,$rounds
+
+.Loop5x_ecb_dec:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q8
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q8
+       aesimc  $dat4,$dat4
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q9
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q9
+       aesimc  $dat4,$dat4
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop5x_ecb_dec
+
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q8
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q8
+       aesimc  $dat4,$dat4
+       cmp     $len,#0x40                              // because .Lecb_tail4x
+       sub     $len,$len,#0x50
+
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q9
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q9
+       aesimc  $dat4,$dat4
+       csel    x6,xzr,$len,gt          // borrow x6, $cnt, "gt" is not typo
+       mov     $key_,$key
+
+       aesd    $dat0,q10
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q10
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q10
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q10
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q10
+       aesimc  $dat4,$dat4
+       add     $inp,$inp,x6                            // $inp is adjusted in such way that
+                                                       // at exit from the loop $dat1-$dat4
+                                                       // are loaded with last "words"
+       add     x6,$len,#0x60                   // because .Lecb_tail4x
+
+       aesd    $dat0,q11
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q11
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q11
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q11
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q11
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q12
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q12
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q13
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q13
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       aesd    $dat3,q14
+       aesimc  $dat3,$dat3
+       aesd    $dat4,q14
+       aesimc  $dat4,$dat4
+
+       aesd    $dat0,q15
+       vld1.8  {$in0},[$inp],#16
+       aesd    $dat1,q15
+       vld1.8  {$in1},[$inp],#16
+       aesd    $dat2,q15
+       vld1.8  {$in2},[$inp],#16
+       aesd    $dat3,q15
+       vld1.8  {$in3},[$inp],#16
+       aesd    $dat4,q15
+       vld1.8  {$in4},[$inp],#16
+       cbz     x6,.Lecb_tail4x
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       veor    $tmp0,$rndlast,$dat0
+       vorr    $dat0,$in0,$in0
+       veor    $tmp1,$rndlast,$dat1
+       vorr    $dat1,$in1,$in1
+       veor    $tmp2,$rndlast,$dat2
+       vorr    $dat2,$in2,$in2
+       veor    $tmp3,$rndlast,$dat3
+       vorr    $dat3,$in3,$in3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat4,$in4,$in4
+       vst1.8  {$tmp1},[$out],#16
+       mov     $cnt,$rounds
+       vst1.8  {$tmp2},[$out],#16
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+       b.hs    .Loop5x_ecb_dec
+
+       add     $len,$len,#0x50
+       cbz     $len,.Lecb_done
+
+       add     $cnt,$rounds,#2
+       subs    $len,$len,#0x30
+       vorr    $dat0,$in2,$in2
+       vorr    $dat1,$in3,$in3
+       vorr    $dat2,$in4,$in4
+       b.lo    .Lecb_dec_tail
+
+       b       .Loop3x_ecb_dec
+
+.align 4
+.Lecb_tail4x:
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       veor    $tmp3,$rndlast,$dat3
+       veor    $tmp4,$rndlast,$dat4
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       vst1.8  {$tmp3},[$out],#16
+       vst1.8  {$tmp4},[$out],#16
+
+       b       .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_dec:
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop3x_ecb_dec
+
+       aesd    $dat0,q8
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       subs    $len,$len,#0x30
+       mov.lo  x6,$len                         // x6, $cnt, is zero at this point
+       aesd    $dat0,q9
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       add     $inp,$inp,x6                    // $inp is adjusted in such way that
+                                               // at exit from the loop $dat1-$dat2
+                                               // are loaded with last "words"
+       mov     $key_,$key
+       aesd    $dat0,q12
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       vld1.8  {$in0},[$inp],#16
+       aesd    $dat0,q13
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       vld1.8  {$in1},[$inp],#16
+       aesd    $dat0,q14
+       aesimc  $dat0,$dat0
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       vld1.8  {$in2},[$inp],#16
+       aesd    $dat0,q15
+       aesd    $dat1,q15
+       aesd    $dat2,q15
+       vld1.32 {q8},[$key_],#16                        // re-pre-load rndkey[0]
+       add     $cnt,$rounds,#2
+       veor    $tmp0,$rndlast,$dat0
+       veor    $tmp1,$rndlast,$dat1
+       veor    $dat2,$dat2,$rndlast
+       vld1.32 {q9},[$key_],#16                        // re-pre-load rndkey[1]
+       vst1.8  {$tmp0},[$out],#16
+       vorr    $dat0,$in0,$in0
+       vst1.8  {$tmp1},[$out],#16
+       vorr    $dat1,$in1,$in1
+       vst1.8  {$dat2},[$out],#16
+       vorr    $dat2,$in2,$in2
+       b.hs    .Loop3x_ecb_dec
+
+       cmn     $len,#0x30
+       b.eq    .Lecb_done
+       nop
+
+.Lecb_dec_tail:
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Lecb_dec_tail
+
+       aesd    $dat1,q8
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q8
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q9
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q9
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q12
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q12
+       aesimc  $dat2,$dat2
+       cmn     $len,#0x20
+       aesd    $dat1,q13
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q13
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q14
+       aesimc  $dat1,$dat1
+       aesd    $dat2,q14
+       aesimc  $dat2,$dat2
+       aesd    $dat1,q15
+       aesd    $dat2,q15
+       b.eq    .Lecb_dec_one
+       veor    $tmp1,$rndlast,$dat1
+       veor    $tmp2,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+       vst1.8  {$tmp2},[$out],#16
+       b       .Lecb_done
+
+.Lecb_dec_one:
+       veor    $tmp1,$rndlast,$dat2
+       vst1.8  {$tmp1},[$out],#16
+
+.Lecb_done:
+___
+}
+$code.=<<___   if ($flavour !~ /64/);
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r8,pc}
+___
+$code.=<<___   if ($flavour =~ /64/);
+       ldr     x29,[sp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+.Lecb_Final_abort:
+       ret
+___
+$code.=<<___;
+.size  ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
 {{{
 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
index b1868e1a6df2288e28181917dfa4340a152a551d..03afd719af0612dd7bd5c6ed8d5c99a78ad104a6 100644 (file)
@@ -89,6 +89,7 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
 #    define HWAES_encrypt aes_v8_encrypt
 #    define HWAES_decrypt aes_v8_decrypt
 #    define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+#    define HWAES_ecb_encrypt aes_v8_ecb_encrypt
 #    define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
 #   endif
 #  endif
@@ -411,6 +412,9 @@ void HWAES_decrypt(const unsigned char *in, unsigned char *out,
 void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
                        size_t length, const AES_KEY *key,
                        unsigned char *ivec, const int enc);
+void HWAES_ecb_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const AES_KEY *key,
+                       const int enc);
 void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
                                 size_t len, const AES_KEY *key,
                                 const unsigned char ivec[16]);
index 1c672d2a465dfef396f955823642f3e0b2ab1a86..e19079912ba24f1876016db92efd475cbeb375b8 100644 (file)
@@ -29,6 +29,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
                           size_t len, const void *key,
                           unsigned char ivec[16], int enc);
 
+typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out,
+                          size_t len, const void *key,
+                          int enc);
+
 typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
                           size_t blocks, const void *key,
                           const unsigned char ivec[16]);
index bc733ee16acda714e4ef17e7602b81010b25647b..e9a7c31f98908251b1c95f9fff7d086d1717a56b 100644 (file)
@@ -29,6 +29,10 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
 # ifdef HWAES_cbc_encrypt
             if (dat->mode == EVP_CIPH_CBC_MODE)
                 dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+# endif
+# ifdef HWAES_ecb_encrypt
+            if (dat->mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
 # endif
         } else
 #endif
@@ -64,6 +68,11 @@ static int cipher_hw_aes_initkey(PROV_CIPHER_CTX *dat,
             dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
         else
 # endif
+# ifdef HWAES_ecb_encrypt
+        if (dat->mode == EVP_CIPH_ECB_MODE)
+            dat->stream.ecb = (ecb128_f)HWAES_ecb_encrypt;
+        else
+# endif
 # ifdef HWAES_ctr32_encrypt_blocks
         if (dat->mode == EVP_CIPH_CTR_MODE)
             dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
index f1c466edc8faa7b5ad714221b0da7f2c5354974f..31062f1bc666ea968f5ff17ecb4e4154174ddedf 100644 (file)
@@ -34,8 +34,13 @@ int cipher_hw_generic_ecb(PROV_CIPHER_CTX *dat, unsigned char *out,
     if (len < bl)
         return 1;
 
-    for (i = 0, len -= bl; i <= len; i += bl)
-        (*dat->block) (in + i, out + i, dat->ks);
+    if (dat->stream.ecb) {
+        (*dat->stream.ecb) (in, out, len, dat->ks, dat->enc);
+    }
+    else {
+        for (i = 0, len -= bl; i <= len; i += bl)
+            (*dat->block) (in + i, out + i, dat->ks);
+    }
 
     return 1;
 }
index 1072b577fe41b79b25f75c84bccbd4b5e0eb5ea0..bf77a4021e85f29276fb2bc7afe654c72b2e8c3a 100644 (file)
@@ -37,6 +37,7 @@ struct prov_cipher_ctx_st {
     union {
         cbc128_f cbc;
         ctr128_f ctr;
+        ecb128_f ecb;
     } stream;
 
     unsigned int mode;