[openssl.git] / crypto / aes / asm / aesv8-armx.pl
index 57e7f5146fad1d28fa4b2636dae965badb31be53..33a2dd53dae1930d350070236d67a84edcc6c90a 100755 (executable)
@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -1751,6 +1751,755 @@ $code.=<<___;
 .size  ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 ___
 }}}
+
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
+my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));
+
+# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
+my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
+my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
+my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
+my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
+my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));
+
+#q_X => qX, for ldp & stp
+my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
+my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));
+
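+# d8-d11 are the callee-saved low halves of v8-v11 (q24-q27 above); they are
+# saved and restored around the 12-block path below.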
+my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
+
+$code.=<<___   if ($flavour =~ /64/);
+.globl ${prefix}_ctr32_encrypt_blocks_unroll12_eor3
+.type  ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
+       AARCH64_VALID_CALL_TARGET
+       // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+       stp             x29,x30,[sp,#-80]!
+       stp             d8,d9,[sp, #16]
+       stp             d10,d11,[sp, #32]
+       stp             d12,d13,[sp, #48]
+       stp             d14,d15,[sp, #64]
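+       // AAPCS64 only requires the low 64 bits of v8-v15 to be preserved,
+       // so d8-d15 are saved together with x29/x30 in the 80-byte frame.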
+       add             x29,sp,#0
+
+        ldr            $rounds,[$key,#240]
+
+        ldr            $ctr, [$ivp, #12]
+#ifdef __AARCH64EB__
+       vld1.8          {$dat0},[$ivp]
+#else
+       vld1.32         {$dat0},[$ivp]
+#endif
+       vld1.32         {$rndping-$rndpang},[$key]              // load key schedule...
+        sub            $rounds,$rounds,#4
+        cmp            $len,#2
+        add            $key_,$key,$roundsx,lsl#4       // pointer to last round key
+        sub            $rounds,$rounds,#2
+        add            $key_, $key_, #64
+       vld1.32         {$rndlast},[$key_]
+        add            $key_,$key,#32
+        mov            $cnt,$rounds
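+       // Keep the 32-bit counter in a general register for plain integer
+       // increments; each per-block value is byte-reversed before being
+       // written into lane 3 of its block.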
+#ifndef __AARCH64EB__
+       rev             $ctr, $ctr
+#endif
+
+       vorr            $dat1,$dat0,$dat0
+        add            $tctr1, $ctr, #1
+       vorr            $dat2,$dat0,$dat0
+        add            $ctr, $ctr, #2
+       vorr            $ivec,$dat0,$dat0
+        rev            $tctr1, $tctr1
+       vmov.32         ${dat1}[3],$tctr1
+       b.ls            .Lctr32_tail_unroll
+        cmp            $len,#6
+        rev            $tctr2, $ctr
+        sub            $len,$len,#3            // bias
+       vmov.32         ${dat2}[3],$tctr2
+       b.lo            .Loop3x_ctr32_unroll
+        cmp            $len,#9
+       vorr            $dat3,$dat0,$dat0
+        add            $tctr3, $ctr, #1
+       vorr            $dat4,$dat0,$dat0
+        add            $tctr4, $ctr, #2
+        rev            $tctr3, $tctr3
+       vorr            $dat5,$dat0,$dat0
+        add            $ctr, $ctr, #3
+        rev            $tctr4, $tctr4
+       vmov.32         ${dat3}[3],$tctr3
+        rev            $tctr5, $ctr
+       vmov.32         ${dat4}[3],$tctr4
+       vmov.32         ${dat5}[3],$tctr5
+        sub            $len,$len,#3
+       b.lo            .Loop6x_ctr32_unroll
+
+       // push regs to stack when 12 data chunks are interleaved
+        stp            x19,x20,[sp,#-16]!
+        stp            x21,x22,[sp,#-16]!
+        stp            x23,x24,[sp,#-16]!
+        stp            $dat8d,$dat9d,[sp,#-32]!
+        stp            $dat10d,$dat11d,[sp,#-32]!
+
+        add            $tctr6,$ctr,#1
+        add            $tctr7,$ctr,#2
+        add            $tctr8,$ctr,#3
+        add            $tctr9,$ctr,#4
+        add            $tctr10,$ctr,#5
+        add            $ctr,$ctr,#6
+       vorr            $dat6,$dat0,$dat0
+        rev            $tctr6,$tctr6
+       vorr            $dat7,$dat0,$dat0
+        rev            $tctr7,$tctr7
+       vorr            $dat8,$dat0,$dat0
+        rev            $tctr8,$tctr8
+       vorr            $dat9,$dat0,$dat0
+        rev            $tctr9,$tctr9
+       vorr            $dat10,$dat0,$dat0
+        rev            $tctr10,$tctr10
+       vorr            $dat11,$dat0,$dat0
+        rev            $tctr11,$ctr
+
+        sub            $len,$len,#6            // bias
+       vmov.32         ${dat6}[3],$tctr6
+       vmov.32         ${dat7}[3],$tctr7
+       vmov.32         ${dat8}[3],$tctr8
+       vmov.32         ${dat9}[3],$tctr9
+       vmov.32         ${dat10}[3],$tctr10
+       vmov.32         ${dat11}[3],$tctr11
+       b               .Loop12x_ctr32_unroll
+
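+       // Main loop: twelve blocks are processed in parallel, two AES rounds
+       // per iteration; the remaining rounds are peeled below so that the
+       // last AddRoundKey and the XOR with the input can be fused into eor3.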
+.align 4
+.Loop12x_ctr32_unroll:
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndping
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndping
+       aesmc           $dat7,$dat7
+       aese            $dat8,$rndping
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndping
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndping
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndping
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndping},[$key_],#16
+       subs            $cnt,$cnt,#2
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndpang
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndpang
+       aesmc           $dat7,$dat7
+       aese            $dat8,$rndpang
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndpang
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndpang
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndpang
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndpang},[$key_],#16
+       b.gt            .Loop12x_ctr32_unroll
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndping
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndping
+       aesmc           $dat7,$dat7
+       aese            $dat8,$rndping
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndping
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndping
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndping
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndping},[$key_],#16
+
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndpang
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndpang
+       aesmc           $dat7,$dat7
+       aese            $dat8,$rndpang
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndpang
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndpang
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndpang
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndpang},[$key_],#16
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+        add            $tctr0,$ctr,#1
+        add            $tctr1,$ctr,#2
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+        add            $tctr2,$ctr,#3
+        add            $tctr3,$ctr,#4
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+        add            $tctr4,$ctr,#5
+        add            $tctr5,$ctr,#6
+        rev            $tctr0,$tctr0
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+        add            $tctr6,$ctr,#7
+        add            $tctr7,$ctr,#8
+        rev            $tctr1,$tctr1
+        rev            $tctr2,$tctr2
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+        add            $tctr8,$ctr,#9
+        add            $tctr9,$ctr,#10
+        rev            $tctr3,$tctr3
+        rev            $tctr4,$tctr4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+        add            $tctr10,$ctr,#11
+        add            $tctr11,$ctr,#12
+        rev            $tctr5,$tctr5
+        rev            $tctr6,$tctr6
+       aese            $dat6,$rndping
+       aesmc           $dat6,$dat6
+        rev            $tctr7,$tctr7
+        rev            $tctr8,$tctr8
+       aese            $dat7,$rndping
+       aesmc           $dat7,$dat7
+        rev            $tctr9,$tctr9
+        rev            $tctr10,$tctr10
+       aese            $dat8,$rndping
+       aesmc           $dat8,$dat8
+        rev            $tctr11,$tctr11
+       aese            $dat9,$rndping
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndping
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndping
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndping},[$key_],#16
+
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       vld1.8          {$in0,$in1,$in2,$in3},[$inp],#64
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndpang
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndpang
+       aesmc           $dat7,$dat7
+       vld1.8          {$in4,$in5,$in6,$in7},[$inp],#64
+       aese            $dat8,$rndpang
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndpang
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndpang
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndpang
+       aesmc           $dat11,$dat11
+       vld1.8          {$in8,$in9,$in10,$in11},[$inp],#64
+       vld1.32         {$rndpang},[$key_],#16
+
+        mov            $key_, $key
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       aese            $dat6,$rndping
+       aesmc           $dat6,$dat6
+       aese            $dat7,$rndping
+       aesmc           $dat7,$dat7
+       aese            $dat8,$rndping
+       aesmc           $dat8,$dat8
+       aese            $dat9,$rndping
+       aesmc           $dat9,$dat9
+       aese            $dat10,$rndping
+       aesmc           $dat10,$dat10
+       aese            $dat11,$rndping
+       aesmc           $dat11,$dat11
+       vld1.32         {$rndping},[$key_],#16  // re-pre-load rndkey[0]
+
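+       // Last round: aese adds the penultimate round key and performs
+       // SubBytes/ShiftRows; eor3 then folds the final round key and the
+       // input block into the result with a single SHA3-extension instruction.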
+       aese            $dat0,$rndpang
+        eor3           $in0,$in0,$rndlast,$dat0
+       vorr            $dat0,$ivec,$ivec
+       aese            $dat1,$rndpang
+        eor3           $in1,$in1,$rndlast,$dat1
+       vorr            $dat1,$ivec,$ivec
+       aese            $dat2,$rndpang
+        eor3           $in2,$in2,$rndlast,$dat2
+       vorr            $dat2,$ivec,$ivec
+       aese            $dat3,$rndpang
+        eor3           $in3,$in3,$rndlast,$dat3
+       vorr            $dat3,$ivec,$ivec
+       aese            $dat4,$rndpang
+        eor3           $in4,$in4,$rndlast,$dat4
+       vorr            $dat4,$ivec,$ivec
+       aese            $dat5,$rndpang
+        eor3           $in5,$in5,$rndlast,$dat5
+       vorr            $dat5,$ivec,$ivec
+       aese            $dat6,$rndpang
+        eor3           $in6,$in6,$rndlast,$dat6
+       vorr            $dat6,$ivec,$ivec
+       aese            $dat7,$rndpang
+        eor3           $in7,$in7,$rndlast,$dat7
+       vorr            $dat7,$ivec,$ivec
+       aese            $dat8,$rndpang
+        eor3           $in8,$in8,$rndlast,$dat8
+       vorr            $dat8,$ivec,$ivec
+       aese            $dat9,$rndpang
+        eor3           $in9,$in9,$rndlast,$dat9
+       vorr            $dat9,$ivec,$ivec
+       aese            $dat10,$rndpang
+        eor3           $in10,$in10,$rndlast,$dat10
+       vorr            $dat10,$ivec,$ivec
+       aese            $dat11,$rndpang
+        eor3           $in11,$in11,$rndlast,$dat11
+       vorr            $dat11,$ivec,$ivec
+       vld1.32         {$rndpang},[$key_],#16  // re-pre-load rndkey[1]
+
+       vmov.32         ${dat0}[3],$tctr0
+       vmov.32         ${dat1}[3],$tctr1
+       vmov.32         ${dat2}[3],$tctr2
+       vmov.32         ${dat3}[3],$tctr3
+       vst1.8          {$in0,$in1,$in2,$in3},[$out],#64
+       vmov.32         ${dat4}[3],$tctr4
+       vmov.32         ${dat5}[3],$tctr5
+       vmov.32         ${dat6}[3],$tctr6
+       vmov.32         ${dat7}[3],$tctr7
+       vst1.8          {$in4,$in5,$in6,$in7},[$out],#64
+       vmov.32         ${dat8}[3],$tctr8
+       vmov.32         ${dat9}[3],$tctr9
+       vmov.32         ${dat10}[3],$tctr10
+       vmov.32         ${dat11}[3],$tctr11
+       vst1.8          {$in8,$in9,$in10,$in11},[$out],#64
+
+        mov            $cnt,$rounds
+
+        add            $ctr,$ctr,#12
+       subs            $len,$len,#12
+       b.hs            .Loop12x_ctr32_unroll
+
+       // pop regs from stack when 12 data chunks are interleaved
+        ldp            $dat10d,$dat11d,[sp],#32
+        ldp            $dat8d,$dat9d,[sp],#32
+        ldp            x23,x24,[sp],#16
+        ldp            x21,x22,[sp],#16
+        ldp            x19,x20,[sp],#16
+
+        add            $len,$len,#12
+        cbz            $len,.Lctr32_done_unroll
+        sub            $ctr,$ctr,#12
+
+        cmp            $len,#2
+       b.ls            .Lctr32_tail_unroll
+
+        cmp            $len,#6
+        sub            $len,$len,#3            // bias
+        add            $ctr,$ctr,#3
+       b.lo            .Loop3x_ctr32_unroll
+
+        sub            $len,$len,#3
+        add            $ctr,$ctr,#3
+       b.lo            .Loop6x_ctr32_unroll
+
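+       // Six-block pass: same two-rounds-per-iteration structure as above;
+       // any remainder is handed to the three-block and tail code.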
+.align 4
+.Loop6x_ctr32_unroll:
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndping},[$key_],#16
+       subs            $cnt,$cnt,#2
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndpang},[$key_],#16
+       b.gt            .Loop6x_ctr32_unroll
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndping},[$key_],#16
+
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndpang},[$key_],#16
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+        add            $tctr0,$ctr,#1
+        add            $tctr1,$ctr,#2
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+        add            $tctr2,$ctr,#3
+        add            $tctr3,$ctr,#4
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+        add            $tctr4,$ctr,#5
+        add            $tctr5,$ctr,#6
+        rev            $tctr0,$tctr0
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+        rev            $tctr1,$tctr1
+        rev            $tctr2,$tctr2
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+        rev            $tctr3,$tctr3
+        rev            $tctr4,$tctr4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+        rev            $tctr5,$tctr5
+       vld1.32         {$rndping},[$key_],#16
+
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       vld1.8          {$in0,$in1,$in2,$in3},[$inp],#64
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndpang
+       aesmc           $dat3,$dat3
+       vld1.8          {$in4,$in5},[$inp],#32
+       aese            $dat4,$rndpang
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndpang
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndpang},[$key_],#16
+
+        mov            $key_, $key
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       aese            $dat3,$rndping
+       aesmc           $dat3,$dat3
+       aese            $dat4,$rndping
+       aesmc           $dat4,$dat4
+       aese            $dat5,$rndping
+       aesmc           $dat5,$dat5
+       vld1.32         {$rndping},[$key_],#16  // re-pre-load rndkey[0]
+
+       aese            $dat0,$rndpang
+        eor3           $in0,$in0,$rndlast,$dat0
+       aese            $dat1,$rndpang
+        eor3           $in1,$in1,$rndlast,$dat1
+       aese            $dat2,$rndpang
+        eor3           $in2,$in2,$rndlast,$dat2
+       aese            $dat3,$rndpang
+        eor3           $in3,$in3,$rndlast,$dat3
+       aese            $dat4,$rndpang
+        eor3           $in4,$in4,$rndlast,$dat4
+       aese            $dat5,$rndpang
+        eor3           $in5,$in5,$rndlast,$dat5
+       vld1.32         {$rndpang},[$key_],#16  // re-pre-load rndkey[1]
+
+       vorr            $dat0,$ivec,$ivec
+       vorr            $dat1,$ivec,$ivec
+       vorr            $dat2,$ivec,$ivec
+       vorr            $dat3,$ivec,$ivec
+       vorr            $dat4,$ivec,$ivec
+       vorr            $dat5,$ivec,$ivec
+
+       vmov.32         ${dat0}[3],$tctr0
+       vmov.32         ${dat1}[3],$tctr1
+       vst1.8          {$in0,$in1,$in2,$in3},[$out],#64
+       vmov.32         ${dat2}[3],$tctr2
+       vmov.32         ${dat3}[3],$tctr3
+       vst1.8          {$in4,$in5},[$out],#32
+       vmov.32         ${dat4}[3],$tctr4
+       vmov.32         ${dat5}[3],$tctr5
+
+        cbz            $len,.Lctr32_done_unroll
+        mov            $cnt,$rounds
+
+        cmp            $len,#2
+       b.ls            .Lctr32_tail_unroll
+
+        sub            $len,$len,#3            // bias
+        add            $ctr,$ctr,#3
+        b              .Loop3x_ctr32_unroll
+
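+       // Three-block pass, entered both for short inputs and as the tail of
+       // the larger passes; at most two blocks remain afterwards.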
+.align 4
+.Loop3x_ctr32_unroll:
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       vld1.32         {$rndping},[$key_],#16
+       subs            $cnt,$cnt,#2
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       aese            $dat2,$rndpang
+       aesmc           $dat2,$dat2
+       vld1.32         {$rndpang},[$key_],#16
+       b.gt            .Loop3x_ctr32_unroll
+
+       aese            $dat0,$rndping
+       aesmc           $tmp0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $tmp1,$dat1
+       vld1.8          {$in0,$in1,$in2},[$inp],#48
+       vorr            $dat0,$ivec,$ivec
+       aese            $dat2,$rndping
+       aesmc           $dat2,$dat2
+       vld1.32         {$rndping},[$key_],#16
+       vorr            $dat1,$ivec,$ivec
+       aese            $tmp0,$rndpang
+       aesmc           $tmp0,$tmp0
+       aese            $tmp1,$rndpang
+       aesmc           $tmp1,$tmp1
+       aese            $dat2,$rndpang
+       aesmc           $tmp2,$dat2
+       vld1.32         {$rndpang},[$key_],#16
+       vorr            $dat2,$ivec,$ivec
+        add            $tctr0,$ctr,#1
+       aese            $tmp0,$rndping
+       aesmc           $tmp0,$tmp0
+       aese            $tmp1,$rndping
+       aesmc           $tmp1,$tmp1
+        add            $tctr1,$ctr,#2
+       aese            $tmp2,$rndping
+       aesmc           $tmp2,$tmp2
+       vld1.32         {$rndping},[$key_],#16
+        add            $ctr,$ctr,#3
+       aese            $tmp0,$rndpang
+       aesmc           $tmp0,$tmp0
+       aese            $tmp1,$rndpang
+       aesmc           $tmp1,$tmp1
+
+        rev            $tctr0,$tctr0
+       aese            $tmp2,$rndpang
+       aesmc           $tmp2,$tmp2
+       vld1.32         {$rndpang},[$key_],#16
+       vmov.32         ${dat0}[3], $tctr0
+        mov            $key_,$key
+        rev            $tctr1,$tctr1
+       aese            $tmp0,$rndping
+       aesmc           $tmp0,$tmp0
+
+       aese            $tmp1,$rndping
+       aesmc           $tmp1,$tmp1
+       vmov.32         ${dat1}[3], $tctr1
+        rev            $tctr2,$ctr
+       aese            $tmp2,$rndping
+       aesmc           $tmp2,$tmp2
+       vmov.32         ${dat2}[3], $tctr2
+
+       aese            $tmp0,$rndpang
+       aese            $tmp1,$rndpang
+       aese            $tmp2,$rndpang
+
+        eor3           $in0,$in0,$rndlast,$tmp0
+       vld1.32         {$rndping},[$key_],#16  // re-pre-load rndkey[0]
+        eor3           $in1,$in1,$rndlast,$tmp1
+        mov            $cnt,$rounds
+        eor3           $in2,$in2,$rndlast,$tmp2
+       vld1.32         {$rndpang},[$key_],#16  // re-pre-load rndkey[1]
+       vst1.8          {$in0,$in1,$in2},[$out],#48
+
+        cbz            $len,.Lctr32_done_unroll
+
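+       // At most two blocks remain; finish them here.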
+.Lctr32_tail_unroll:
+        cmp            $len,#1
+       b.eq            .Lctr32_tail_1_unroll
+
+.Lctr32_tail_2_unroll:
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndping},[$key_],#16
+       subs            $cnt,$cnt,#2
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndpang},[$key_],#16
+       b.gt            .Lctr32_tail_2_unroll
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndping},[$key_],#16
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndpang},[$key_],#16
+       vld1.8          {$in0,$in1},[$inp],#32
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndping},[$key_],#16
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndpang
+       aesmc           $dat1,$dat1
+       vld1.32         {$rndpang},[$key_],#16
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat1,$rndping
+       aesmc           $dat1,$dat1
+       aese            $dat0,$rndpang
+       aese            $dat1,$rndpang
+
+        eor3           $in0,$in0,$rndlast,$dat0
+        eor3           $in1,$in1,$rndlast,$dat1
+       vst1.8          {$in0,$in1},[$out],#32
+        b              .Lctr32_done_unroll
+
+.Lctr32_tail_1_unroll:
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndping},[$key_],#16
+       subs            $cnt,$cnt,#2
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndpang},[$key_],#16
+       b.gt            .Lctr32_tail_1_unroll
+
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndping},[$key_],#16
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndpang},[$key_],#16
+       vld1.8          {$in0},[$inp]
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndping},[$key_],#16
+       aese            $dat0,$rndpang
+       aesmc           $dat0,$dat0
+       vld1.32         {$rndpang},[$key_],#16
+       aese            $dat0,$rndping
+       aesmc           $dat0,$dat0
+       aese            $dat0,$rndpang
+
+        eor3           $in0,$in0,$rndlast,$dat0
+       vst1.8          {$in0},[$out],#16
+
+.Lctr32_done_unroll:
+       ldp             d8,d9,[sp, #16]
+       ldp             d10,d11,[sp, #32]
+       ldp             d12,d13,[sp, #48]
+       ldp             d14,d15,[sp, #64]
+       ldr             x29,[sp],#80
+       ret
+.size  ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
+___
+}}}
+
 {{{
 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 my ($rounds,$cnt,$key_)=("w5","w6","x7");
@@ -1809,6 +2558,21 @@ $code.=<<___;
 #ifndef __ARMEB__
        rev             $ctr, $ctr
 #endif
+___
+$code.=<<___   if ($flavour =~ /64/);
+       vorr            $dat1,$dat0,$dat0
+       add             $tctr1, $ctr, #1
+       vorr            $dat2,$dat0,$dat0
+       add             $ctr, $ctr, #2
+       vorr            $ivec,$dat0,$dat0
+       rev             $tctr1, $tctr1
+       vmov.32         ${dat1}[3],$tctr1
+       b.ls            .Lctr32_tail
+       rev             $tctr2, $ctr
+       sub             $len,$len,#3            // bias
+       vmov.32         ${dat2}[3],$tctr2
+___
+$code.=<<___   if ($flavour !~ /64/);
        add             $tctr1, $ctr, #1
        vorr            $ivec,$dat0,$dat0
        rev             $tctr1, $tctr1
@@ -2015,11 +2779,25 @@ $code.=<<___;
        aese            $dat1,q8
        aesmc           $tmp1,$dat1
         vld1.8         {$in0},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+        vorr           $dat0,$ivec,$ivec
+___
+$code.=<<___   if ($flavour !~ /64/);
         add            $tctr0,$ctr,#1
+___
+$code.=<<___;
        aese            $dat2,q8
        aesmc           $dat2,$dat2
         vld1.8         {$in1},[$inp],#16
+___
+$code.=<<___   if ($flavour =~ /64/);
+        vorr           $dat1,$ivec,$ivec
+___
+$code.=<<___   if ($flavour !~ /64/);
         rev            $tctr0,$tctr0
+___
+$code.=<<___;
        aese            $tmp0,q9
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q9
@@ -2028,6 +2806,12 @@ $code.=<<___;
         mov            $key_,$key
        aese            $dat2,q9
        aesmc           $tmp2,$dat2
+___
+$code.=<<___   if ($flavour =~ /64/);
+        vorr           $dat2,$ivec,$ivec
+        add            $tctr0,$ctr,#1
+___
+$code.=<<___;
        aese            $tmp0,q12
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q12
@@ -2043,22 +2827,47 @@ $code.=<<___;
        aese            $tmp1,q13
        aesmc           $tmp1,$tmp1
         veor           $in2,$in2,$rndlast
+___
+$code.=<<___   if ($flavour =~ /64/);
+        rev            $tctr0,$tctr0
+       aese            $tmp2,q13
+       aesmc           $tmp2,$tmp2
+        vmov.32        ${dat0}[3], $tctr0
+___
+$code.=<<___   if ($flavour !~ /64/);
         vmov.32        ${ivec}[3], $tctr0
        aese            $tmp2,q13
        aesmc           $tmp2,$tmp2
         vorr           $dat0,$ivec,$ivec
+___
+$code.=<<___;
         rev            $tctr1,$tctr1
        aese            $tmp0,q14
        aesmc           $tmp0,$tmp0
+___
+$code.=<<___   if ($flavour !~ /64/);
         vmov.32        ${ivec}[3], $tctr1
         rev            $tctr2,$ctr
+___
+$code.=<<___;
        aese            $tmp1,q14
        aesmc           $tmp1,$tmp1
+___
+$code.=<<___   if ($flavour =~ /64/);
+        vmov.32        ${dat1}[3], $tctr1
+        rev            $tctr2,$ctr
+       aese            $tmp2,q14
+       aesmc           $tmp2,$tmp2
+        vmov.32        ${dat2}[3], $tctr2
+___
+$code.=<<___   if ($flavour !~ /64/);
         vorr           $dat1,$ivec,$ivec
         vmov.32        ${ivec}[3], $tctr2
        aese            $tmp2,q14
        aesmc           $tmp2,$tmp2
         vorr           $dat2,$ivec,$ivec
+___
+$code.=<<___;
         subs           $len,$len,#3
        aese            $tmp0,q15
        aese            $tmp1,q15
@@ -2214,10 +3023,10 @@ $code.=<<___    if ($flavour =~ /64/);
        b.ne    .Lxts_enc_big_size
        // Encrypt the iv with key2, as the first XEX iv.
        ldr     $rounds,[$key2,#240]
-       vld1. {$dat},[$key2],#16
+       vld1.32 {$dat},[$key2],#16
        vld1.8  {$iv0},[$ivp]
        sub     $rounds,$rounds,#2
-       vld1. {$dat1},[$key2],#16
+       vld1.32 {$dat1},[$key2],#16
 
 .Loop_enc_iv_enc:
        aese    $iv0,$dat
@@ -2819,9 +3628,9 @@ $code.=<<___      if ($flavour =~ /64/);
 
        // Encrypt the composite block to get the last second encrypted text block
        ldr     $rounds,[$key1,#240]            // load key schedule...
-       vld1. {$dat},[$key1],#16
+       vld1.32 {$dat},[$key1],#16
        sub     $rounds,$rounds,#2
-       vld1. {$dat1},[$key1],#16             // load key schedule...
+       vld1.32 {$dat1},[$key1],#16             // load key schedule...
 .Loop_final_enc:
        aese    $tmpin,$dat0
        aesmc   $tmpin,$tmpin
@@ -2891,10 +3700,10 @@ $code.=<<___    if ($flavour =~ /64/);
        b.ne    .Lxts_dec_big_size
        // Encrypt the iv with key2, as the first XEX iv.
        ldr     $rounds,[$key2,#240]
-       vld1. {$dat},[$key2],#16
+       vld1.32 {$dat},[$key2],#16
        vld1.8  {$iv0},[$ivp]
        sub     $rounds,$rounds,#2
-       vld1. {$dat1},[$key2],#16
+       vld1.32 {$dat1},[$key2],#16
 
 .Loop_dec_small_iv_enc:
        aese    $iv0,$dat
@@ -2974,10 +3783,10 @@ $code.=<<___    if ($flavour =~ /64/);
 
        // Encrypt the iv with key2, as the first XEX iv
        ldr     $rounds,[$key2,#240]
-       vld1. {$dat},[$key2],#16
+       vld1.32 {$dat},[$key2],#16
        vld1.8  {$iv0},[$ivp]
        sub     $rounds,$rounds,#2
-       vld1. {$dat1},[$key2],#16
+       vld1.32 {$dat1},[$key2],#16
 
 .Loop_dec_iv_enc:
        aese    $iv0,$dat
@@ -3307,7 +4116,7 @@ $code.=<<___      if ($flavour =~ /64/);
 .align 4
 .Lxts_dec_tail4x:
        add     $inp,$inp,#16
-       vld1.32 {$dat0},[$inp],#16
+       tst     $tailcnt,#0xf
        veor    $tmp1,$dat1,$tmp0
        vst1.8  {$tmp1},[$out],#16
        veor    $tmp2,$dat2,$tmp2
@@ -3316,6 +4125,8 @@ $code.=<<___      if ($flavour =~ /64/);
        veor    $tmp4,$dat4,$tmp4
        vst1.8  {$tmp3-$tmp4},[$out],#32
 
+       b.eq    .Lxts_dec_abort
+       vld1.8  {$dat0},[$inp],#16
        b       .Lxts_done
 .align 4
 .Lxts_outer_dec_tail:
@@ -3493,7 +4304,7 @@ $code.=<<___      if ($flavour =~ /64/);
        // Processing the last two blocks with cipher stealing.
        mov     x7,x3
        cbnz    x2,.Lxts_dec_1st_done
-       vld1.32 {$dat0},[$inp],#16
+       vld1.8  {$dat0},[$inp],#16
 
        // Decrypt the last second block to get the last plain text block
 .Lxts_dec_1st_done:
@@ -3538,9 +4349,9 @@ $code.=<<___      if ($flavour =~ /64/);
 
        // Decrypt the composite block to get the last second plain text block
        ldr     $rounds,[$key_,#240]
-       vld1. {$dat},[$key_],#16
+       vld1.32 {$dat},[$key_],#16
        sub     $rounds,$rounds,#2
-       vld1. {$dat1},[$key_],#16
+       vld1.32 {$dat1},[$key_],#16
 .Loop_final_dec:
        aesd    $tmpin,$dat0
        aesimc  $tmpin,$tmpin
@@ -3578,7 +4389,8 @@ ___
 if ($flavour =~ /64/) {                        ######## 64-bit code
     my %opcode = (
        "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
-       "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
+       "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800,
+       "eor3"  =>      0xce000000,     );
 
     local *unaes = sub {
        my ($mnemonic,$arg)=@_;
@@ -3589,10 +4401,21 @@ if ($flavour =~ /64/) {                 ######## 64-bit code
                        $mnemonic,$arg;
     };
 
+    sub unsha3 {
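+                # eor3 belongs to the SHA3 extension and may not be accepted
+                # by older assemblers, so emit it as a raw .inst word:
+                # 0xce000000 | Rd | Rn<<5 | Ra<<10 | Rm<<16 for
+                # "eor3 Vd.16B,Vn.16B,Vm.16B,Va.16B".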
+                my ($mnemonic,$arg)=@_;
+
+                $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
+                &&
+                sprintf ".inst\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
+                       $mnemonic,$arg;
+    }
+
     foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;
 
-       s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
+       s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;    # old->new registers
+       s/\bq_([0-9]+)\b/"q".$1/geo;    # old->new registers
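+       # q24-q31 now land on the callee-saved v8-v15 range, which is why the
+       # unroll12 code above spills d8-d15 on entry.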
        s/@\s/\/\//o;                   # old->new style commentary
 
        #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
@@ -3605,6 +4428,7 @@ if ($flavour =~ /64/) {                   ######## 64-bit code
        s/vshr/ushr/o           or
        s/^(\s+)v/$1/o          or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;
+       s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;
 
        # fix up remaining legacy suffixes
        s/\.[ui]?8//o;