aes-gcm-armv8_64 asm support big-endian
author JerryDevis <hisec1@huawei.com>
Sat, 11 Mar 2023 10:35:23 +0000 (18:35 +0800)
committer Tomas Mraz <tomas@openssl.org>
Tue, 9 May 2023 14:21:04 +0000 (16:21 +0200)
Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/20489)

(cherry picked from commit 32344a74b7ee2693a5bfda361c40ec60ab5be624)

crypto/modes/asm/aes-gcm-armv8_64.pl

index ff5809ec22df662f2cf84b355c6eee8007c599c6..b5e76914270248d87c9f44331199b61601097b66 100755 (executable)
@@ -217,6 +217,7 @@ my $mod_constant="v8";
 my $mod_t="v31";
 
 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
 my $rk2q1="v20.1q";
 my $rk3q1="v21.1q";
@@ -269,28 +270,36 @@ aes_gcm_enc_128_kernel:
        stp     d14, d15, [sp, #96]
 
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
        ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
-
+#ifdef __AARCH64EB__
+       ror     $rk10_l, $rk10_l, #32
+       ror     $rk10_h, $rk10_h, #32
+#endif
        ld1     {$acc_lb}, [$current_tag]
        ext     $acc_lb, $acc_lb, $acc_lb, #8
        rev64   $acc_lb, $acc_lb
        lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
        mov     $len, $main_end_input_ptr
 
-       ldr     $rk9q, [$cc, #144]                                @ load rk9
+       ld1     {$rk0s}, [$cc], #16                                                               @ load rk0
        add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
        sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
 
        lsr     $rctr32x, $ctr96_t32x, #32
        ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
-
+#endif
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
        rev     $rctr32w, $rctr32w                                @ rev_ctr32
 
        add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
        orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
-       ldr     $rk0q, [$cc, #0]                                  @ load rk0
+       ld1     {$rk1s}, [$cc], #16                                                               @ load rk1
 
        rev     $ctr32w, $rctr32w                                 @ CTR block 1
        add     $rctr32w, $rctr32w, #1                            @ CTR block 1
@@ -310,45 +319,50 @@ aes_gcm_enc_128_kernel:
        rev     $ctr32w, $rctr32w                                 @ CTR block 3
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
-       ldr     $rk1q, [$cc, #16]                                 @ load rk1
+       ld1     {$rk2s}, [$cc], #16                                                               @ load rk2
 
        add     $rctr32w, $rctr32w, #1                            @ CTR block 3
        fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
 
        ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
-
+#endif
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
-       ldr     $rk2q, [$cc, #32]                                 @ load rk2
+       ld1     {$rk3s}, [$cc], #16                                                               @ load rk3
 
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
        ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
+#endif
 
        aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
-       ldr     $rk8q, [$cc, #128]                                @ load rk8
+       ld1     {$rk4s}, [$cc], #16                                                               @ load rk4
 
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
-       ldr     $rk3q, [$cc, #48]                                 @ load rk3
+       ld1     {$rk5s}, [$cc], #16                                                               @ load rk5
 
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
        trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
 
        aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
-       ldr     $rk6q, [$cc, #96]                                 @ load rk6
+       ld1     {$rk6s}, [$cc], #16                                                               @ load rk6
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
-       ldr     $rk7q, [$cc, #112]                                @ load rk7
+       ld1     {$rk7s}, [$cc], #16                                                               @ load rk7
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
        trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
-       ldr     $rk5q, [$cc, #80]                                 @ load rk5
+       ld1     {$rk8s}, [$cc], #16                                                               @ load rk8
 
        aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
        ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
+#endif
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
 
@@ -360,7 +374,7 @@ aes_gcm_enc_128_kernel:
        aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
 
        aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
-       ldr     $rk4q, [$cc, #64]                                 @ load rk4
+       ld1     {$rk9s}, [$cc], #16                                                               @ load rk9
 
        aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
 
@@ -422,13 +436,25 @@ aes_gcm_enc_128_kernel:
        b.ge    .L128_enc_tail                                    @ handle tail
 
        ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        eor     $input_l0, $input_l0, $rk10_l                     @ AES block 0 - round 10 low
        eor     $input_h0, $input_h0, $rk10_h                     @ AES block 0 - round 10 high
 
@@ -493,6 +519,10 @@ aes_gcm_enc_128_kernel:
 
        .L128_enc_main_loop:                                      @ main loop start
        ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
        rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
 
@@ -521,7 +551,10 @@ aes_gcm_enc_128_kernel:
        pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
        eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
        ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
        rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
 
@@ -591,13 +624,19 @@ aes_gcm_enc_128_kernel:
 
        aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
        ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
        eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
 
        aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
        ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
        eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
 
@@ -854,7 +893,10 @@ aes_gcm_enc_128_kernel:
 
        sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
        ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        cmp     $main_end_input_ptr, #48
 
        ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
@@ -892,7 +934,10 @@ aes_gcm_enc_128_kernel:
        st1     { $res1b}, [$output_ptr], #16                     @ AES final-3 block  - store result
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        rev64   $res0b, $res1b                                    @ GHASH final-3 block
 
        eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
@@ -921,7 +966,10 @@ aes_gcm_enc_128_kernel:
 
        rev64   $res0b, $res1b                                    @ GHASH final-2 block
        ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
 
        eor     $input_l0, $input_l0, $rk10_l                     @ AES final-1 block - round 10 low
@@ -955,7 +1003,10 @@ aes_gcm_enc_128_kernel:
 
        rev64   $res0b, $res1b                                    @ GHASH final-1 block
        ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
 
        eor     $input_h0, $input_h0, $rk10_h                     @ AES final block - round 10 high
@@ -1018,9 +1069,11 @@ aes_gcm_enc_128_kernel:
        ld1     { $rk0}, [$output_ptr]                            @ load existing bytes where the possibly partial last block is to be stored
 
        eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
-
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
-
+#else
+       mov     $ctr32w, $rctr32w
+#endif
        pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
 
        pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
@@ -1105,20 +1158,29 @@ aes_gcm_dec_128_kernel:
        lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
        mov     $len, $main_end_input_ptr
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
+       ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
+#ifdef __AARCH64EB__
+       ror     $rk10_h, $rk10_h, 32
+       ror     $rk10_l, $rk10_l, 32
+#endif
        sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
-       ldr     $rk0q, [$cc, #0]                                  @ load rk0
+       ld1     {$rk0s}, [$cc], #16                                @ load rk0
 
        and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
        ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
 
        ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
-
+#endif
        lsr     $rctr32x, $ctr96_t32x, #32
        fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
 
-       ldr     $rk1q, [$cc, #16]                                 @ load rk1
+       ld1     {$rk1s}, [$cc], #16                                @ load rk1
        orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
        rev     $rctr32w, $rctr32w                                @ rev_ctr32
 
@@ -1129,7 +1191,7 @@ aes_gcm_dec_128_kernel:
        rev     $ctr32w, $rctr32w                                 @ CTR block 1
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
-       ldr     $rk2q, [$cc, #32]                                 @ load rk2
+       ld1     {$rk2s}, [$cc], #16                                @ load rk2
        add     $rctr32w, $rctr32w, #1                            @ CTR block 1
 
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
@@ -1150,23 +1212,22 @@ aes_gcm_dec_128_kernel:
        add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
-       ldr     $rk3q, [$cc, #48]                                 @ load rk3
+       ld1     {$rk3s}, [$cc], #16                                @ load rk3
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
-       ldr     $rk6q, [$cc, #96]                                 @ load rk6
+       ld1     {$rk4s}, [$cc], #16                                @ load rk4
 
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
-       ldr     $rk7q, [$cc, #112]                                @ load rk7
+       ld1     {$rk5s}, [$cc], #16                                @ load rk5
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
-       ldr     $rk4q, [$cc, #64]                                 @ load rk4
+       ld1     {$rk6s}, [$cc], #16                                @ load rk6
 
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
 
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
 
        aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
-       ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
        ld1     { $acc_lb}, [$current_tag]
@@ -1174,14 +1235,14 @@ aes_gcm_dec_128_kernel:
        rev64   $acc_lb, $acc_lb
 
        aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
-       ldr     $rk5q, [$cc, #80]                                 @ load rk5
+       ld1     {$rk7s}, [$cc], #16                                @ load rk7
 
        aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
 
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
-       ldr     $rk9q, [$cc, #144]                                @ load rk9
+       ld1     {$rk8s}, [$cc], #16                                @ load rk8
 
        aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
 
@@ -1189,10 +1250,11 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
        ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
-
+#endif
        aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
-       ldr     $rk8q, [$cc, #128]                                @ load rk8
+       ld1     {$rk9s}, [$cc], #16                                @ load rk9
 
        aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
 
@@ -1204,8 +1266,9 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
        ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
-
+#endif
        aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
 
        aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
@@ -1218,7 +1281,9 @@ aes_gcm_dec_128_kernel:
        trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
 
        ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
+#endif
        trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
        add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
 
@@ -1252,12 +1317,10 @@ aes_gcm_dec_128_kernel:
        eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
        b.ge    .L128_dec_tail                                    @ handle tail
 
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 1 - load ciphertext
-
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 0 - load ciphertext
+       ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
 
        eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 2 - load ciphertext
+       ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
 
        eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
        rev64   $res0b, $res0b                                    @ GHASH block 0
@@ -1265,10 +1328,9 @@ aes_gcm_dec_128_kernel:
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 3 - load ciphertext
+       ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
 
        rev64   $res1b, $res1b                                    @ GHASH block 1
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
        mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
 
        mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
@@ -1283,7 +1345,9 @@ aes_gcm_dec_128_kernel:
        fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
        rev     $ctr32w, $rctr32w                                 @ CTR block 5
        eor     $output_l1, $output_l1, $rk10_l                   @ AES block 1 - round 10 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
        add     $rctr32w, $rctr32w, #1                            @ CTR block 5
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
@@ -1295,10 +1359,19 @@ aes_gcm_dec_128_kernel:
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
 
        eor     $output_h1, $output_h1, $rk10_h                   @ AES block 1 - round 10 high
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        eor     $output_l0, $output_l0, $rk10_l                   @ AES block 0 - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
 
        eor     $output_h0, $output_h0, $rk10_h                   @ AES block 0 - round 10 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
 
        stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
@@ -1358,9 +1431,14 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
        eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
        eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
 
        aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
@@ -1391,13 +1469,17 @@ aes_gcm_dec_128_kernel:
 
        pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
        eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
        eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
 
        aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
        eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
        aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
        movi    $mod_constant.8b, #0xc2
 
@@ -1414,7 +1496,7 @@ aes_gcm_dec_128_kernel:
 
        pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
        eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 4k+4 - load ciphertext
+       ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+3 - load ciphertext
 
        aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
@@ -1435,7 +1517,7 @@ aes_gcm_dec_128_kernel:
        rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
 
        pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 4k+5 - load ciphertext
+       ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
        ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
 
        aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
@@ -1450,17 +1532,16 @@ aes_gcm_dec_128_kernel:
        eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
 
        aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 4k+6 - load ciphertext
+       ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
 
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
        eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
        eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
 
        aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 4k+3 - load ciphertext
+       ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
 
        aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
 
        rev64   $res1b, $res1b                                    @ GHASH block 4k+5
        eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
@@ -1482,11 +1563,15 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
        eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
        mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
        eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
        mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
@@ -1503,9 +1588,15 @@ aes_gcm_dec_128_kernel:
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
 
        eor     $output_h1, $output_h1, $rk10_h                   @ AES block 4k+5 - round 10 high
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
 
        eor     $output_l1, $output_l1, $rk10_l                   @ AES block 4k+5 - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
@@ -1598,9 +1689,14 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
        eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
        eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
        eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
 
        aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
@@ -1654,7 +1750,9 @@ aes_gcm_dec_128_kernel:
 
        pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
        eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
        aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
        ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
 
@@ -1667,7 +1765,9 @@ aes_gcm_dec_128_kernel:
 
        aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
        eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
        stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
 
@@ -1691,9 +1791,14 @@ aes_gcm_dec_128_kernel:
        cmp     $main_end_input_ptr, #48
 
        eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
        eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        b.gt    .L128_dec_blocks_more_than_3
 
        mov     $ctr3b, $ctr2b
@@ -1737,9 +1842,14 @@ aes_gcm_dec_128_kernel:
 
        movi    $t0.8b, #0                                        @ suppress further partial tag feed in
        eor     $output_h0, $output_h0, $rk10_h                   @ AES final-2 block - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
        eor     $output_l0, $output_l0, $rk10_l                   @ AES final-2 block - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        .L128_dec_blocks_more_than_2:                             @ blocks left >  2
 
        rev64   $res0b, $res1b                                    @ GHASH final-2 block
@@ -1765,12 +1875,18 @@ aes_gcm_dec_128_kernel:
        pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
 
        eor     $output_l0, $output_l0, $rk10_l                   @ AES final-1 block - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
 
        eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
 
        eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
        eor     $output_h0, $output_h0, $rk10_h                   @ AES final-1 block - round 10 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        .L128_dec_blocks_more_than_1:                             @ blocks left >  1
 
        rev64   $res0b, $res1b                                    @ GHASH final-1 block
@@ -1801,8 +1917,13 @@ aes_gcm_dec_128_kernel:
 
        eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
        eor     $output_h0, $output_h0, $rk10_h                   @ AES final block - round 10 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $output_l0, $output_l0, $rk10_l                   @ AES final block - round 10 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
        .L128_dec_blocks_less_than_1:                                            @ blocks left <= 1
 
@@ -1848,7 +1969,11 @@ aes_gcm_dec_128_kernel:
        bic     $end_input_ptr, $end_input_ptr, $ctr32x           @ mask out low existing bytes
        and     $output_l0, $output_l0, $ctr32x
 
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
+#else
+       mov     $ctr32w, $rctr32w
+#endif
 
        eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
        movi    $mod_constant.8b, #0xc2
@@ -1957,6 +2082,7 @@ my $mod_t="v31";
 
 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
 my $rk2q1="v20.1q";
 my $rk3q1="v21.1q";
 my $rk4v="v22";
@@ -1988,18 +2114,26 @@ aes_gcm_enc_192_kernel:
        stp     d14, d15, [sp, #96]
 
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]             @ ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
+       ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
+#ifdef __AARCH64EB__
+       ror     $rk12_l, $rk12_l, #32
+       ror     $rk12_h, $rk12_h, #32
+#endif
+       ld1     {$rk0s}, [$cc], #16                                  @ load rk0
 
-       ldr     $rk5q, [$cc, #80]                                @ load rk5
-
-       ldr     $rk4q, [$cc, #64]                                @ load rk4
+       ld1     {$rk1s}, [$cc], #16                                  @ load rk1
 
-       ldr     $rk8q, [$cc, #128]                               @ load rk8
+       ld1     {$rk2s}, [$cc], #16                                  @ load rk2
 
        lsr     $rctr32x, $ctr96_t32x, #32
-       ldr     $rk6q, [$cc, #96]                                @ load rk6
+       ld1     {$rk3s}, [$cc], #16                                  @ load rk3
        orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
 
-       ldr     $rk7q, [$cc, #112]                               @ load rk7
+       ld1     {$rk4s}, [$cc], #16                                  @ load rk4
        rev     $rctr32w, $rctr32w                               @ rev_ctr32
 
        add     $rctr32w, $rctr32w, #1                           @ increment rev_ctr32
@@ -2023,15 +2157,13 @@ aes_gcm_enc_192_kernel:
        rev     $ctr32w, $rctr32w                                @ CTR block 3
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 3
-       ldr     $rk0q, [$cc, #0]                                 @ load rk0
+       ld1     {$rk5s}, [$cc], #16                                  @ load rk5
 
        fmov    $ctr3.d[1], $ctr32x                              @ CTR block 3
 
-       ldr     $rk3q, [$cc, #48]                                @ load rk3
-
-       ldp     $rk12_l, $rk12_h, [$cc, #192]                    @ load rk12
+       ld1     {$rk6s}, [$cc], #16                                  @ load rk6
 
-       ldr     $rk1q, [$cc, #16]                                @ load rk1
+       ld1     {$rk7s}, [$cc], #16                                  @ load rk7
 
        aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 0
        ld1     { $acc_lb}, [$current_tag]
@@ -2039,29 +2171,32 @@ aes_gcm_enc_192_kernel:
        rev64   $acc_lb, $acc_lb
 
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 0
-       ldr     $rk11q, [$cc, #176]                              @ load rk11
+       ld1     {$rk8s}, [$cc], #16                                  @ load rk8
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 0
        ldr     $h4q, [$current_tag, #112]                       @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
-
+#endif
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 0
-       ldr     $rk2q, [$cc, #32]                                @ load rk2
+       ld1     {$rk9s}, [$cc], #16                                  @ load rk9
 
        aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 1
-       ldr     $rk10q, [$cc, #160]                              @ load rk10
+       ld1     {$rk10s}, [$cc], #16                             @ load rk10
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 1
        ldr     $h1q, [$current_tag, #32]                        @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
-
+#endif
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 1
-       ldr     $rk9q, [$cc, #144]                               @ load rk9
+       ld1     {$rk11s}, [$cc], #16                             @ load rk11
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 1
        ldr     $h3q, [$current_tag, #80]                        @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
-
+#endif
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 2
 
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 2
@@ -2100,8 +2235,9 @@ aes_gcm_enc_192_kernel:
 
        aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 6
        ldr     $h2q, [$current_tag, #64]                        @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
-
+#endif
        aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 6
 
        aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 6
@@ -2163,13 +2299,26 @@ aes_gcm_enc_192_kernel:
 
        rev     $ctr32w, $rctr32w                                @ CTR block 4
        ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4
        ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        add     $input_ptr, $input_ptr, #64                      @ AES input_ptr update
        cmp     $input_ptr, $main_end_input_ptr                  @ check if we have <= 8 blocks
 
@@ -2239,7 +2388,10 @@ aes_gcm_enc_192_kernel:
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
        ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0
        fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3
        rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)
@@ -2250,10 +2402,16 @@ aes_gcm_enc_192_kernel:
        pmull2  $t1.1q, $res1.2d, $h3.2d                         @ GHASH block 4k+1 - high
        rev64   $res3b, $res3b                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
        ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
        ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 4k+3 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        pmull   $t2.1q, $res1.1d, $h3.1d                         @ GHASH block 4k+1 - low
        eor     $res0b, $res0b, $acc_lb                          @ PRE 1
 
@@ -2330,7 +2488,10 @@ aes_gcm_enc_192_kernel:
 
        aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 4
        ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 6
        eor     $acc_lb, $acc_lb, $t5.16b                        @ GHASH block 4k+2 - low
 
@@ -2627,7 +2788,10 @@ aes_gcm_enc_192_kernel:
 
        sub     $main_end_input_ptr, $end_input_ptr, $input_ptr  @ main_end_input_ptr is number of bytes left to process
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $input_l0, $input_l0, $rk12_l                    @ AES block 4k+4 - round 12 low
        eor     $input_h0, $input_h0, $rk12_h                    @ AES block 4k+4 - round 12 high
 
@@ -2664,7 +2828,10 @@ aes_gcm_enc_192_kernel:
        st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        rev64   $res0b, $res1b                                   @ GHASH final-3 block
 
        eor     $input_l0, $input_l0, $rk12_l                    @ AES final-2 block - round 12 low
@@ -2695,7 +2862,10 @@ aes_gcm_enc_192_kernel:
 
        rev64   $res0b, $res1b                                   @ GHASH final-2 block
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
 
        eor     $input_h0, $input_h0, $rk12_h                    @ AES final-1 block - round 12 high
@@ -2726,7 +2896,10 @@ aes_gcm_enc_192_kernel:
        st1     { $res1b}, [$output_ptr], #16                    @ AES final-1 block - store result
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        rev64   $res0b, $res1b                                   @ GHASH final-1 block
 
        eor     $input_l0, $input_l0, $rk12_l                    @ AES final block - round 12 low
@@ -2758,7 +2931,11 @@ aes_gcm_enc_192_kernel:
        .L192_enc_blocks_less_than_1:                            @ blocks left <= 1
 
        ld1     { $rk0}, [$output_ptr]                           @ load existing bytes where the possibly partial last block is to be stored
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
+#else
+       mov     $ctr32w, $rctr32w
+#endif
        and     $bit_length, $bit_length, #127                   @ bit_length %= 128
 
        sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
@@ -2875,14 +3052,22 @@ aes_gcm_dec_192_kernel:
 
        add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
+       ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
+#ifdef __AARCH64EB__
+       ror     $rk12_l, $rk12_l, #32
+       ror     $rk12_h, $rk12_h, #32
+#endif
        ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
 
-       ldr     $rk0q, [$cc, #0]                                  @ load rk0
+       ld1     {$rk0s}, [$cc], #16                                  @ load rk0
 
        lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
        mov     $len, $main_end_input_ptr
-       ldr     $rk2q, [$cc, #32]                                 @ load rk2
+       ld1     {$rk1s}, [$cc], #16                               @ load rk1
 
        lsr     $rctr32x, $ctr96_t32x, #32
        orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
@@ -2892,14 +3077,14 @@ aes_gcm_dec_192_kernel:
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
 
        add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
-       ldr     $rk1q, [$cc, #16]                                 @ load rk1
+       ld1     {$rk2s}, [$cc], #16                               @ load rk2
 
        aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
        rev     $ctr32w, $rctr32w                                 @ CTR block 1
 
        add     $rctr32w, $rctr32w, #1                            @ CTR block 1
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
-       ldr     $rk3q, [$cc, #48]                                 @ load rk3
+       ld1     {$rk3s}, [$cc], #16                               @ load rk3
 
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
        rev     $ctr32w, $rctr32w                                 @ CTR block 2
@@ -2916,43 +3101,46 @@ aes_gcm_dec_192_kernel:
 
        fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
 
-       ldr     $rk8q, [$cc, #128]                                @ load rk8
+       ld1     {$rk4s}, [$cc], #16                               @ load rk4
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
 
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
-       ldr     $rk11q, [$cc, #176]                               @ load rk11
+       ld1     {$rk5s}, [$cc], #16                               @ load rk5
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
        ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
-
+#endif
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
        ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
-
+#endif
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
        ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
-
+#endif
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
-       ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
        ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
-
+#endif
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
-       ldr     $rk10q, [$cc, #160]                               @ load rk10
+       ld1     {$rk6s}, [$cc], #16                               @ load rk6
 
        aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
-       ldr     $rk9q, [$cc, #144]                                @ load rk9
+       ld1     {$rk7s}, [$cc], #16                               @ load rk7
 
        aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
-       ldr     $rk7q, [$cc, #112]                                @ load rk7
+       ld1     {$rk8s}, [$cc], #16                               @ load rk8
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
-       ldr     $rk4q, [$cc, #64]                                 @ load rk4
+       ld1     {$rk9s}, [$cc], #16                               @ load rk9
 
        aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
        ld1     { $acc_lb}, [$current_tag]
@@ -2966,7 +3154,7 @@ aes_gcm_dec_192_kernel:
        trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
 
        aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
-       ldr     $rk5q, [$cc, #80]                                 @ load rk5
+       ld1     {$rk10s}, [$cc], #16                              @ load rk10
 
        aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
        trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
@@ -2977,7 +3165,7 @@ aes_gcm_dec_192_kernel:
        trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
 
        aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
-       ldr     $rk6q, [$cc, #96]                                 @ load rk6
+       ld1     {$rk11s}, [$cc], #16                              @ load rk11
 
        aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
 
@@ -3041,17 +3229,13 @@ aes_gcm_dec_192_kernel:
        aese    $ctr0b, $rk11                                     @ AES block 0 - round 11
        b.ge    .L192_dec_tail                                    @ handle tail
 
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 1 - load ciphertext
-
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 0 - load ciphertext
+       ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
 
        eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
 
        eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
        rev     $ctr32w, $rctr32w                                 @ CTR block 4
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 3 - load ciphertext
-
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 2 - load ciphertext
+       ld1     {$res2b, $res3b}, [$input_ptr], #32               @ AES block 2,3 - load ciphertext
 
        mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
 
@@ -3063,27 +3247,35 @@ aes_gcm_dec_192_kernel:
 
        mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
        rev64   $res0b, $res0b                                    @ GHASH block 0
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
 
        fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
        rev64   $res1b, $res1b                                    @ GHASH block 1
        cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
 
        eor     $output_l1, $output_l1, $rk12_l                   @ AES block 1 - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
        rev     $ctr32w, $rctr32w                                 @ CTR block 5
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
        eor     $output_h1, $output_h1, $rk12_h                   @ AES block 1 - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        add     $rctr32w, $rctr32w, #1                            @ CTR block 5
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
        eor     $output_l0, $output_l0, $rk12_l                   @ AES block 0 - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        rev     $ctr32w, $rctr32w                                 @ CTR block 6
        eor     $output_h0, $output_h0, $rk12_h                   @ AES block 0 - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
 
@@ -3138,7 +3330,9 @@ aes_gcm_dec_192_kernel:
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
        eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
        eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
 
@@ -3152,7 +3346,9 @@ aes_gcm_dec_192_kernel:
        pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
        eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
        eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
        aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
 
        aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
@@ -3230,15 +3426,17 @@ aes_gcm_dec_192_kernel:
        aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
 
        aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 4k+6 - load ciphertext
+       ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
 
        aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
        eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
 
        pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 4k+7 - load ciphertext
+       ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
        eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
        ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
 
@@ -3249,10 +3447,10 @@ aes_gcm_dec_192_kernel:
        eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
 
        aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 4k+4 - load ciphertext
+       ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
 
        aese    $ctr1b, $rk11                                     @ AES block 4k+5 - round 11
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 4k+5 - load ciphertext
+       ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
        rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
 
        aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
@@ -3261,11 +3459,13 @@ aes_gcm_dec_192_kernel:
        aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
        eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
 
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
        cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
 
        eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
        eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
        eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
 
        aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
@@ -3295,18 +3495,28 @@ aes_gcm_dec_192_kernel:
        rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
 
        eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
        eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
 
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
        eor     $output_l1, $output_l1, $rk12_l                   @ AES block 4k+5 - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
        rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
        eor     $output_h1, $output_h1, $rk12_h                   @ AES block 4k+5 - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
        eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
 
@@ -3355,17 +3565,28 @@ aes_gcm_dec_192_kernel:
 
        pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
        eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
        fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
        eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
        pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
        eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
 
        pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
        eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
 
        rev64   $res3b, $res3b                                    @ GHASH block 4k+3
@@ -3517,8 +3738,13 @@ aes_gcm_dec_192_kernel:
        cmp     $main_end_input_ptr, #48
 
        eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        b.gt    .L192_dec_blocks_more_than_3
 
        movi    $acc_l.8b, #0
@@ -3562,10 +3788,16 @@ aes_gcm_dec_192_kernel:
        pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
 
        eor     $output_l0, $output_l0, $rk12_l                   @ AES final-2 block - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        movi    $t0.8b, #0                                        @ suppress further partial tag feed in
 
        pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
        eor     $output_h0, $output_h0, $rk12_h                   @ AES final-2 block - round 12 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        .L192_dec_blocks_more_than_2:                             @ blocks left >  2
 
        rev64   $res0b, $res1b                                    @ GHASH final-2 block
@@ -3595,8 +3827,13 @@ aes_gcm_dec_192_kernel:
 
        eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
        eor     $output_h0, $output_h0, $rk12_h                   @ AES final-1 block - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $output_l0, $output_l0, $rk12_l                   @ AES final-1 block - round 12 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
        .L192_dec_blocks_more_than_1:                             @ blocks left >  1
 
@@ -3627,9 +3864,13 @@ aes_gcm_dec_192_kernel:
        movi    $t0.8b, #0                                        @ suppress further partial tag feed in
        eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
        eor     $output_h0, $output_h0, $rk12_h                   @ AES final block - round 12 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $output_l0, $output_l0, $rk12_l                   @ AES final block - round 12 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
        .L192_dec_blocks_less_than_1:                             @ blocks left <= 1
 
@@ -3656,8 +3897,11 @@ aes_gcm_dec_192_kernel:
 
        orr     $output_l0, $output_l0, $end_input_ptr
        mov     $ctr0.d[1], $ctr96_b64x
-
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
+#else
+       mov     $ctr32w, $rctr32w
+#endif
 
        and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
        str     $ctr32w, [$counter, #12]                          @ store the updated counter
@@ -3782,6 +4026,7 @@ my $mod_constant="v8";
 my $mod_t="v7";
 
 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31));
 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
 my $rk2q1="v20.1q";
 my $rk3q1="v21.1q";
@@ -3817,14 +4062,22 @@ aes_gcm_enc_256_kernel:
        lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
        mov     $len, $main_end_input_ptr
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
+       ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
+#ifdef __AARCH64EB__
+       ror     $rk14_l, $rk14_l, #32
+       ror     $rk14_h, $rk14_h, #32
+#endif
        ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
        sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
 
-       ldr     $rk0q, [$cc, #0]                                  @ load rk0
+       ld1     {$rk0s}, [$cc], #16                               @ load rk0
        and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
 
-       ldr     $rk7q, [$cc, #112]                                @ load rk7
+       ld1     {$rk1s}, [$cc], #16                               @ load rk1
        add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
 
        lsr     $rctr32x, $ctr96_t32x, #32
@@ -3843,14 +4096,14 @@ aes_gcm_enc_256_kernel:
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
        add     $rctr32w, $rctr32w, #1                            @ CTR block 1
-       ldr     $rk1q, [$cc, #16]                                 @ load rk1
+       ld1     {$rk2s}, [$cc], #16                               @ load rk2
 
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
        rev     $ctr32w, $rctr32w                                 @ CTR block 2
        add     $rctr32w, $rctr32w, #1                            @ CTR block 2
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
-       ldr     $rk2q, [$cc, #32]                                 @ load rk2
+       ld1     {$rk3s}, [$cc], #16                               @ load rk3
 
        fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
        rev     $ctr32w, $rctr32w                                 @ CTR block 3
@@ -3861,46 +4114,48 @@ aes_gcm_enc_256_kernel:
        fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
-       ldr     $rk3q, [$cc, #48]                                 @ load rk3
+       ld1     {$rk4s}, [$cc], #16                               @ load rk4
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
-       ldr     $rk6q, [$cc, #96]                                 @ load rk6
+       ld1     {$rk5s}, [$cc], #16                               @ load rk5
 
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
-       ldr     $rk5q, [$cc, #80]                                 @ load rk5
+       ld1     {$rk6s}, [$cc], #16                               @ load rk6
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
        ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
-
+#endif
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
-       ldr     $rk13q, [$cc, #208]                               @ load rk13
+       ld1     {$rk7s}, [$cc], #16                               @ load rk7
 
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
-       ldr     $rk4q, [$cc, #64]                                 @ load rk4
+       ld1     {$rk8s}, [$cc], #16                               @ load rk8
 
        aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
        ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
-
+#endif
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
-       ldr     $rk12q, [$cc, #192]                               @ load rk12
+       ld1     {$rk9s}, [$cc], #16                               @ load rk9
 
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
        ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
-
+#endif
        aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
-       ldr     $rk11q, [$cc, #176]                               @ load rk11
+       ld1     {$rk10s}, [$cc], #16                              @ load rk10
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
-       ldr     $rk8q, [$cc, #128]                                @ load rk8
+       ld1     {$rk11s}, [$cc], #16                              @ load rk11
 
        aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
        add     $rctr32w, $rctr32w, #1                            @ CTR block 3
 
        aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
-       ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
 
        aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
        ld1     { $acc_lb}, [$current_tag]
@@ -3927,14 +4182,15 @@ aes_gcm_enc_256_kernel:
        trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
 
        aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
-       ldr     $rk9q, [$cc, #144]                                @ load rk9
+       ld1     {$rk12s}, [$cc], #16                              @ load rk12
 
        aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
        ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
-
+#endif
        aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
-       ldr     $rk10q, [$cc, #160]                               @ load rk10
+       ld1     {$rk13s}, [$cc], #16                              @ load rk13
 
        aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
        trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
@@ -3999,13 +4255,26 @@ aes_gcm_enc_256_kernel:
        b.ge    .L256_enc_tail                                    @ handle tail
 
        ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        rev     $ctr32w, $rctr32w                                 @ CTR block 4
        ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
 
        eor     $input_l1, $input_l1, $rk14_l                     @ AES block 1 - round 14 low
@@ -4083,10 +4352,16 @@ aes_gcm_enc_256_kernel:
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
        ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+7 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l3, $input_l3
+       rev     $input_h3, $input_h3
+#endif
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
        ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l2, $input_l2
+       rev     $input_h2, $input_h2
+#endif
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
        eor     $res0b, $res0b, $acc_lb                           @ PRE 1
 
@@ -4172,7 +4447,10 @@ aes_gcm_enc_256_kernel:
 
        aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
        ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l1, $input_l1
+       rev     $input_h1, $input_h1
+#endif
        aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
        mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
 
@@ -4202,7 +4480,10 @@ aes_gcm_enc_256_kernel:
 
        aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
        ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
        shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
 
@@ -4497,7 +4778,10 @@ aes_gcm_enc_256_kernel:
        ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
        sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
        ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $input_l0, $input_l0, $rk14_l                     @ AES block 4k+4 - round 14 low
        eor     $input_h0, $input_h0, $rk14_h                     @ AES block 4k+4 - round 14 high
 
@@ -4532,7 +4816,10 @@ aes_gcm_enc_256_kernel:
        st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        rev64   $res0b, $res1b                                   @ GHASH final-3 block
 
        eor     $input_l0, $input_l0, $rk14_l                    @ AES final-2 block - round 14 low
@@ -4561,7 +4848,10 @@ aes_gcm_enc_256_kernel:
        st1     { $res1b}, [$output_ptr], #16                    @ AES final-2 block - store result
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        rev64   $res0b, $res1b                                   @ GHASH final-2 block
 
        eor     $input_l0, $input_l0, $rk14_l                    @ AES final-1 block - round 14 low
@@ -4597,7 +4887,10 @@ aes_gcm_enc_256_kernel:
        rev64   $res0b, $res1b                                   @ GHASH final-1 block
 
        ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+       rev     $input_l0, $input_l0
+       rev     $input_h0, $input_h0
+#endif
        eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
 
        movi    $t0.8b, #0                                       @ suppress further partial tag feed in
@@ -4658,7 +4951,11 @@ aes_gcm_enc_256_kernel:
 
        pmull2  $rk2q1, $res0.2d, $h1.2d                         @ GHASH final block - high
        mov     $t0d, $res0.d[1]                                 @ GHASH final block - mid
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
+#else
+       mov     $ctr32w, $rctr32w
+#endif
 
        pmull   $rk3q1, $res0.1d, $h1.1d                         @ GHASH final block - low
 
@@ -4749,21 +5046,29 @@ aes_gcm_dec_256_kernel:
        lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
        mov     $len, $main_end_input_ptr
        ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
-
-       ldr     $rk8q, [$cc, #128]                                @ load rk8
+#ifdef __AARCH64EB__
+       rev     $ctr96_b64x, $ctr96_b64x
+       rev     $ctr96_t32x, $ctr96_t32x
+#endif
+       ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
+#ifdef __AARCH64EB__
+       ror     $rk14_h, $rk14_h, #32
+       ror     $rk14_l, $rk14_l, #32
+#endif
+       ld1     {$rk0s}, [$cc], #16                               @ load rk0
        sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
 
-       ldr     $rk7q, [$cc, #112]                                @ load rk7
+       ld1     {$rk1s}, [$cc], #16                               @ load rk1
        and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
 
        add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
-       ldr     $rk6q, [$cc, #96]                                 @ load rk6
+       ld1     {$rk2s}, [$cc], #16                               @ load rk2
 
        lsr     $rctr32x, $ctr96_t32x, #32
-       ldr     $rk5q, [$cc, #80]                                 @ load rk5
+       ld1     {$rk3s}, [$cc], #16                               @ load rk3
        orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
 
-       ldr     $rk3q, [$cc, #48]                                 @ load rk3
+       ld1     {$rk4s}, [$cc], #16                               @ load rk4
        add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
        rev     $rctr32w, $rctr32w                                @ rev_ctr32
 
@@ -4788,34 +5093,39 @@ aes_gcm_dec_256_kernel:
        rev     $ctr32w, $rctr32w                                 @ CTR block 3
 
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
-       ldr     $rk0q, [$cc, #0]                                  @ load rk0
+       ld1     {$rk5s}, [$cc], #16                               @ load rk5
 
        fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
        add     $rctr32w, $rctr32w, #1                            @ CTR block 3
 
-       ldr     $rk4q, [$cc, #64]                                 @ load rk4
+       ld1     {$rk6s}, [$cc], #16                               @ load rk6
 
-       ldr     $rk13q, [$cc, #208]                               @ load rk13
+       ld1     {$rk7s}, [$cc], #16                               @ load rk7
 
-       ldr     $rk1q, [$cc, #16]                                 @ load rk1
+       ld1     {$rk8s}, [$cc], #16                               @ load rk8
 
        aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
        ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
+#ifndef __AARCH64EB__
        ext     $h3b, $h3b, $h3b, #8
+#endif
 
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
        ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
+#ifndef __AARCH64EB__
        ext     $h4b, $h4b, $h4b, #8
+#endif
 
        aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
        ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
+#ifndef __AARCH64EB__
        ext     $h2b, $h2b, $h2b, #8
+#endif
 
        aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
-       ldr     $rk2q, [$cc, #32]                                 @ load rk2
+       ld1     {$rk9s}, [$cc], #16                                 @ load rk9
 
        aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
-       ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
 
        aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
        ld1     { $acc_lb}, [$current_tag]
@@ -4823,17 +5133,18 @@ aes_gcm_dec_256_kernel:
        rev64   $acc_lb, $acc_lb
 
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
-       ldr     $rk9q, [$cc, #144]                                @ load rk9
+       ld1     {$rk10s}, [$cc], #16                              @ load rk10
 
        aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
-       ldr     $rk12q, [$cc, #192]                               @ load rk12
+       ld1     {$rk11s}, [$cc], #16                              @ load rk11
 
        aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
        ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
+#ifndef __AARCH64EB__
        ext     $h1b, $h1b, $h1b, #8
-
+#endif
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
-       ldr     $rk10q, [$cc, #160]                               @ load rk10
+       ld1     {$rk12s}, [$cc], #16                              @ load rk12
 
        aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
 
@@ -4889,7 +5200,7 @@ aes_gcm_dec_256_kernel:
        aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
 
        aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
-       ldr     $rk11q, [$cc, #176]                               @ load rk11
+       ld1     {$rk13s}, [$cc], #16                             @ load rk13
 
        aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
 
@@ -4939,9 +5250,7 @@ aes_gcm_dec_256_kernel:
        aese    $ctr0b, $rk13                                     @ AES block 0 - round 13
        b.ge    .L256_dec_tail                                    @ handle tail
 
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 0 - load ciphertext
-
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 1 - load ciphertext
+       ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
 
        rev     $ctr32w, $rctr32w                                 @ CTR block 4
 
@@ -4949,7 +5258,7 @@ aes_gcm_dec_256_kernel:
 
        eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
        rev64   $res1b, $res1b                                    @ GHASH block 1
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 3 - load ciphertext
+       ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
 
        mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
 
@@ -4969,22 +5278,32 @@ aes_gcm_dec_256_kernel:
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
        mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
        eor     $output_h0, $output_h0, $rk14_h                   @ AES block 0 - round 14 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        eor     $output_l0, $output_l0, $rk14_l                   @ AES block 0 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
        fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
 
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 2 - load ciphertext
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
+       ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
 
        fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
        rev     $ctr32w, $rctr32w                                 @ CTR block 6
        add     $rctr32w, $rctr32w, #1                            @ CTR block 6
 
        eor     $output_l1, $output_l1, $rk14_l                   @ AES block 1 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
 
        eor     $output_h1, $output_h1, $rk14_h                   @ AES block 1 - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
 
        eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
@@ -5027,7 +5346,9 @@ aes_gcm_dec_256_kernel:
 
        aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
        eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
        mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
 
@@ -5036,7 +5357,9 @@ aes_gcm_dec_256_kernel:
 
        aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
        eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
        aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
        stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
 
@@ -5049,9 +5372,14 @@ aes_gcm_dec_256_kernel:
 
        pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
        eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
        eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
        eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
 
        aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
@@ -5145,7 +5473,7 @@ aes_gcm_dec_256_kernel:
        eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
 
        aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
-       ldr     $res0q, [$input_ptr, #0]                          @ AES block 4k+4 - load ciphertext
+       ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
 
        aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
        ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
@@ -5154,7 +5482,7 @@ aes_gcm_dec_256_kernel:
        eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
 
        aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
-       ldr     $res1q, [$input_ptr, #16]                         @ AES block 4k+5 - load ciphertext
+       ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
 
        aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
        eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
@@ -5166,10 +5494,10 @@ aes_gcm_dec_256_kernel:
        eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
 
        aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
-       ldr     $res3q, [$input_ptr, #48]                         @ AES block 4k+7 - load ciphertext
+       ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
 
        aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
-       ldr     $res2q, [$input_ptr, #32]                         @ AES block 4k+6 - load ciphertext
+       ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
 
        aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
        mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
@@ -5178,7 +5506,6 @@ aes_gcm_dec_256_kernel:
        eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
 
        aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
-       add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
        mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
 
        aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
@@ -5198,8 +5525,13 @@ aes_gcm_dec_256_kernel:
        add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
 
        eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
        eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
        eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
@@ -5219,9 +5551,15 @@ aes_gcm_dec_256_kernel:
 
        rev64   $res1b, $res1b                                    @ GHASH block 4k+5
        eor     $output_h1, $output_h1, $rk14_h                   @ AES block 4k+5 - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h1, $output_h1
+#endif
        stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
 
        eor     $output_l1, $output_l1, $rk14_l                   @ AES block 4k+5 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l1, $output_l1
+#endif
        stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
 
        rev64   $res0b, $res0b                                    @ GHASH block 4k+4
@@ -5385,10 +5723,14 @@ aes_gcm_dec_256_kernel:
 
        aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
        eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+       rev     $output_h2, $output_h2
+#endif
        aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
        eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l3, $output_l3
+#endif
        aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
        eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
 
@@ -5397,11 +5739,17 @@ aes_gcm_dec_256_kernel:
 
        aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
        eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l2, $output_l2
+#endif
 
        aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
 
        pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
        eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h3, $output_h3
+#endif
 
        aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
        stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
@@ -5438,8 +5786,14 @@ aes_gcm_dec_256_kernel:
        cmp     $main_end_input_ptr, #48
 
        eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
 
        eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        b.gt    .L256_dec_blocks_more_than_3
 
        sub     $rctr32w, $rctr32w, #1
@@ -5487,9 +5841,15 @@ aes_gcm_dec_256_kernel:
 
        pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
        eor     $output_l0, $output_l0, $rk14_l                  @ AES final-2 block - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
 
        pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
        eor     $output_h0, $output_h0, $rk14_h                  @ AES final-2 block - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        .L256_dec_blocks_more_than_2:                            @ blocks left >  2
 
        rev64   $res0b, $res1b                                   @ GHASH final-2 block
@@ -5517,9 +5877,15 @@ aes_gcm_dec_256_kernel:
 
        eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
        eor     $output_l0, $output_l0, $rk14_l                  @ AES final-1 block - round 14 low
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
 
        eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
        eor     $output_h0, $output_h0, $rk14_h                  @ AES final-1 block - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        .L256_dec_blocks_more_than_1:                            @ blocks left >  1
 
        stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-1 block  - store result
@@ -5547,13 +5913,18 @@ aes_gcm_dec_256_kernel:
 
        pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
        eor     $output_l0, $output_l0, $rk14_l                  @ AES final block - round 14 low
-
+#ifdef __AARCH64EB__
+       rev     $output_l0, $output_l0
+#endif
        eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
 
        eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
 
        eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
        eor     $output_h0, $output_h0, $rk14_h                  @ AES final block - round 14 high
+#ifdef __AARCH64EB__
+       rev     $output_h0, $output_h0
+#endif
        .L256_dec_blocks_less_than_1:                            @ blocks left <= 1
 
        and     $bit_length, $bit_length, #127                   @ bit_length %= 128
@@ -5579,7 +5950,11 @@ aes_gcm_dec_256_kernel:
        mov     $ctr0.d[1], $ctr96_b64x
        bic     $end_input_ptr, $end_input_ptr, $ctr32x          @ mask out low existing bytes
 
+#ifndef __AARCH64EB__
        rev     $ctr32w, $rctr32w
+#else
+       mov     $ctr32w, $rctr32w
+#endif
 
        bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x      @ mask out high existing bytes
 
@@ -5720,7 +6095,7 @@ if ($flavour =~ /64/) {         ######## 64-bit code
         if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
             print "     it      $2\n";
         }
-
+        s/__AARCH64E([BL])__/__ARME$1__/go;
         print $_,"\n";
     }
 }