-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
# ====================================================================
# Bit-sliced AES for ARM NEON
# to collect performance results, which for Cortex-A8 core are:
#
# encrypt 19.5 cycles per byte processed with 128-bit key
-# decrypt 24.0 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
# key conv. 440 cycles per 128-bit key/0.18 of 8x block
#
-# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
# which is [much] worse than anticipated (for further details see
# http://www.openssl.org/~appro/Snapdragon-S4.html).
#
-# Cortex-A15 manages in 14.2/19.6 cycles [when integer-only code
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
# manages in 20.0 cycles].
#
# When comparing to x86_64 results keep in mind that NEON unit is
#
# <appro@openssl.org>
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
my @XMM=map("q$_",(0..15));
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
+my $inv=@_[16]; # optional
$code.=<<___;
vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
vext.8 @t[1], @x[1], @x[1], #12
veor @t[3], @t[3], @x[7]
vext.8 @x[6], @x[2], @x[2], #8
veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
veor @x[2], @t[0], @t[4]
-
veor @x[4], @x[4], @t[3]
veor @x[5], @x[5], @t[7]
veor @x[3], @x[3], @t[6]
veor @x[6], @x[6], @t[2]
@ vmov @x[7], @t[1]
___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
}
-sub InvMixColumns {
+sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
___
}
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
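+# The factorisation above can be checked independently of the generated
+# assembly: convolve the first rows of the two circulant matrices over
+# GF(2^8) and compare with the InvMixColumns row 0e-0b-0d-09. The snippet
+# below is only an illustrative self-check, not part of the generated
+# module; the BSAES_CHECK_IMC environment variable is a hypothetical hook
+# used nowhere else, and the result goes to STDERR so the emitted .S file
+# is unaffected.
+if ($ENV{BSAES_CHECK_IMC}) {
+	my $gf8_mul = sub {	# multiply in GF(2^8) modulo x^8+x^4+x^3+x+1
+		my ($x,$y)=@_; my $r=0;
+		for (0..7) {
+			$r ^= $x if ($y & 1);
+			$y >>= 1;
+			$x <<= 1;
+			$x ^= 0x11b if ($x & 0x100);
+		}
+		$r;
+	};
+	my @mc  = (0x02,0x03,0x01,0x01);	# MixColumns row
+	my @aux = (0x05,0x00,0x04,0x00);	# 05-00-04-00 row
+	my @row;
+	for my $j (0..3) {			# row 0 of the matrix product
+		my $acc = 0;
+		$acc ^= $gf8_mul->($mc[$_],$aux[($j-$_)%4]) for (0..3);
+		push @row, $acc;
+	}
+	printf STDERR "%02x %02x %02x %02x\n", @row;	# expect 0e 0b 0d 09
+}
+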
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
}
$code.=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
-#if __ARM_ARCH__>=7
-.text
-.code 32
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
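+
+@ Outside the kernel, AAPCS makes d8-d15 callee-saved, hence the
+@ VFP_ABI_PUSH/POP pair above; the kernel build leaves them empty, keeps
+@ the bit-sliced key schedule inside the key structure instead of on the
+@ stack (BSAES_ASM_EXTENDED_KEY), and chains the XTS tweak through memory
+@ (XTS_CHAIN_TWEAK).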
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code 32
+# undef __thumb2__
+#endif
+
.type _bsaes_decrypt8,%function
.align 4
_bsaes_decrypt8:
- sub $const,pc,#8 @ _bsaes_decrypt8
+ adr $const,_bsaes_decrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0ISR
+#else
add $const,$const,#.LM0ISR-_bsaes_decrypt8
+#endif
vldmia $const!, {@XMM[8]} @ .LM0ISR
veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
addeq $const,$const,#0x10
bne .Ldec_loop
vldmia $const, {@XMM[12]} @ .LISRM0
.LM0:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LREVM0SR:
- .quad 0x090d02060c030708, 0x00040b0f050a0e01
-.Lxts_magic:
- .quad 1, 0x87
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 6
.size _bsaes_const,.-_bsaes_const
.type _bsaes_encrypt8,%function
.align 4
_bsaes_encrypt8:
- sub $const,pc,#8 @ _bsaes_encrypt8
+ adr $const,_bsaes_encrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0SR
+#else
sub $const,$const,#_bsaes_encrypt8-.LM0SR
+#endif
vldmia $const!, {@XMM[8]} @ .LM0SR
_bsaes_encrypt8_alt:
&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, sanity check in ARM
addeq $const,$const,#0x10
bne .Lenc_loop
vldmia $const, {@XMM[12]} @ .LSRM0
.type _bsaes_key_convert,%function
.align 4
_bsaes_key_convert:
- sub $const,pc,#8 @ _bsaes_key_convert
+ adr $const,_bsaes_key_convert
vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0
+#else
sub $const,$const,#_bsaes_key_convert-.LM0
+#endif
vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
vmov.i8 @XMM[8], #0x01 @ bit masks
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
+#ifndef __KERNEL__
cmp $len, #128
+#ifndef __thumb__
blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
@ it is up to the caller to make sure we are called with enc == 0
+ mov ip, sp
stmdb sp!, {r4-r10, lr}
- vstmdb sp!, {d8-d15} @ ABI specification says so
- ldr $ivp, [sp, #0x60] @ IV is 1st arg on the stack
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
mov $len, $len, lsr#4 @ len in 16 byte blocks
sub sp, #0x10 @ scratch space to carry over the IV
mov $fp, sp @ save sp
- @ allocate the key schedule on the stack
ldr $rounds, [$key, #240] @ get # of rounds
- sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
- add sp, sp, #`128-32` @ size of bit-sliced key schedule
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
- mov r12, $keysched @ pass key schedule
+ mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
vldmia $keysched, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
vld1.8 {@XMM[15]}, [$ivp] @ load IV
b .Lcbc_dec_loop
vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
mov r5, $rounds
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
cmp $len, #2
blo .Lcbc_dec_one
vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
mov r5, $rounds
vstmia $fp, {@XMM[15]} @ put aside IV
beq .Lcbc_dec_two
vmov @XMM[4],@XMM[15] @ just in case ensure that IV
vmov @XMM[5],@XMM[0] @ and input are preserved
bl AES_decrypt
- vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ vld1.8 {@XMM[0]}, [$fp] @ load result
veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
vst1.8 {@XMM[0]}, [$rounds] @ write output
.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
vmov.i32 q0, #0
vmov.i32 q1, #0
.Lcbc_dec_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
- teq $keysched, $fp
+ cmp $keysched, $fp
bne .Lcbc_dec_bzero
+#endif
- add sp, $fp, #0x10
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
vst1.8 {@XMM[15]}, [$ivp] @ return IV
- vldmia sp!, {d8-d15}
+ VFP_ABI_POP
ldmia sp!, {r4-r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
___
cmp $len, #8 @ use plain AES for
blo .Lctr_enc_short @ small sizes
+ mov ip, sp
stmdb sp!, {r4-r10, lr}
- vstmdb sp!, {d8-d15} @ ABI specification says so
- ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
sub sp, sp, #0x10 @ scratch space to carry over the ctr
mov $fp, sp @ save sp
- @ allocate the key schedule on the stack
ldr $rounds, [$key, #240] @ get # of rounds
- sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
- add sp, sp, #`128-32` @ size of bit-sliced key schedule
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
- mov r12, $keysched @ pass key schedule
+ mov sp, r12 @ sp is $keysched
bl _bsaes_key_convert
veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
+#ifdef __APPLE__
+ mov $ctr, #:lower16:(.LREVM0SR-.LM0)
+ add $ctr, $const, $ctr
+#else
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+#endif
vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
- vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96
- vmov.i32 `&Dlo("@XMM[8]")`,#0
- vrev32.8 `&Dhi("@XMM[0]")`,`&Dhi("@XMM[0]")`
- vshl.u64 `&Dhi("@XMM[8]")`,#32
- vrev32.8 `&Dhi("@XMM[4]")`,`&Dhi("@XMM[4]")`
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
b .Lctr_enc_loop
@ to flip byte order in 32-bit counter
vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
+#ifdef __APPLE__
+ mov $const, #:lower16:(.LREVM0SR-.LSR)
+ sub $const, $ctr, $const
+#else
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+#endif
bl _bsaes_encrypt8_alt
vst1.8 {@XMM[4]}, [$out]!
veor @XMM[5], @XMM[15]
vst1.8 {@XMM[6]}, [$out]!
- vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
vst1.8 {@XMM[3]}, [$out]!
- vmov.i32 `&Dlo("@XMM[8]")`,#0
+ veor @XMM[9], @XMM[9], @XMM[9]
vst1.8 {@XMM[7]}, [$out]!
- vshl.u64 `&Dhi("@XMM[8]")`,#32
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
vst1.8 {@XMM[2]}, [$out]!
vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
vst1.8 {@XMM[5]}, [$out]!
.Lctr_enc_done:
vmov.i32 q0, #0
vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
.Lctr_enc_bzero: @ wipe key schedule [if any]
vstmia $keysched!, {q0-q1}
- teq $keysched, $fp
+ cmp $keysched, $fp
bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
- add sp, $fp, #0x10
- vldmia sp!, {d8-d15}
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.align 4
rev r8, r8
#endif
sub sp, sp, #0x10
- vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
- vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
subs r6, r6, #1
bne .Lctr_enc_short_loop
- add sp, sp, #0x20
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
ldmia sp!, {r4-r8, pc}
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
.type bsaes_xts_encrypt,%function
.align 4
bsaes_xts_encrypt:
+ mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
- vstmdb sp!, {d8-d15} @ 0x40
+ VFP_ABI_PUSH
mov r6, sp @ future $fp
- sub sp, #0x10 @ 0x10
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
- bic sp, #0xf @ align at 16 bytes
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
@ generate initial tweak
- ldr r0, [r6, #0x64] @ iv[]
+ ldr r0, [ip, #4] @ iv[]
mov r1, sp
- ldr r2, [r6, #0x60] @ key2
+ ldr r2, [ip, #0] @ key2
bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
- @ allocate the key schedule on the stack
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
- mov r0, sp @ pointer to initial tweak
- sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
- @ add sp, sp, #`128-32` @ size of bit-sliced key schedule
- sub sp, sp, #`32+16` @ place for tweak[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
- add r12, sp, #0x90 @ pass key schedule
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
- add $magic, $const, #.Lxts_magic-.LM0
+ adr $magic, .Lxts_magic
subs $len, #0x80
blo .Lxts_enc_short
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_enc_done
.align 4
.Lxts_enc_6:
- vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
-
veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
b .Lxts_enc_done
-.align 4
-.Lxts_enc_5:
- vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+@ put this in range for both ARM and Thumb mode adr instructions
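+@ .Lxts_magic doubles the tweak in GF(2^128): the 1 carries bit 63 into the
+@ upper half and 0x87 folds the bit shifted out of position 127 back in
+@ (x^128 = x^7 + x^2 + x + 1)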
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_enc_done
.align 4
.Lxts_enc_4:
- vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
-
veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_enc_done
.align 4
.Lxts_enc_3:
- vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
-
veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_enc_done
.align 4
.Lxts_enc_2:
- vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
-
veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
.align 4
.Lxts_enc_1:
mov r0, sp
- veor @XMM[0], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
mov r2, $key
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_enc_ret
sub r6, $out, #0x10
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
+#endif
.Lxts_enc_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
.Lxts_enc_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
- teq sp, r0
+ cmp sp, r0
bne .Lxts_enc_bzero
mov sp, $fp
- vldmia sp!, {d8-d15}
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.type bsaes_xts_decrypt,%function
.align 4
bsaes_xts_decrypt:
+ mov ip, sp
stmdb sp!, {r4-r10, lr} @ 0x20
- vstmdb sp!, {d8-d15} @ 0x40
+ VFP_ABI_PUSH
mov r6, sp @ future $fp
- sub sp, #0x10 @ 0x10
mov $inp, r0
mov $out, r1
mov $len, r2
mov $key, r3
- bic sp, #0xf @ align at 16 bytes
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
@ generate initial tweak
- ldr r0, [r6, #0x64] @ iv[]
+ ldr r0, [ip, #4] @ iv[]
mov r1, sp
- ldr r2, [r6, #0x60] @ key2
+ ldr r2, [ip, #0] @ key2
bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
- @ allocate the key schedule on the stack
ldr $rounds, [$key, #240] @ get # of rounds
mov $fp, r6
- mov r0, sp @ pointer to initial tweak
- sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key
- @ add sp, sp, #`128-32` @ size of bit-sliced key schedule
- sub sp, sp, #`32+16` @ place for tweak[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
@ populate the key schedule
mov r4, $key @ pass key
mov r5, $rounds @ pass # of rounds
- add r12, sp, #0x90 @ pass key schedule
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
bl _bsaes_key_convert
add r4, sp, #0x90
vldmia r4, {@XMM[6]}
vstmia r12, {@XMM[15]} @ save last round key
veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
vld1.8 {@XMM[8]}, [r0] @ initial tweak
- add $magic, $const, #.Lxts_magic-.LM0
+ adr $magic, .Lxts_magic
+#ifndef XTS_CHAIN_TWEAK
tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
subne $len, #0x10 @ subtract another 16 bytes
+#endif
subs $len, #0x80
blo .Lxts_dec_short
vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
veor @XMM[7], @XMM[7], @XMM[15]
vld1.8 {@XMM[6]}, [$inp]!
veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[6], @XMM[6], @XMM[14]
mov r5, $rounds @ pass rounds
mov r0, sp
vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[5], @XMM[5], @XMM[13]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_dec_done
.align 4
.Lxts_dec_5:
- vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
-
veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[4], @XMM[4], @XMM[12]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_dec_done
.align 4
.Lxts_dec_4:
- vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
-
veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[3], @XMM[3], @XMM[11]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_dec_done
.align 4
.Lxts_dec_3:
- vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
-
veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[2], @XMM[2], @XMM[10]
mov r5, $rounds @ pass rounds
mov r0, sp
b .Lxts_dec_done
.align 4
.Lxts_dec_2:
- vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
-
veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
veor @XMM[1], @XMM[1], @XMM[9]
mov r5, $rounds @ pass rounds
mov r0, sp
.align 4
.Lxts_dec_1:
mov r0, sp
- veor @XMM[0], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
mov r1, sp
vst1.8 {@XMM[0]}, [sp,:128]
+ mov r5, $magic @ preserve magic
mov r2, $key
mov r4, $fp @ preserve fp
- mov r5, $magic @ preserve magic
bl AES_decrypt
vmov @XMM[8], @XMM[9] @ next round tweak
.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
adds $len, #0x10
beq .Lxts_dec_ret
veor @XMM[0], @XMM[0], @XMM[8]
vst1.8 {@XMM[0]}, [r6]
mov $fp, r4
+#endif
.Lxts_dec_ret:
bic r0, $fp, #0xf
vmov.i32 q0, #0
vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
.Lxts_dec_bzero: @ wipe key schedule [if any]
vstmia sp!, {q0-q1}
- teq sp, r0
+ cmp sp, r0
bne .Lxts_dec_bzero
mov sp, $fp
- vldmia sp!, {d8-d15}
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
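+# Copy this script's leading comment block (licence and attribution) to the
+# top of the output, turning "#" into the assembler's "@"; stop at the first
+# line that is neither a comment nor blank.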
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT;