X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Fbsaes-armv7.pl;h=ba641eb547f68b59e04de2881da1b0271500b3cc;hp=2353aced5dd7f62b9124bf2e3ff3f1bdb33841ad;hb=0822d41b6d54132df96c02cc6f6fa9b179378351;hpb=29f41e8a80c1a7341998958adc32cf270032d7e5 diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl index 2353aced5d..ba641eb547 100644 --- a/crypto/aes/asm/bsaes-armv7.pl +++ b/crypto/aes/asm/bsaes-armv7.pl @@ -1,10 +1,21 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel +# . Permission to use under GPL terms is +# granted. # ==================================================================== # Bit-sliced AES for ARM NEON @@ -19,14 +30,14 @@ # to collect performance results, which for Cortex-A8 core are: # # encrypt 19.5 cycles per byte processed with 128-bit key -# decrypt 24.0 cycles per byte processed with 128-bit key +# decrypt 22.1 cycles per byte processed with 128-bit key # key conv. 440 cycles per 128-bit key/0.18 of 8x block # -# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6, +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, # which is [much] worse than anticipated (for further details see # http://www.openssl.org/~appro/Snapdragon-S4.html). # -# Cortex-A15 manages in 14.2/19.6 cycles [when integer-only code +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code # manages in 20.0 cycles]. # # When comparing to x86_64 results keep in mind that NEON unit is @@ -37,8 +48,26 @@ # # -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +# April-August 2013 +# +# Add CBC, CTR and XTS subroutines, adapt for kernel use. 
+# +# + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); my @XMM=map("q$_",(0..15)); @@ -62,7 +91,7 @@ my @s=@_[12..15]; sub InBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb my @b=@_[0..7]; $code.=<<___; veor @b[2], @b[2], @b[1] @@ -367,6 +396,7 @@ sub MixColumns { # modified to emit output in order suitable for feeding back to aesenc[last] my @x=@_[0..7]; my @t=@_[8..15]; +my $inv=@_[16]; # optional $code.=<<___; vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 vext.8 @t[1], @x[1], @x[1], #12 @@ -407,8 +437,9 @@ $code.=<<___; veor @t[3], @t[3], @x[7] vext.8 @x[6], @x[2], @x[2], #8 veor @x[7], @t[1], @t[5] +___ +$code.=<<___ if (!$inv); veor @x[2], @t[0], @t[4] - veor @x[4], @x[4], @t[3] veor @x[5], @x[5], @t[7] veor @x[3], @x[3], @t[6] @@ -416,9 +447,18 @@ $code.=<<___; veor @x[6], @x[6], @t[2] @ vmov @x[7], @t[1] ___ +$code.=<<___ if ($inv); + veor @t[3], @t[3], @x[4] + veor @x[5], @x[5], @t[7] + veor @x[2], @x[3], @t[6] + veor @x[3], @t[0], @t[4] + veor @x[4], @x[6], @t[2] + vmov @x[6], @t[3] + @ vmov @x[7], @t[1] +___ } -sub InvMixColumns { +sub InvMixColumns_orig { my @x=@_[0..7]; my @t=@_[8..15]; @@ -571,6 +611,54 @@ $code.=<<___; ___ } +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 @t[0], @x[0], @x[0], #8 + vext.8 @t[6], @x[6], @x[6], #8 + vext.8 @t[7], @x[7], @x[7], #8 + veor @t[0], @t[0], @x[0] + vext.8 @t[1], @x[1], @x[1], #8 + veor @t[6], @t[6], @x[6] + vext.8 @t[2], @x[2], @x[2], #8 + veor @t[7], @t[7], @x[7] + vext.8 @t[3], @x[3], @x[3], #8 + veor @t[1], @t[1], @x[1] + vext.8 @t[4], @x[4], @x[4], #8 + veor @t[2], @t[2], @x[2] + vext.8 @t[5], @x[5], @x[5], #8 + veor @t[3], @t[3], @x[3] + veor @t[4], @t[4], @x[4] + veor @t[5], @t[5], @x[5] + + veor @x[0], @x[0], @t[6] + veor @x[1], @x[1], @t[6] + veor @x[2], @x[2], @t[0] + veor @x[4], @x[4], @t[2] + veor @x[3], @x[3], @t[1] + veor @x[1], @x[1], @t[7] + veor @x[2], @x[2], @t[7] + veor @x[4], @x[4], @t[6] + veor @x[5], @x[5], @t[3] + veor @x[3], @x[3], @t[6] + veor @x[6], @x[6], @t[4] + veor @x[4], @x[4], @t[7] + veor @x[5], @x[5], @t[7] + veor @x[7], @x[7], @t[5] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + sub swapmove { my ($a,$b,$n,$mask,$t)=@_; $code.=<<___; @@ -620,19 +708,49 @@ ___ } $code.=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" -#if __ARM_ARCH__>=7 -.text -.code 32 +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define 
__ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a .fpu neon +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: - sub $const,pc,#8 @ _bsaes_decrypt8 + adr $const,_bsaes_decrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0ISR +#else add $const,$const,#.LM0ISR-_bsaes_decrypt8 +#endif vldmia $const!, {@XMM[8]} @ .LM0ISR veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key @@ -677,6 +795,7 @@ ___ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); $code.=<<___; vldmia $const, {@XMM[12]} @ .LISR + ite eq @ Thumb2 thing, sanity check in ARM addeq $const,$const,#0x10 bne .Ldec_loop vldmia $const, {@XMM[12]} @ .LISRM0 @@ -716,9 +835,7 @@ _bsaes_const: .LM0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d .LREVM0SR: - .quad 0x090d02060c030708, 0x00040b0f050a0e01 -.Lxts_magic: - .quad 1, 0x87 + .quad 0x090d01050c000408, 0x03070b0f060a0e02 .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " .align 6 .size _bsaes_const,.-_bsaes_const @@ -726,9 +843,13 @@ _bsaes_const: .type _bsaes_encrypt8,%function .align 4 _bsaes_encrypt8: - sub $const,pc,#8 @ _bsaes_encrypt8 + adr $const,_bsaes_encrypt8 vldmia $key!, {@XMM[9]} @ round 0 key +#ifdef __APPLE__ + adr $const,.LM0SR +#else sub $const,$const,#_bsaes_encrypt8-.LM0SR +#endif vldmia $const!, {@XMM[8]} @ .LM0SR _bsaes_encrypt8_alt: @@ -775,6 +896,7 @@ ___ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); $code.=<<___; vldmia $const, {@XMM[12]} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM addeq $const,$const,#0x10 bne .Lenc_loop vldmia $const, {@XMM[12]} @ .LSRM0 @@ -829,9 +951,13 @@ $code.=<<___; .type _bsaes_key_convert,%function .align 4 _bsaes_key_convert: - sub $const,pc,#8 @ _bsaes_key_convert + adr $const,_bsaes_key_convert vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key +#ifdef __APPLE__ + adr $const,.LM0 +#else sub $const,$const,#_bsaes_key_convert-.LM0 +#endif vld1.8 {@XMM[15]}, [$inp]! 
@ load round 1 key vmov.i8 @XMM[8], #0x01 @ bit masks @@ -998,32 +1124,62 @@ $code.=<<___; .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: +#ifndef __KERNEL__ cmp $len, #128 +#ifndef __thumb__ blo AES_cbc_encrypt +#else + bhs 1f + b AES_cbc_encrypt +1: +#endif +#endif @ it is up to the caller to make sure we are called with enc == 0 + mov ip, sp stmdb sp!, {r4-r10, lr} - vstmdb sp!, {d8-d15} @ ABI specification says so - ldr $ivp, [sp, #0x60] @ IV is 1st arg on the stack + VFP_ABI_PUSH + ldr $ivp, [ip] @ IV is 1st arg on the stack mov $len, $len, lsr#4 @ len in 16 byte blocks sub sp, #0x10 @ scratch space to carry over the IV mov $fp, sp @ save sp - @ allocate the key schedule on the stack ldr $rounds, [$key, #240] @ get # of rounds - sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key - add sp, sp, #`128-32` @ size of bit-sliced key schedule +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ sifze of bit-slices key schedule @ populate the key schedule mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds - mov r12, $keysched @ pass key schedule + mov sp, r12 @ sp is $keysched bl _bsaes_key_convert vldmia $keysched, {@XMM[6]} vstmia r12, {@XMM[15]} @ save last round key veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key vstmia $keysched, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: +#endif vld1.8 {@XMM[15]}, [$ivp] @ load IV b .Lcbc_dec_loop @@ -1035,7 +1191,11 @@ bsaes_cbc_encrypt: vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! +#ifndef BSAES_ASM_EXTENDED_KEY mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! mov r5, $rounds vld1.8 {@XMM[6]-@XMM[7]}, [$inp] @@ -1075,7 +1235,11 @@ bsaes_cbc_encrypt: cmp $len, #2 blo .Lcbc_dec_one vld1.8 {@XMM[1]}, [$inp]! 
+#ifndef BSAES_ASM_EXTENDED_KEY mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif mov r5, $rounds vstmia $fp, {@XMM[15]} @ put aside IV beq .Lcbc_dec_two @@ -1201,22 +1365,25 @@ bsaes_cbc_encrypt: vmov @XMM[4],@XMM[15] @ just in case ensure that IV vmov @XMM[5],@XMM[0] @ and input are preserved bl AES_decrypt - vld1.8 {@XMM[0]}, [$fp,:64] @ load result + vld1.8 {@XMM[0]}, [$fp] @ load result veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV vmov @XMM[15], @XMM[5] @ @XMM[5] holds input vst1.8 {@XMM[0]}, [$rounds] @ write output .Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY vmov.i32 q0, #0 vmov.i32 q1, #0 .Lcbc_dec_bzero: @ wipe key schedule [if any] vstmia $keysched!, {q0-q1} - teq $keysched, $fp + cmp $keysched, $fp bne .Lcbc_dec_bzero +#endif - add sp, $fp, #0x10 + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb vst1.8 {@XMM[15]}, [$ivp] @ return IV - vldmia sp!, {d8-d15} + VFP_ABI_POP ldmia sp!, {r4-r10, pc} .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt ___ @@ -1235,34 +1402,62 @@ bsaes_ctr32_encrypt_blocks: cmp $len, #8 @ use plain AES for blo .Lctr_enc_short @ small sizes + mov ip, sp stmdb sp!, {r4-r10, lr} - vstmdb sp!, {d8-d15} @ ABI specification says so - ldr $ctr, [sp, #0x60] @ ctr is 1st arg on the stack + VFP_ABI_PUSH + ldr $ctr, [ip] @ ctr is 1st arg on the stack sub sp, sp, #0x10 @ scratch space to carry over the ctr mov $fp, sp @ save sp - @ allocate the key schedule on the stack ldr $rounds, [$key, #240] @ get # of rounds - sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key - add sp, sp, #`128-32` @ size of bit-sliced key schedule +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ size of bit-sliced key schedule @ populate the key schedule mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds - mov r12, $keysched @ pass key schedule + mov sp, r12 @ sp is $keysched bl _bsaes_key_convert veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key vstmia r12, {@XMM[7]} @ save last round key vld1.8 {@XMM[0]}, [$ctr] @ load counter +#ifdef __APPLE__ + mov $ctr, #:lower16:(.LREVM0SR-.LM0) + add $ctr, $const, $ctr +#else add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr +#endif vldmia $keysched, {@XMM[4]} @ load round0 key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key - vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96 - vmov.i32 `&Dlo("@XMM[8]")`,#0 - vrev32.8 `&Dhi("@XMM[0]")`,`&Dhi("@XMM[0]")` - vshl.u64 `&Dhi("@XMM[8]")`,#32 - vrev32.8 `&Dhi("@XMM[4]")`,`&Dhi("@XMM[4]")` +.align 2 +0: add r12, $key, #248 + vld1.8 {@XMM[0]}, [$ctr] @ load counter + adrl $ctr, .LREVM0SR @ borrow $ctr + vldmia r12, {@XMM[4]} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 @XMM[8],#1 @ compose 1<<96 + veor @XMM[9],@XMM[9],@XMM[9] + vrev32.8 @XMM[0],@XMM[0] + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 + vrev32.8 @XMM[4],@XMM[4] vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 vstmia $keysched, {@XMM[4]} @ save adjusted round0 key b .Lctr_enc_loop @@ -1283,11 +1478,20 @@ bsaes_ctr32_encrypt_blocks: @ to flip byte order in 32-bit counter vldmia $keysched, {@XMM[9]} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY add r4, $keysched, #0x10 @ pass 
next round key +#else + add r4, $key, #`248+16` +#endif vldmia $ctr, {@XMM[8]} @ .LREVM0SR mov r5, $rounds @ pass rounds vstmia $fp, {@XMM[10]} @ save next counter +#ifdef __APPLE__ + mov $const, #:lower16:(.LREVM0SR-.LSR) + sub $const, $ctr, $const +#else sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants +#endif bl _bsaes_encrypt8_alt @@ -1309,11 +1513,11 @@ bsaes_ctr32_encrypt_blocks: vst1.8 {@XMM[4]}, [$out]! veor @XMM[5], @XMM[15] vst1.8 {@XMM[6]}, [$out]! - vmov.i32 `&Dhi("@XMM[8]")`,#1 @ compose 1<<96 + vmov.i32 @XMM[8], #1 @ compose 1<<96 vst1.8 {@XMM[3]}, [$out]! - vmov.i32 `&Dlo("@XMM[8]")`,#0 + veor @XMM[9], @XMM[9], @XMM[9] vst1.8 {@XMM[7]}, [$out]! - vshl.u64 `&Dhi("@XMM[8]")`,#32 + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 vst1.8 {@XMM[2]}, [$out]! vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 vst1.8 {@XMM[5]}, [$out]! @@ -1359,13 +1563,18 @@ bsaes_ctr32_encrypt_blocks: .Lctr_enc_done: vmov.i32 q0, #0 vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY .Lctr_enc_bzero: @ wipe key schedule [if any] vstmia $keysched!, {q0-q1} - teq $keysched, $fp + cmp $keysched, $fp bne .Lctr_enc_bzero +#else + vstmia $keysched, {q0-q1} +#endif - add sp, $fp, #0x10 - vldmia sp!, {d8-d15} + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return .align 4 @@ -1383,7 +1592,7 @@ bsaes_ctr32_encrypt_blocks: rev r8, r8 #endif sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + vst1.8 {@XMM[1]}, [sp] @ copy counter value sub sp, sp, #0x10 .Lctr_enc_short_loop: @@ -1394,7 +1603,7 @@ bsaes_ctr32_encrypt_blocks: bl AES_encrypt vld1.8 {@XMM[0]}, [r4]! @ load input - vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + vld1.8 {@XMM[1]}, [sp] @ load encrypted counter add r8, r8, #1 #ifdef __ARMEL__ rev r0, r8 @@ -1407,7 +1616,10 @@ bsaes_ctr32_encrypt_blocks: subs r6, r6, #1 bne .Lctr_enc_short_loop - add sp, sp, #0x20 + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vstmia sp!, {q0-q1} + ldmia sp!, {r4-r8, pc} .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ @@ -1428,41 +1640,66 @@ $code.=<<___; .type bsaes_xts_encrypt,%function .align 4 bsaes_xts_encrypt: + mov ip, sp stmdb sp!, {r4-r10, lr} @ 0x20 - vstmdb sp!, {d8-d15} @ 0x40 + VFP_ABI_PUSH mov r6, sp @ future $fp - sub sp, #0x10 @ 0x10 mov $inp, r0 mov $out, r1 mov $len, r2 mov $key, r3 - bic sp, #0xf @ align at 16 bytes + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else @ generate initial tweak - ldr r0, [r6, #0x64] @ iv[] + ldr r0, [ip, #4] @ iv[] mov r1, sp - ldr r2, [r6, #0x60] @ key2 + ldr r2, [ip, #0] @ key2 bl AES_encrypt + mov r0,sp @ pointer to initial tweak +#endif - @ allocate the key schedule on the stack ldr $rounds, [$key, #240] @ get # of rounds mov $fp, r6 - mov r0, sp @ pointer to initial tweak - sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key - @ add sp, sp, #`128-32` @ size of bit-sliced key schedule - sub sp, sp, #`32+16` @ place for tweak[9] +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] @ populate the key schedule mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds - add r12, sp, #0x90 @ pass key schedule + mov sp, r12 + add r12, #0x90 @ pass key schedule bl _bsaes_key_convert veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key vstmia r12, {@XMM[7]} @ 
save last round key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif vld1.8 {@XMM[8]}, [r0] @ initial tweak - add $magic, $const, #.Lxts_magic-.LM0 + adr $magic, .Lxts_magic subs $len, #0x80 blo .Lxts_enc_short @@ -1502,7 +1739,11 @@ $code.=<<___; vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[6], @XMM[6], @XMM[14] mov r5, $rounds @ pass rounds veor @XMM[7], @XMM[7], @XMM[15] @@ -1567,7 +1808,11 @@ $code.=<<___; vld1.8 {@XMM[6]}, [$inp]! veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[6], @XMM[6], @XMM[14] mov r5, $rounds @ pass rounds mov r0, sp @@ -1594,10 +1839,12 @@ $code.=<<___; b .Lxts_enc_done .align 4 .Lxts_enc_6: - vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak - veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[5], @XMM[5], @XMM[13] mov r5, $rounds @ pass rounds mov r0, sp @@ -1619,12 +1866,20 @@ $code.=<<___; vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak b .Lxts_enc_done -.align 4 -.Lxts_enc_5: - vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak +@ put this in range for both ARM and Thumb mode adr instructions +.align 5 +.Lxts_magic: + .quad 1, 0x87 + +.align 5 +.Lxts_enc_5: veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[4], @XMM[4], @XMM[12] mov r5, $rounds @ pass rounds mov r0, sp @@ -1647,10 +1902,12 @@ $code.=<<___; b .Lxts_enc_done .align 4 .Lxts_enc_4: - vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak - veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[3], @XMM[3], @XMM[11] mov r5, $rounds @ pass rounds mov r0, sp @@ -1670,10 +1927,12 @@ $code.=<<___; b .Lxts_enc_done .align 4 .Lxts_enc_3: - vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak - veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[2], @XMM[2], @XMM[10] mov r5, $rounds @ pass rounds mov r0, sp @@ -1692,10 +1951,12 @@ $code.=<<___; b .Lxts_enc_done .align 4 .Lxts_enc_2: - vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak - veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[1], @XMM[1], @XMM[9] mov r5, $rounds @ pass rounds mov r0, sp @@ -1712,7 +1973,7 @@ $code.=<<___; .align 4 .Lxts_enc_1: mov r0, sp - veor @XMM[0], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] mov r1, sp vst1.8 {@XMM[0]}, [sp,:128] mov r2, $key @@ -1728,6 +1989,7 @@ $code.=<<___; vmov @XMM[8], @XMM[9] @ next round tweak .Lxts_enc_done: +#ifndef XTS_CHAIN_TWEAK adds $len, #0x10 beq .Lxts_enc_ret sub r6, $out, #0x10 @@ -1755,18 +2017,25 @@ $code.=<<___; veor @XMM[0], @XMM[0], @XMM[8] vst1.8 {@XMM[0]}, [r6] 
mov $fp, r4 +#endif .Lxts_enc_ret: bic r0, $fp, #0xf vmov.i32 q0, #0 vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif .Lxts_enc_bzero: @ wipe key schedule [if any] vstmia sp!, {q0-q1} - teq sp, r0 + cmp sp, r0 bne .Lxts_enc_bzero mov sp, $fp - vldmia sp!, {d8-d15} +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -1775,47 +2044,77 @@ $code.=<<___; .type bsaes_xts_decrypt,%function .align 4 bsaes_xts_decrypt: + mov ip, sp stmdb sp!, {r4-r10, lr} @ 0x20 - vstmdb sp!, {d8-d15} @ 0x40 + VFP_ABI_PUSH mov r6, sp @ future $fp - sub sp, #0x10 @ 0x10 mov $inp, r0 mov $out, r1 mov $len, r2 mov $key, r3 - bic sp, #0xf @ align at 16 bytes + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else @ generate initial tweak - ldr r0, [r6, #0x64] @ iv[] + ldr r0, [ip, #4] @ iv[] mov r1, sp - ldr r2, [r6, #0x60] @ key2 + ldr r2, [ip, #0] @ key2 bl AES_encrypt + mov r0, sp @ pointer to initial tweak +#endif - @ allocate the key schedule on the stack ldr $rounds, [$key, #240] @ get # of rounds mov $fp, r6 - mov r0, sp @ pointer to initial tweak - sub sp, sp, $rounds, lsl#7 @ 128 bytes per inner round key - @ add sp, sp, #`128-32` @ size of bit-sliced key schedule - sub sp, sp, #`32+16` @ place for tweak[9] +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] @ populate the key schedule mov r4, $key @ pass key mov r5, $rounds @ pass # of rounds - add r12, sp, #0x90 @ pass key schedule + mov sp, r12 + add r12, #0x90 @ pass key schedule bl _bsaes_key_convert add r4, sp, #0x90 vldmia r4, {@XMM[6]} vstmia r12, {@XMM[15]} @ save last round key veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key vstmia r4, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif vld1.8 {@XMM[8]}, [r0] @ initial tweak - add $magic, $const, #.Lxts_magic-.LM0 + adr $magic, .Lxts_magic +#ifndef XTS_CHAIN_TWEAK tst $len, #0xf @ if not multiple of 16 + it ne @ Thumb2 thing, sanity check in ARM subne $len, #0x10 @ subtract another 16 bytes +#endif subs $len, #0x80 blo .Lxts_dec_short @@ -1855,7 +2154,11 @@ $code.=<<___; vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[6], @XMM[6], @XMM[14] mov r5, $rounds @ pass rounds veor @XMM[7], @XMM[7], @XMM[15] @@ -1920,7 +2223,11 @@ $code.=<<___; vld1.8 {@XMM[6]}, [$inp]! 
veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[6], @XMM[6], @XMM[14] mov r5, $rounds @ pass rounds mov r0, sp @@ -1950,7 +2257,11 @@ $code.=<<___; vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[5], @XMM[5], @XMM[13] mov r5, $rounds @ pass rounds mov r0, sp @@ -1974,10 +2285,12 @@ $code.=<<___; b .Lxts_dec_done .align 4 .Lxts_dec_5: - vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak - veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[4], @XMM[4], @XMM[12] mov r5, $rounds @ pass rounds mov r0, sp @@ -2000,10 +2313,12 @@ $code.=<<___; b .Lxts_dec_done .align 4 .Lxts_dec_4: - vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak - veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[3], @XMM[3], @XMM[11] mov r5, $rounds @ pass rounds mov r0, sp @@ -2023,10 +2338,12 @@ $code.=<<___; b .Lxts_dec_done .align 4 .Lxts_dec_3: - vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak - veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[2], @XMM[2], @XMM[10] mov r5, $rounds @ pass rounds mov r0, sp @@ -2045,10 +2362,12 @@ $code.=<<___; b .Lxts_dec_done .align 4 .Lxts_dec_2: - vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak - veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif veor @XMM[1], @XMM[1], @XMM[9] mov r5, $rounds @ pass rounds mov r0, sp @@ -2065,12 +2384,12 @@ $code.=<<___; .align 4 .Lxts_dec_1: mov r0, sp - veor @XMM[0], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] mov r1, sp vst1.8 {@XMM[0]}, [sp,:128] + mov r5, $magic @ preserve magic mov r2, $key mov r4, $fp @ preserve fp - mov r5, $magic @ preserve magic bl AES_decrypt @@ -2083,6 +2402,7 @@ $code.=<<___; vmov @XMM[8], @XMM[9] @ next round tweak .Lxts_dec_done: +#ifndef XTS_CHAIN_TWEAK adds $len, #0x10 beq .Lxts_dec_ret @@ -2132,18 +2452,25 @@ $code.=<<___; veor @XMM[0], @XMM[0], @XMM[8] vst1.8 {@XMM[0]}, [r6] mov $fp, r4 +#endif .Lxts_dec_ret: bic r0, $fp, #0xf vmov.i32 q0, #0 vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif .Lxts_dec_bzero: @ wipe key schedule [if any] vstmia sp!, {q0-q1} - teq sp, r0 + cmp sp, r0 bne .Lxts_dec_bzero mov sp, $fp - vldmia sp!, {d8-d15} +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return .size bsaes_xts_decrypt,.-bsaes_xts_decrypt @@ -2155,6 +2482,14 @@ ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT;
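
A side note, not part of the patch: the rewritten InvMixColumns above leans entirely on the circulant-matrix factorization credited to Jussi Kivilinna, i.e. that the 0e-0b-0d-09 inverse-MixColumns matrix equals the regular 02-03-01-01 MixColumns matrix times a 05-00-04-00 circulant over GF(2^8). The standalone Perl sketch below checks that identity using the AES reduction polynomial 0x11b; the gmul and circ helpers are illustrative only and do not appear in bsaes-armv7.pl.

#!/usr/bin/env perl
# Standalone check of the identity behind the new InvMixColumns:
#   | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
#   | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |   (over GF(2^8))
#   | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
#   | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
use strict;
use warnings;

sub gmul {				# GF(2^8) multiply, AES polynomial 0x11b
    my ($a, $b) = @_;
    my $p = 0;
    while ($b) {
	$p ^= $a if ($b & 1);
	my $carry = $a & 0x80;
	$a = ($a << 1) & 0xff;
	$a ^= 0x1b if ($carry);
	$b >>= 1;
    }
    return $p;
}

# first rows of the three circulant matrices; row $i of each matrix is its
# first row rotated right by $i positions
my @inv = (0x0e, 0x0b, 0x0d, 0x09);
my @mix = (0x02, 0x03, 0x01, 0x01);
my @aux = (0x05, 0x00, 0x04, 0x00);
sub circ { my ($row, $i, $j) = @_; $row->[($j - $i) % 4]; }

for my $i (0 .. 3) {
    for my $j (0 .. 3) {
	my $acc = 0;
	# (mix x aux)[i][j] = xor over k of mix[i][k] * aux[k][j]
	$acc ^= gmul(circ(\@mix, $i, $_), circ(\@aux, $_, $j)) for (0 .. 3);
	die "mismatch at ($i,$j)\n" if ($acc != circ(\@inv, $i, $j));
    }
}
print "InvMixColumns == MixColumns x (05,00,04,00) circulant: OK\n";

Because the identity holds entry by entry, the patch can implement InvMixColumns as the cheap 0x05/0x00/0x04/0x00 pre-multiplication followed by the existing MixColumns routine (invoked with the extra $inv argument and the "flipped 2<->3 and 4<->6" register order noted in the comment), which is where the improved decrypt cycle counts quoted in the header come from.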