X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Fbsaes-x86_64.pl;h=2c79c2b67c897d60eca6ea0a5d28af75e4ca3f84;hp=67799cebc09fec5e93d5c23e64b689035dcc28b8;hb=b84460ad3a3e4fcb22efaa0a8365b826f4264ecf;hpb=a75a52a43e32e3ab3b7640872a2d65cfa6c22426 diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl index 67799cebc0..2c79c2b67c 100644 --- a/crypto/aes/asm/bsaes-x86_64.pl +++ b/crypto/aes/asm/bsaes-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + ################################################################### ### AES-128 [originally in CTR mode] ### @@ -38,8 +45,10 @@ # Emilia's this(*) difference # # Core 2 9.30 8.69 +7% -# Nehalem(**) 7.63 6.98 +9% -# Atom 17.1 17.4 -2%(***) +# Nehalem(**) 7.63 6.88 +11% +# Atom 17.1 16.4 +4% +# Silvermont - 12.9 +# Goldmont - 8.85 # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation @@ -50,14 +59,6 @@ # (**) Results were collected on Westmere, which is considered to # be equivalent to Nehalem for this code. # -# (***) Slowdown on Atom is rather strange per se, because original -# implementation has a number of 9+-bytes instructions, which -# are bad for Atom front-end, and which I eliminated completely. -# In attempt to address deterioration sbox() was tested in FP -# SIMD "domain" (movaps instead of movdqa, xorps instead of -# pxor, etc.). While it resulted in nominal 4% improvement on -# Atom, it hurted Westmere by more than 2x factor. -# # As for key schedule conversion subroutine. Interface to OpenSSL # relies on per-invocation on-the-fly conversion. This naturally # has impact on performance, especially for short inputs. Conversion @@ -65,12 +66,12 @@ # function is: # # conversion conversion/8x block -# Core 2 410 0.37 -# Nehalem 310 0.35 -# Atom 570 0.26 +# Core 2 240 0.22 +# Nehalem 180 0.20 +# Atom 430 0.20 # # The ratio values mean that 128-byte blocks will be processed -# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%, +# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, # etc. Then keep in mind that input sizes not divisible by 128 are # *effectively* slower, especially shortest ones, e.g. consecutive # 144-byte blocks are processed 44% slower than one would expect, @@ -83,8 +84,16 @@ # Add decryption procedure. Performance in CPU cycles spent to decrypt # one byte out of 4096-byte buffer with 128-bit key is: # -# Core 2 11.0 -# Nehalem 9.16 +# Core 2 9.98 +# Nehalem 7.80 +# Atom 17.9 +# Silvermont 14.0 +# Goldmont 10.2 +# +# November 2011. +# +# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is +# suboptimal, but XTS is meant to be used with larger blocks... # # @@ -99,10 +108,12 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open STDOUT,"| $^X $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) +my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... { my ($key,$rounds,$const)=("%rax","%r10d","%r11"); @@ -120,7 +131,7 @@ my @s=@_[12..15]; sub InBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb my @b=@_[0..7]; $code.=<<___; pxor @b[6], @b[5] @@ -370,7 +381,7 @@ $code.=<<___; pxor @s[0], @t[3] pxor @s[1], @t[2] pxor @s[2], @t[1] - pxor @s[3], @t[0] + pxor @s[3], @t[0] #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 @@ -426,21 +437,21 @@ my $mask=pop; $code.=<<___; pxor 0x00($key),@x[0] pxor 0x10($key),@x[1] - pshufb $mask,@x[0] pxor 0x20($key),@x[2] - pshufb $mask,@x[1] pxor 0x30($key),@x[3] - pshufb $mask,@x[2] + pshufb $mask,@x[0] + pshufb $mask,@x[1] pxor 0x40($key),@x[4] - pshufb $mask,@x[3] pxor 0x50($key),@x[5] - pshufb $mask,@x[4] + pshufb $mask,@x[2] + pshufb $mask,@x[3] pxor 0x60($key),@x[6] - pshufb $mask,@x[5] pxor 0x70($key),@x[7] + pshufb $mask,@x[4] + pshufb $mask,@x[5] pshufb $mask,@x[6] - lea 0x80($key),$key pshufb $mask,@x[7] + lea 0x80($key),$key ___ } @@ -448,6 +459,7 @@ sub MixColumns { # modified to emit output in order suitable for feeding back to aesenc[last] my @x=@_[0..7]; my @t=@_[8..15]; +my $inv=@_[16]; # optional $code.=<<___; pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 pshufd \$0x93, @x[1], @t[1] @@ -489,7 +501,8 @@ $code.=<<___; pxor @t[4], @t[0] pshufd \$0x4E, @x[2], @x[6] pxor @t[5], @t[1] - +___ +$code.=<<___ if (!$inv); pxor @t[3], @x[4] pxor @t[7], @x[5] pxor @t[6], @x[3] @@ -497,9 +510,20 @@ $code.=<<___; pxor @t[2], @x[6] movdqa @t[1], @x[7] ___ +$code.=<<___ if ($inv); + pxor @x[4], @t[3] + pxor @t[7], @x[5] + pxor @x[3], @t[6] + movdqa @t[0], @x[3] + pxor @t[2], @x[6] + movdqa @t[6], @x[2] + movdqa @t[1], @x[7] + movdqa @x[6], @x[4] + movdqa @t[3], @x[6] +___ } -sub InvMixColumns { +sub InvMixColumns_orig { my @x=@_[0..7]; my @t=@_[8..15]; @@ -653,6 +677,54 @@ $code.=<<___; ___ } +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + # multiplication by 0x05-0x00-0x04-0x00 + pshufd \$0x4E, @x[0], @t[0] + pshufd \$0x4E, @x[6], @t[6] + pxor @x[0], @t[0] + pshufd \$0x4E, @x[7], @t[7] + pxor @x[6], @t[6] + pshufd \$0x4E, @x[1], @t[1] + pxor @x[7], @t[7] + pshufd \$0x4E, @x[2], @t[2] + pxor @x[1], @t[1] + pshufd \$0x4E, @x[3], @t[3] + pxor @x[2], @t[2] + pxor @t[6], @x[0] + pxor @t[6], @x[1] + pshufd \$0x4E, @x[4], @t[4] + pxor @x[3], @t[3] + pxor @t[0], @x[2] + pxor @t[1], @x[3] + pshufd \$0x4E, @x[5], @t[5] + pxor @x[4], @t[4] + pxor @t[7], @x[1] + pxor @t[2], @x[4] + pxor @x[5], @t[5] + + pxor @t[7], @x[2] + pxor @t[6], @x[3] + pxor @t[6], @x[4] + pxor @t[3], @x[5] + pxor @t[4], @x[6] + pxor @t[7], @x[4] + pxor @t[7], @x[5] + pxor @t[5], @x[7] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + sub aesenc { # not used my @b=@_[0..7]; my @t=@_[8..15]; @@ -738,8 +810,8 @@ ___ $code.=<<___; .text -.extern AES_encrypt -.extern AES_decrypt +.extern asm_AES_encrypt +.extern asm_AES_decrypt .type _bsaes_encrypt8,\@abi-omnipotent .align 64 @@ -748,21 +820,21 @@ _bsaes_encrypt8: movdqa ($key), @XMM[9] # round 0 key lea 0x10($key), $key - movdqa 0x60($const), @XMM[8] # .LM0SR + movdqa 0x50($const), @XMM[8] # .LM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] _bsaes_encrypt8_bitslice: @@ -815,18 +887,18 @@ _bsaes_decrypt8: movdqa -0x30($const), @XMM[8] # .LM0ISR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] pshufb @XMM[8], @XMM[7] ___ @@ -899,53 +971,89 @@ $code.=<<___; .type _bsaes_key_convert,\@abi-omnipotent .align 16 _bsaes_key_convert: - lea .LBS1(%rip), $const + lea .Lmasks(%rip), $const movdqu ($inp), %xmm7 # load round 0 key - movdqa -0x10($const), %xmm8 # .LBS0 - movdqa 0x00($const), %xmm9 # .LBS1 - movdqa 0x10($const), %xmm10 # .LBS2 - movdqa 0x40($const), %xmm13 # .LM0 - movdqa 0x60($const), %xmm14 # .LNOT - - movdqu 0x10($inp), %xmm6 # load round 1 key lea 0x10($inp), $inp + movdqa 0x00($const), %xmm0 # 0x01... + movdqa 0x10($const), %xmm1 # 0x02... + movdqa 0x20($const), %xmm2 # 0x04... + movdqa 0x30($const), %xmm3 # 0x08... + movdqa 0x40($const), %xmm4 # .LM0 + pcmpeqd %xmm5, %xmm5 # .LNOT + + movdqu ($inp), %xmm6 # load round 1 key movdqa %xmm7, ($out) # save round 0 key lea 0x10($out), $out dec $rounds jmp .Lkey_loop .align 16 .Lkey_loop: - pshufb %xmm13, %xmm6 # .LM0 - movdqa %xmm6, %xmm7 -___ - &bitslice_key (map("%xmm$_",(0..7, 8..12))); -$code.=<<___; - pxor %xmm14, %xmm5 # "pnot" - pxor %xmm14, %xmm6 - pxor %xmm14, %xmm0 - pxor %xmm14, %xmm1 - lea 0x10($inp), $inp - movdqa %xmm0, 0x00($out) # write bit-sliced round key - movdqa %xmm1, 0x10($out) - movdqa %xmm2, 0x20($out) - movdqa %xmm3, 0x30($out) - movdqa %xmm4, 0x40($out) - movdqa %xmm5, 0x50($out) - movdqa %xmm6, 0x60($out) - movdqa %xmm7, 0x70($out) + pshufb %xmm4, %xmm6 # .LM0 + + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm9 + + pand %xmm6, %xmm8 + pand %xmm6, %xmm9 + movdqa %xmm2, %xmm10 + pcmpeqb %xmm0, %xmm8 + psllq \$4, %xmm0 # 0x10... + movdqa %xmm3, %xmm11 + pcmpeqb %xmm1, %xmm9 + psllq \$4, %xmm1 # 0x20... + + pand %xmm6, %xmm10 + pand %xmm6, %xmm11 + movdqa %xmm0, %xmm12 + pcmpeqb %xmm2, %xmm10 + psllq \$4, %xmm2 # 0x40... + movdqa %xmm1, %xmm13 + pcmpeqb %xmm3, %xmm11 + psllq \$4, %xmm3 # 0x80... + + movdqa %xmm2, %xmm14 + movdqa %xmm3, %xmm15 + pxor %xmm5, %xmm8 # "pnot" + pxor %xmm5, %xmm9 + + pand %xmm6, %xmm12 + pand %xmm6, %xmm13 + movdqa %xmm8, 0x00($out) # write bit-sliced round key + pcmpeqb %xmm0, %xmm12 + psrlq \$4, %xmm0 # 0x01... + movdqa %xmm9, 0x10($out) + pcmpeqb %xmm1, %xmm13 + psrlq \$4, %xmm1 # 0x02... + lea 0x10($inp), $inp + + pand %xmm6, %xmm14 + pand %xmm6, %xmm15 + movdqa %xmm10, 0x20($out) + pcmpeqb %xmm2, %xmm14 + psrlq \$4, %xmm2 # 0x04... + movdqa %xmm11, 0x30($out) + pcmpeqb %xmm3, %xmm15 + psrlq \$4, %xmm3 # 0x08... + movdqu ($inp), %xmm6 # load next round key + + pxor %xmm5, %xmm13 # "pnot" + pxor %xmm5, %xmm14 + movdqa %xmm12, 0x40($out) + movdqa %xmm13, 0x50($out) + movdqa %xmm14, 0x60($out) + movdqa %xmm15, 0x70($out) lea 0x80($out),$out - movdqu ($inp), %xmm6 # load next round key dec $rounds jnz .Lkey_loop - movdqa 0x70($const), %xmm7 # .L63 + movdqa 0x50($const), %xmm7 # .L63 #movdqa %xmm6, ($out) # don't save last round key ret .size _bsaes_key_convert,.-_bsaes_key_convert ___ } -if (1 && !$win64) { # following four functions are unsupported interface +if (0 && !$win64) { # following four functions are unsupported interface # used for benchmarking... $code.=<<___; .globl bsaes_enc_key_convert @@ -1051,19 +1159,29 @@ my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); -if (0) { # suppress unreferenced ECB subroutines, spare some space... +if ($ecb) { $code.=<<___; .globl bsaes_ecb_encrypt_blocks .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent .align 16 bsaes_ecb_encrypt_blocks: +.cfi_startproc + mov %rsp, %rax +.Lecb_enc_prologue: push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 lea -0x48(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 ___ $code.=<<___ if ($win64); lea -0xa0(%rsp), %rsp @@ -1081,6 +1199,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rsp,%rbp # backup %rsp +.cfi_def_cfa_register %rbp mov 240($arg4),%eax # rounds mov $arg1,$inp # backup arguments mov $arg2,$out @@ -1208,7 +1327,7 @@ $code.=<<___; lea ($inp), $arg1 lea ($out), $arg2 lea ($key), $arg3 - call AES_encrypt + call asm_AES_encrypt lea 16($inp), $inp lea 16($out), $out dec $len @@ -1224,7 +1343,8 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1237,31 +1357,50 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp - lea 0x78(%rsp), %rsp + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lecb_enc_epilogue: ret +.cfi_endproc .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks .globl bsaes_ecb_decrypt_blocks .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent .align 16 bsaes_ecb_decrypt_blocks: +.cfi_startproc + mov %rsp, %rax +.Lecb_dec_prologue: push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 lea -0x48(%rsp),%rsp +.cfi_adjust_cfa_offset 0x48 ___ $code.=<<___ if ($win64); lea -0xa0(%rsp), %rsp @@ -1279,6 +1418,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rsp,%rbp # backup %rsp +.cfi_def_cfa_register %rbp mov 240($arg4),%eax # rounds mov $arg1,$inp # backup arguments mov $arg2,$out @@ -1407,7 +1547,7 @@ $code.=<<___; lea ($inp), $arg1 lea ($out), $arg2 lea ($key), $arg3 - call AES_decrypt + call asm_AES_decrypt lea 16($inp), $inp lea 16($out), $out dec $len @@ -1423,7 +1563,8 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1436,44 +1577,63 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp - lea 0x78(%rsp), %rsp + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lecb_dec_epilogue: ret +.cfi_endproc .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks ___ } $code.=<<___; -.extern AES_cbc_encrypt +.extern asm_AES_cbc_encrypt .globl bsaes_cbc_encrypt .type bsaes_cbc_encrypt,\@abi-omnipotent .align 16 bsaes_cbc_encrypt: +.cfi_startproc ___ $code.=<<___ if ($win64); mov 48(%rsp),$arg6 # pull direction flag ___ $code.=<<___; cmp \$0,$arg6 - jne AES_cbc_encrypt + jne asm_AES_cbc_encrypt cmp \$128,$arg3 - jb AES_cbc_encrypt + jb asm_AES_cbc_encrypt + mov %rsp, %rax +.Lcbc_dec_prologue: push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 lea -0x48(%rsp), %rsp +.cfi_adjust_cfa_offset 0x48 ___ $code.=<<___ if ($win64); mov 0xa0(%rsp),$arg5 # pull ivp @@ -1492,28 +1652,29 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rsp, %rbp # backup %rsp +.cfi_def_cfa_register %rbp mov 240($arg4), %eax # rounds mov $arg1, $inp # backup arguments mov $arg2, $out mov $arg3, $len mov $arg4, $key - mov $arg5, %rdx + mov $arg5, %rbx shr \$4, $len # bytes to blocks - mov %eax, %ebx # rounds + mov %eax, %edx # rounds shl \$7, %rax # 128 bytes per inner round key sub \$`128-32`, %rax # size of bit-sliced key schedule sub %rax, %rsp mov %rsp, %rax # pass key schedule mov $key, %rcx # pass key - mov %ebx, %r10d # pass rounds + mov %edx, %r10d # pass rounds call _bsaes_key_convert pxor (%rsp),%xmm7 # fix up 0 round key movdqa %xmm6,(%rax) # save last round key movdqa %xmm7,(%rsp) - movdqu (%rdx), @XMM[15] # load IV + movdqu (%rbx), @XMM[15] # load IV sub \$8,$len .Lcbc_dec_loop: movdqu 0x00($inp), @XMM[0] # load input @@ -1524,7 +1685,7 @@ $code.=<<___; movdqu 0x50($inp), @XMM[5] mov %rsp, %rax # pass key schedule movdqu 0x60($inp), @XMM[6] - mov %ebx,%r10d # pass rounds + mov %edx,%r10d # pass rounds movdqu 0x70($inp), @XMM[7] movdqa @XMM[15], 0x20(%rbp) # put aside IV @@ -1564,7 +1725,7 @@ $code.=<<___; movdqu 0x00($inp), @XMM[0] # load input mov %rsp, %rax # pass key schedule - mov %ebx, %r10d # pass rounds + mov %edx, %r10d # pass rounds cmp \$2,$len jb .Lcbc_dec_one movdqu 0x10($inp), @XMM[1] @@ -1691,14 +1852,16 @@ $code.=<<___; jmp .Lcbc_dec_done .align 16 .Lcbc_dec_one: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[15] # IV - movdqu @XMM[0], 0x00($out) # write output + lea ($inp), $arg1 + lea 0x20(%rbp), $arg2 # buffer output + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[15] # ^= IV + movdqu @XMM[15], ($out) # write output + movdqa @XMM[0], @XMM[15] # IV .Lcbc_dec_done: - movdqu @XMM[15], (%rdx) # return IV + movdqu @XMM[15], (%rbx) # return IV lea (%rsp), %rax pxor %xmm0, %xmm0 .Lcbc_dec_bzero: # wipe key schedule [if any] @@ -1708,7 +1871,8 @@ $code.=<<___; cmp %rax, %rbp ja .Lcbc_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1721,31 +1885,50 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lcbc_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp - lea 0x78(%rsp), %rsp + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lcbc_dec_epilogue: ret +.cfi_endproc .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt .globl bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent .align 16 bsaes_ctr32_encrypt_blocks: +.cfi_startproc + mov %rsp, %rax +.Lctr_enc_prologue: push %rbp +.cfi_push %rbp push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 lea -0x48(%rsp), %rsp +.cfi_adjust_cfa_offset 0x48 ___ $code.=<<___ if ($win64); mov 0xa0(%rsp),$arg5 # pull ivp @@ -1764,6 +1947,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rsp, %rbp # backup %rsp +.cfi_def_cfa_register %rbp movdqu ($arg5), %xmm0 # load counter mov 240($arg4), %eax # rounds mov $arg1, $inp # backup arguments @@ -1819,21 +2003,21 @@ $code.=<<___; movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR pxor @XMM[9], @XMM[0] # xor with round0 key pxor @XMM[9], @XMM[1] - pshufb @XMM[8], @XMM[0] pxor @XMM[9], @XMM[2] - pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[0] + pshufb @XMM[8], @XMM[1] pxor @XMM[9], @XMM[4] - pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[2] + pshufb @XMM[8], @XMM[3] pxor @XMM[9], @XMM[6] - pshufb @XMM[8], @XMM[5] pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[4] + pshufb @XMM[8], @XMM[5] pshufb @XMM[8], @XMM[6] - lea .LBS0(%rip), %r11 # constants table pshufb @XMM[8], @XMM[7] + lea .LBS0(%rip), %r11 # constants table mov %ebx,%r10d # pass rounds call _bsaes_encrypt8_bitslice @@ -1875,6 +2059,7 @@ $code.=<<___; jmp .Lctr_enc_done .align 16 .Lctr_enc_loop_done: + add \$8, $len movdqu 0x00($inp), @XMM[8] # load input pxor @XMM[8], @XMM[0] movdqu @XMM[0], 0x00($out) # write output @@ -1912,7 +2097,7 @@ $code.=<<___; lea 0x20(%rbp), $arg1 lea 0x30(%rbp), $arg2 lea ($key), $arg3 - call AES_encrypt + call asm_AES_encrypt movdqu ($inp), @XMM[1] lea 16($inp), $inp mov 0x2c(%rbp), %eax # load 32-bit counter @@ -1936,7 +2121,8 @@ $code.=<<___; cmp %rax, %rbp ja .Lctr_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1949,20 +2135,861 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lctr_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp - lea 0x78(%rsp), %rsp + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lctr_enc_epilogue: ret +.cfi_endproc .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ +###################################################################### +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +my ($twmask,$twres,$twtmp)=@XMM[13..15]; +$arg6=~s/d$//; + +$code.=<<___; +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,\@abi-omnipotent +.align 16 +bsaes_xts_encrypt: +.cfi_startproc + mov %rsp, %rax +.Lxts_enc_prologue: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -0x48(%rsp), %rsp +.cfi_adjust_cfa_offset 0x48 +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp +.cfi_def_cfa_register %rbp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6, %xmm7 # fix up last round key + movdqa %xmm7, (%rax) # save last round key + + and \$-16, $len + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_enc_short + jmp .Lxts_enc_loop + +.align 16 +.Lxts_enc_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_enc_loop + +.Lxts_enc_short: + add \$0x80, $len + jz .Lxts_enc_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_enc_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), $inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + lea 0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_encrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_enc_done: + and \$15, %ebx + jz .Lxts_enc_ret + mov $out, %rdx + +.Lxts_enc_steal: + movzb ($inp), %eax + movzb -16(%rdx), %ecx + lea 1($inp), $inp + mov %al, -16(%rdx) + mov %cl, 0(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_enc_steal + + movdqu -16($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor @XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + movdqu @XMM[7], -16($out) + +.Lxts_enc_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_enc_bzero + + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rax), %rax +.Lxts_enc_tail: +___ +$code.=<<___; + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp +.Lxts_enc_epilogue: + ret +.cfi_endproc +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,\@abi-omnipotent +.align 16 +bsaes_xts_decrypt: +.cfi_startproc + mov %rsp, %rax +.Lxts_dec_prologue: + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -0x48(%rsp), %rsp +.cfi_adjust_cfa_offset 0x48 +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_dec_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp), %xmm7 # fix up round 0 key + movdqa %xmm6, (%rax) # save last round key + movdqa %xmm7, (%rsp) + + xor %eax, %eax # if ($len%16) len-=16; + and \$-16, $len + test \$15, %ebx + setnz %al + shl \$4, %rax + sub %rax, $len + + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_dec_short + jmp .Lxts_dec_loop + +.align 16 +.Lxts_dec_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_dec_loop + +.Lxts_dec_short: + add \$0x80, $len + jz .Lxts_dec_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_dec_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), $inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + lea 0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_decrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_dec_done: + and \$15, %ebx + jz .Lxts_dec_ret + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + movdqa @XMM[7], @XMM[6] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + movdqu ($inp), @XMM[0] + pxor $twres, @XMM[7] + + lea 0x20(%rbp), $arg1 + pxor @XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + mov $out, %rdx + movdqu @XMM[7], ($out) + +.Lxts_dec_steal: + movzb 16($inp), %eax + movzb (%rdx), %ecx + lea 1($inp), $inp + mov %al, (%rdx) + mov %cl, 16(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_dec_steal + + movdqu ($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor @XMM[6], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[6] + movdqu @XMM[6], ($out) + +.Lxts_dec_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_dec_bzero + + lea 0x78(%rbp),%rax +.cfi_def_cfa %rax,8 +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rax), %rax +.Lxts_dec_tail: +___ +$code.=<<___; + mov -48(%rax), %r15 +.cfi_restore %r15 + mov -40(%rax), %r14 +.cfi_restore %r14 + mov -32(%rax), %r13 +.cfi_restore %r13 + mov -24(%rax), %r12 +.cfi_restore %r12 + mov -16(%rax), %rbx +.cfi_restore %rbx + mov -8(%rax), %rbp +.cfi_restore %rbp + lea (%rax), %rsp # restore %rsp +.cfi_def_cfa_register %rsp +.Lxts_dec_epilogue: + ret +.cfi_endproc +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +___ } $code.=<<___; .type _bsaes_const,\@object @@ -1984,14 +3011,8 @@ _bsaes_const: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b .LSRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0: - .quad 0x02060a0e03070b0f, 0x0004080c0105090d .LM0SR: .quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LNOT: # magic constants - .quad 0xffffffffffffffff, 0xffffffffffffffff -.L63: - .quad 0x6363636363636363, 0x6363636363636363 .LSWPUP: # byte-swap upper dword .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 .LSWPUPM0SR: @@ -2012,11 +3033,199 @@ _bsaes_const: .quad 0x0000000000000000, 0x0000000700000000 .LADD8: .quad 0x0000000000000000, 0x0000000800000000 +.Lxts_magic: + .long 0x87,0,1,0 +.Lmasks: + .quad 0x0101010101010101, 0x0101010101010101 + .quad 0x0202020202020202, 0x0202020202020202 + .quad 0x0404040404040404, 0x0404040404040404 + .quad 0x0808080808080808, 0x0808080808080808 +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.L63: + .quad 0x6363636363636363, 0x6363636363636363 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" .align 64 .size _bsaes_const,.-_bsaes_const ___ +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<=prologue label + jbe .Lin_prologue + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=tail label + jae .Lin_tail + + mov 160($context),%rax # pull context->Rbp + + lea 0x40(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0+0x78(%rax),%rax # adjust stack pointer + +.Lin_tail: + mov -48(%rax),%rbp + mov -40(%rax),%rbx + mov -32(%rax),%r12 + mov -24(%rax),%r13 + mov -16(%rax),%r14 + mov -8(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($ecb); + .rva .Lecb_enc_prologue + .rva .Lecb_enc_epilogue + .rva .Lecb_enc_info + + .rva .Lecb_dec_prologue + .rva .Lecb_dec_epilogue + .rva .Lecb_dec_info +___ +$code.=<<___; + .rva .Lcbc_dec_prologue + .rva .Lcbc_dec_epilogue + .rva .Lcbc_dec_info + + .rva .Lctr_enc_prologue + .rva .Lctr_enc_epilogue + .rva .Lctr_enc_info + + .rva .Lxts_enc_prologue + .rva .Lxts_enc_epilogue + .rva .Lxts_enc_info + + .rva .Lxts_dec_prologue + .rva .Lxts_dec_epilogue + .rva .Lxts_dec_info + +.section .xdata +.align 8 +___ +$code.=<<___ if ($ecb); +.Lecb_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] + .rva .Lecb_enc_tail + .long 0 +.Lecb_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] + .rva .Lecb_dec_tail + .long 0 +___ +$code.=<<___; +.Lcbc_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] + .rva .Lcbc_dec_tail + .long 0 +.Lctr_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] + .rva .Lctr_enc_tail + .long 0 +.Lxts_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] + .rva .Lxts_enc_tail + .long 0 +.Lxts_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] + .rva .Lxts_dec_tail + .long 0 +___ +} + $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code;