#! /usr/bin/env perl
-# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# Nehalem 29.6/40.3/14.6 10.0/11.8
# Atom 57.3/74.2/32.1 60.9/77.2(***)
# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
+# Goldmont 38.9/49.0/17.8 10.6/12.6
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
.type _vpaes_encrypt_core,\@abi-omnipotent
.align 16
_vpaes_encrypt_core:
+.cfi_startproc
mov %rdx, %r9
mov \$16, %r11
mov 240(%rdx),%eax
pxor %xmm4, %xmm0 # 0 = A
pshufb %xmm1, %xmm0
ret
+.cfi_endproc
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
-
+
##
## Decryption core
##
.type _vpaes_decrypt_core,\@abi-omnipotent
.align 16
_vpaes_decrypt_core:
+.cfi_startproc
mov %rdx, %r9 # load key
mov 240(%rdx),%eax
movdqa %xmm9, %xmm1
pxor %xmm4, %xmm0 # 0 = A
pshufb %xmm2, %xmm0
ret
+.cfi_endproc
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
########################################################
.type _vpaes_schedule_core,\@abi-omnipotent
.align 16
_vpaes_schedule_core:
+.cfi_startproc
# rdi = key
# rsi = size in bits
# rdx = buffer
##
.Lschedule_128:
mov \$10, %esi
-
+
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
.Loop_schedule_192:
call _vpaes_schedule_round
- palignr \$8,%xmm6,%xmm0
+ palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
-
+
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
-
+
jmp .Loop_schedule_256
-
+
##
## .aes_schedule_mangle_last
##
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
ret
+.cfi_endproc
.size _vpaes_schedule_core,.-_vpaes_schedule_core
##
.type _vpaes_schedule_192_smear,\@abi-omnipotent
.align 16
_vpaes_schedule_192_smear:
+.cfi_startproc
pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
pxor %xmm1, %xmm6 # -> c+d c 0 0
movdqa %xmm6, %xmm0
movhlps %xmm1, %xmm6 # clobber low side with zeros
ret
+.cfi_endproc
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
.type _vpaes_schedule_round,\@abi-omnipotent
.align 16
_vpaes_schedule_round:
+.cfi_startproc
# extract rcon from xmm8
pxor %xmm1, %xmm1
palignr \$15, %xmm8, %xmm1
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
-
+
# fall through...
-
+
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
- pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
+.cfi_endproc
.size _vpaes_schedule_round,.-_vpaes_schedule_round
##
.type _vpaes_schedule_transform,\@abi-omnipotent
.align 16
_vpaes_schedule_transform:
+.cfi_startproc
movdqa %xmm9, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pshufb %xmm1, %xmm0
pxor %xmm2, %xmm0
ret
+.cfi_endproc
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
##
.type _vpaes_schedule_mangle,\@abi-omnipotent
.align 16
_vpaes_schedule_mangle:
+.cfi_startproc
movdqa %xmm0, %xmm4 # save xmm0 for later
movdqa .Lk_mc_forward(%rip),%xmm5
test %rcx, %rcx
and \$0x30, %r8
movdqu %xmm3, (%rdx)
ret
+.cfi_endproc
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
#
.type ${PREFIX}_set_encrypt_key,\@function,3
.align 16
${PREFIX}_set_encrypt_key:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea -0xb8(%rsp),%rsp
$code.=<<___;
xor %eax,%eax
ret
+.cfi_endproc
.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.globl ${PREFIX}_set_decrypt_key
.type ${PREFIX}_set_decrypt_key,\@function,3
.align 16
${PREFIX}_set_decrypt_key:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea -0xb8(%rsp),%rsp
$code.=<<___;
xor %eax,%eax
ret
+.cfi_endproc
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
.globl ${PREFIX}_encrypt
.type ${PREFIX}_encrypt,\@function,3
.align 16
${PREFIX}_encrypt:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea -0xb8(%rsp),%rsp
___
$code.=<<___;
ret
+.cfi_endproc
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
.globl ${PREFIX}_decrypt
.type ${PREFIX}_decrypt,\@function,3
.align 16
${PREFIX}_decrypt:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea -0xb8(%rsp),%rsp
___
$code.=<<___;
ret
+.cfi_endproc
.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
___
{
.type ${PREFIX}_cbc_encrypt,\@function,6
.align 16
${PREFIX}_cbc_encrypt:
+.cfi_startproc
xchg $key,$len
___
($len,$key)=($key,$len);
$code.=<<___;
.Lcbc_abort:
ret
+.cfi_endproc
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
.type _vpaes_preheat,\@abi-omnipotent
.align 16
_vpaes_preheat:
+.cfi_startproc
lea .Lk_s0F(%rip), %r10
movdqa -0x20(%r10), %xmm10 # .Lk_inv
movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
movdqa 0x50(%r10), %xmm15 # .Lk_sb2
movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
ret
+.cfi_endproc
.size _vpaes_preheat,.-_vpaes_preheat
########################################################
## ##