-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
######################################################################
## Constant-time SSSE3 AES core implementation.
#
# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
# Nehalem 29.6/40.3/14.6 10.0/11.8
-# Atom 57.3/74.2/32.1 60.9/82.3(***)
+# Atom 57.3/74.2/32.1 60.9/77.2(***)
+# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
+# Goldmont 38.9/49.0/17.8 10.6/12.6
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than specifically to Intel HTT. As vast
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
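+#
+# Everything printed to STDOUT from here on is piped through
+# x86_64-xlate.pl, which translates the generic "perlasm" assembly into
+# the syntax requested by $flavour (elf, macosx, mingw64, nasm, ...) and
+# writes it to $output; $^X is the perl interpreter running this script,
+# and the quoting allows for paths that contain spaces.
+#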
$PREFIX="vpaes";
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
-
+
##
## Decryption core
##
movdqa -0x10(%r10),%xmm1 # 0 : sb9t
pshufb %xmm2, %xmm4 # 4 = sb9u
pshufb %xmm3, %xmm1 # 0 = sb9t
- pxor %xmm0, %xmm4
- add \$16, %r9 # next round key
- pxor %xmm4, %xmm1 # 0 = ch
-
+ pxor %xmm4, %xmm0
movdqa 0x00(%r10),%xmm4 # 4 : sbdu
- pshufb %xmm5, %xmm1 # MC ch
- pshufb %xmm2, %xmm4 # 4 = sbdu
- movdqa 0x10(%r10),%xmm0 # 0 : sbdt
- pxor %xmm1, %xmm4 # 4 = ch
- pshufb %xmm3, %xmm0 # 0 = sbdt
- sub \$1,%rax # nr--
- pxor %xmm4, %xmm0 # 0 = ch
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x10(%r10),%xmm1 # 0 : sbdt
- movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 4 = ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pxor %xmm1, %xmm0 # 0 = ch
movdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
pshufb %xmm2, %xmm4 # 4 = sbbu
+ pshufb %xmm5, %xmm0 # MC ch
pshufb %xmm3, %xmm1 # 0 = sbbt
- pxor %xmm0, %xmm4 # 4 = ch
- pxor %xmm4, %xmm1 # 0 = ch
-
+ pxor %xmm4, %xmm0 # 4 = ch
movdqa 0x40(%r10),%xmm4 # 4 : sbeu
- pshufb %xmm5, %xmm1 # MC ch
- movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pxor %xmm1, %xmm0 # 0 = ch
+ movdqa 0x50(%r10),%xmm1 # 0 : sbet
+
pshufb %xmm2, %xmm4 # 4 = sbeu
- pshufb %xmm3, %xmm0 # 0 = sbet
+ pshufb %xmm5, %xmm0 # MC ch
+ pshufb %xmm3, %xmm1 # 0 = sbet
+ pxor %xmm4, %xmm0 # 4 = ch
+ add \$16, %r9 # next round key
palignr \$12, %xmm5, %xmm5
- pxor %xmm1, %xmm4 # 4 = ch
- pxor %xmm4, %xmm0 # 0 = ch
+ pxor %xmm1, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
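+
+	# A sketch of the data flow in the round body above: %xmm2/%xmm3
+	# hold the two index vectors computed at .Ldec_entry, each table
+	# pair sb9/sbd/sbb/sbe (addressed off %r10) is looked up with
+	# pshufb, and the results are xor-accumulated into %xmm0, the
+	# running state ("ch"). The pshufb by %xmm5 ("MC ch"), with %xmm5
+	# rotated by palignr once per round, supplies the MixColumns byte
+	# rotation; the table names suggest the inverse S-box folded with
+	# the 9/0xD/0xB/0xE multipliers of inverse MixColumns. %r9 advances
+	# to the next round key and %rax counts the remaining rounds.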
.Ldec_entry:
# top of round
##
.Lschedule_128:
mov \$10, %esi
-
+
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
.Loop_schedule_192:
call _vpaes_schedule_round
- palignr \$8,%xmm6,%xmm0
+ palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
-
+
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
-
+
jmp .Loop_schedule_256
-
+
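+##
+## As the inline comments above indicate, each pass of .Loop_schedule_256
+## emits two subkeys: a full _vpaes_schedule_round on one half while the
+## other half is parked in %xmm6, then %xmm6 is swapped into %xmm7 for a
+## _vpaes_schedule_low_round (no rotation, no rcon), and
+## _vpaes_schedule_mangle writes each new subkey out.
+##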
##
## .aes_schedule_mangle_last
##
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
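+	# Together these implement the key schedule's RotWord: pshufd 0xFF
+	# broadcasts the last 32-bit word of %xmm0, and palignr then rotates
+	# its bytes by one position before the S-box lookup in the low
+	# round it falls through to.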
-
+
# fall through...
-
+
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
- pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
.Lk_dsbo: # decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.align 64
.size _vpaes_consts,.-_vpaes_consts
___