-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
######################################################################
## Constant-time SSSE3 AES core implementation.
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
-# Atom 70.7/92.1/60.1 61.1/81.0(***)
+# Atom 70.7/92.1/60.1 61.1/75.4(***)
+# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+# The last command-line argument names the output file.  Everything the
+# perlasm helpers emit is printed to STDOUT, so STDOUT is aliased to that
+# file.  Fail loudly if the file cannot be created: otherwise the generated
+# assembly would be silently discarded and the build would go on with an
+# empty or stale output file.
+$output = pop;
+open OUT,">",$output or die "can't open $output: $!";
+*STDOUT=*OUT;
+
+&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pshufb ("xmm1","xmm3"); # 0 = sb9t
- &pxor ("xmm4","xmm0");
- &add ($key,16); # next round key
- &pxor ("xmm1","xmm4"); # 0 = ch
-
+ &pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
- &pshufb ("xmm1","xmm5"); # MC ch
- &pshufb ("xmm4","xmm2"); # 4 = sbdu
- &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
- &pxor ("xmm4","xmm1"); # 4 = ch
- &pshufb ("xmm0","xmm3"); # 0 = sbdt
- &sub ($round,1); # nr--
- &pxor ("xmm0","xmm4"); # 0 = ch
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
- &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
+
&pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbbt
- &pxor ("xmm4","xmm0"); # 4 = ch
- &pxor ("xmm1","xmm4"); # 0 = ch
-
+ &pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
- &pshufb ("xmm1","xmm5"); # MC ch
- &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
+
&pshufb ("xmm4","xmm2"); # 4 = sbeu
- &pshufb ("xmm0","xmm3"); # 0 = sbet
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &pshufb ("xmm1","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 4 = ch
+ &add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
- &pxor ("xmm4","xmm1"); # 4 = ch
- &pxor ("xmm0","xmm4"); # 0 = ch
+ &pxor ("xmm0","xmm1"); # 0 = ch
+ &sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
- &pandn ("xmm1","xmm0"); # 1 = i<<4
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
- &psrld ("xmm1",4); # 1 = i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
&pand ("xmm0","xmm6"); # 0 = k
+ &psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
- &call ("_vpaes_schedule_mangle");
+ &call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
- &movdqa ("xmm1","xmm4");
+ &movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();
+
+# Buffered write errors (disk full, I/O error) only surface when the handle
+# is closed, so check the result instead of leaving a truncated file behind.
+close STDOUT or die "error closing STDOUT: $!";