modes/ocb128.c: Reset nonce-dependent variables on setiv
[openssl.git] / crypto / aes / asm / vpaes-x86.pl
index 433912ff57ddf74abfb061a8c67c74ced4fbd77d..7d57edc0eb6c1090838b50ac367a978f64b07118 100644 (file)
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 ######################################################################
 ## Constant-time SSSE3 AES core implementation.
@@ -29,7 +36,8 @@
 #
 # Core 2(**)   28.1/41.4/18.3          21.9/25.2(***)
 # Nehalem      27.9/40.4/18.1          10.2/11.9
-# Atom         70.7/92.1/60.1          61.1/81.0(***)
+# Atom         70.7/92.1/60.1          61.1/75.4(***)
+# Silvermont   45.4/62.9/24.1          49.2/61.1(***)
 #
 # (*)  "Hyper-threading" in the context refers rather to cache shared
 #      among multiple cores, than to specifically Intel HTT. As vast
@@ -50,7 +58,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
-&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+$output = pop;		# last command-line argument names the output file
+open OUT,">$output" or die "can't open $output: $!";	# fail loudly rather than discard the generated code
+*STDOUT=*OUT;		# everything printed from here on goes to $output
+
+&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
 
 $PREFIX="vpaes";
 
@@ -295,43 +307,43 @@ $k_dsbo=0x2c0;            # decryption sbox final output
        &movdqa ("xmm1",&QWP(-0x10,$base));     # 0 : sb9t
        &pshufb ("xmm4","xmm2");                # 4 = sb9u
        &pshufb ("xmm1","xmm3");                # 0 = sb9t
-       &pxor   ("xmm4","xmm0");
-       &add    ($key,16);                      # next round key
-       &pxor   ("xmm1","xmm4");                # 0 = ch
-
+       &pxor   ("xmm0","xmm4");
        &movdqa ("xmm4",&QWP(0,$base));         # 4 : sbdu
-       &pshufb ("xmm1","xmm5");                # MC ch
-       &pshufb ("xmm4","xmm2");                # 4 = sbdu
-       &movdqa ("xmm0",&QWP(0x10,$base));      # 0 : sbdt
-       &pxor   ("xmm4","xmm1");                # 4 = ch
-       &pshufb ("xmm0","xmm3");                # 0 = sbdt
-       &sub    ($round,1);                     # nr--
-       &pxor   ("xmm0","xmm4");                # 0 = ch
+       &pxor   ("xmm0","xmm1");                # 0 = ch
+       &movdqa ("xmm1",&QWP(0x10,$base));      # 0 : sbdt
 
-       &movdqa ("xmm4",&QWP(0x20,$base));      # 4 : sbbu
+       &pshufb ("xmm4","xmm2");                # 4 = sbdu
        &pshufb ("xmm0","xmm5");                # MC ch
+       &pshufb ("xmm1","xmm3");                # 0 = sbdt
+       &pxor   ("xmm0","xmm4");                # 4 = ch
+       &movdqa ("xmm4",&QWP(0x20,$base));      # 4 : sbbu
+       &pxor   ("xmm0","xmm1");                # 0 = ch
        &movdqa ("xmm1",&QWP(0x30,$base));      # 0 : sbbt
+
        &pshufb ("xmm4","xmm2");                # 4 = sbbu
+       &pshufb ("xmm0","xmm5");                # MC ch
        &pshufb ("xmm1","xmm3");                # 0 = sbbt
-       &pxor   ("xmm4","xmm0");                # 4 = ch
-       &pxor   ("xmm1","xmm4");                # 0 = ch
-
+       &pxor   ("xmm0","xmm4");                # 4 = ch
        &movdqa ("xmm4",&QWP(0x40,$base));      # 4 : sbeu
-       &pshufb ("xmm1","xmm5");                # MC ch
-       &movdqa ("xmm0",&QWP(0x50,$base));      # 0 : sbet
+       &pxor   ("xmm0","xmm1");                # 0 = ch
+       &movdqa ("xmm1",&QWP(0x50,$base));      # 0 : sbet
+
        &pshufb ("xmm4","xmm2");                # 4 = sbeu
-       &pshufb ("xmm0","xmm3");                # 0 = sbet
+       &pshufb ("xmm0","xmm5");                # MC ch
+       &pshufb ("xmm1","xmm3");                # 0 = sbet
+       &pxor   ("xmm0","xmm4");                # 4 = ch
+       &add    ($key,16);                      # next round key
        &palignr("xmm5","xmm5",12);
-       &pxor   ("xmm4","xmm1");                # 4 = ch
-       &pxor   ("xmm0","xmm4");                # 0 = ch
+       &pxor   ("xmm0","xmm1");                # 0 = ch
+       &sub    ($round,1);                     # nr--
 
 &set_label("dec_entry");
        # top of round
        &movdqa ("xmm1","xmm6");                # 1 : i
-       &pandn  ("xmm1","xmm0");                # 1 = i<<4
        &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
-       &psrld  ("xmm1",4);                     # 1 = i
+       &pandn  ("xmm1","xmm0");                # 1 = i<<4
        &pand   ("xmm0","xmm6");                # 0 = k
+       &psrld  ("xmm1",4);                     # 1 = i
        &pshufb ("xmm2","xmm0");                # 2 = a/k
        &movdqa ("xmm3","xmm7");                # 3 : 1/i
        &pxor   ("xmm0","xmm1");                # 0 = j
@@ -433,7 +445,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
 ##
 &set_label("schedule_192",16);
        &movdqu ("xmm0",&QWP(8,$inp));          # load key part 2 (very unaligned)
-       &call   ("_vpaes_schedule_transform");  # input transform       
+       &call   ("_vpaes_schedule_transform");  # input transform
        &movdqa ("xmm6","xmm0");                # save short part
        &pxor   ("xmm4","xmm4");                # clear 4
        &movhlps("xmm6","xmm4");                # clobber low side with zeros
@@ -464,7 +476,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
 ##
 &set_label("schedule_256",16);
        &movdqu ("xmm0",&QWP(16,$inp));         # load key part 2 (unaligned)
-       &call   ("_vpaes_schedule_transform");  # input transform       
+       &call   ("_vpaes_schedule_transform");  # input transform
        &mov    ($round,7);
 
 &set_label("loop_schedule_256");
@@ -475,7 +487,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
        &call   ("_vpaes_schedule_round");
        &dec    ($round);
        &jz     (&label("schedule_mangle_last"));
-       &call   ("_vpaes_schedule_mangle");     
+       &call   ("_vpaes_schedule_mangle");
 
        # low round. swap xmm7 and xmm6
        &pshufd ("xmm0","xmm0",0xFF);
@@ -598,7 +610,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
        # subbyte
        &movdqa ("xmm4",&QWP($k_s0F,$const));
        &movdqa ("xmm5",&QWP($k_inv,$const));   # 4 : 1/j
-       &movdqa ("xmm1","xmm4");        
+       &movdqa ("xmm1","xmm4");
        &pandn  ("xmm1","xmm0");
        &psrld  ("xmm1",4);                     # 1 = i
        &pand   ("xmm0","xmm4");                # 0 = k
@@ -900,3 +912,5 @@ $k_dsbo=0x2c0;              # decryption sbox final output
 &function_end("${PREFIX}_cbc_encrypt");
 
 &asm_finish();
+
+close STDOUT or die "error closing STDOUT: $!";	# a failed flush means truncated output; do not exit 0