From 31ed9a21315c571db443c68e4f618ecb51c631f9 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Tue, 3 Dec 2013 22:05:17 +0100
Subject: [PATCH] crypto/bn/rsaz*: fix licensing note.

rsaz_exp.c: harmonize line terminators;
asm/rsaz-*.pl: minor optimizations.
---
 crypto/bn/asm/rsaz-avx2.pl   | 215 ++++++------
 crypto/bn/asm/rsaz-x86_64.pl | 120 ++++---
 crypto/bn/rsaz_exp.c         | 624 ++++++++++++++++++-----------------
 3 files changed, 500 insertions(+), 459 deletions(-)

diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index 09c45b0ec2..3eb95569fb 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -1,54 +1,66 @@
 #!/usr/bin/env perl
 
-#******************************************************************************
-#* Copyright(c) 2012, Intel Corp.                                             
-#* Developers and authors:                                                    
-#* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   
-#* (1) Intel Corporation, Israel Development Center, Haifa, Israel
-#* (2) University of Haifa, Israel                                              
-#******************************************************************************
-#* LICENSE:                                                                
-#* This submission to OpenSSL is to be made available under the OpenSSL  
-#* license, and only to the OpenSSL project, in order to allow integration    
-#* into the publicly distributed code. 
-#* The use of this code, or portions of this code, or concepts embedded in
-#* this code, or modification of this code and/or algorithm(s) in it, or the
-#* use of this code for any other purpose than stated above, requires special
-#* licensing.                                                                  
-#******************************************************************************
-#* DISCLAIMER:                                                                
-#* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     
-#* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
-#* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
-#* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
-#* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
-#* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    
-#* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   
-#* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    
-#* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    
-#* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
-#* POSSIBILITY OF SUCH DAMAGE.                                                
-#******************************************************************************
-#* Reference:                                                                 
-#* [1]	S. Gueron, V. Krasnov: "Software Implementation of Modular
-#*	Exponentiation,  Using Advanced Vector Instructions Architectures",
-#*	F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
-#*	pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012
-#* [2]	S. Gueron: "Efficient Software Implementations of Modular
-#*	Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
-#* [3]	S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
-#*	Proceedings of 9th International Conference on Information Technology:
-#*	New Generations (ITNG 2012), pp.821-823 (2012)
-#* [4]	S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
-#*	resistant 1024-bit modular exponentiation, for optimizing RSA2048
-#*	on AVX2 capable x86_64 platforms",
-#*	http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
-#******************************************************************************
-
-# +10% improvement by <appro@openssl.org>
+##############################################################################
+#                                                                            #
+#  Copyright (c) 2012, Intel Corporation                                     #
+#                                                                            #
+#  All rights reserved.                                                      #
+#                                                                            #
+#  Redistribution and use in source and binary forms, with or without        #
+#  modification, are permitted provided that the following conditions are    #
+#  met:                                                                      #
+#                                                                            #
+#  *  Redistributions of source code must retain the above copyright         #
+#     notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+#  *  Redistributions in binary form must reproduce the above copyright      #
+#     notice, this list of conditions and the following disclaimer in the    #
+#     documentation and/or other materials provided with the                 #
+#     distribution.                                                          #
+#                                                                            #
+#  *  Neither the name of the Intel Corporation nor the names of its         #
+#     contributors may be used to endorse or promote products derived from   #
+#     this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
+# (2) University of Haifa, Israel                                            #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
+#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
+#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
+#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
+# [2] S. Gueron: "Efficient Software Implementations of Modular              #
+#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
+#     Proceedings of 9th International Conference on Information Technology: #
+#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
+#     on AVX2 capable x86_64 platforms",                                     #
+#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
+#
+# +13% improvement over original submission by <appro@openssl.org>
 #
 # rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
-# 2.3GHz Haswell	621		732/+18%	1112/+79%
+# 2.3GHz Haswell	621		765/+23%	1113/+79%
 #
 # (*)	if system doesn't support AVX2, for reference purposes;
 
@@ -143,24 +155,24 @@ rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
 	push	%r13
 	push	%r14
 	push	%r15
+	vzeroupper
 ___
 $code.=<<___ if ($win64);
 	lea	-0xa8(%rsp),%rsp
-	movaps  %xmm6,-0xd8(%rax)
-	movaps  %xmm7,-0xc8(%rax)
-	movaps  %xmm8,-0xb8(%rax)
-	movaps  %xmm9,-0xa8(%rax)
-	movaps  %xmm10,-0x98(%rax)
-	movaps  %xmm11,-0x88(%rax)
-	movaps  %xmm12,-0x78(%rax)
-	movaps  %xmm13,-0x68(%rax)
-	movaps  %xmm14,-0x58(%rax)
-	movaps  %xmm15,-0x48(%rax)
+	vmovaps	%xmm6,-0xd8(%rax)
+	vmovaps	%xmm7,-0xc8(%rax)
+	vmovaps	%xmm8,-0xb8(%rax)
+	vmovaps	%xmm9,-0xa8(%rax)
+	vmovaps	%xmm10,-0x98(%rax)
+	vmovaps	%xmm11,-0x88(%rax)
+	vmovaps	%xmm12,-0x78(%rax)
+	vmovaps	%xmm13,-0x68(%rax)
+	vmovaps	%xmm14,-0x58(%rax)
+	vmovaps	%xmm15,-0x48(%rax)
 .Lsqr_1024_body:
 ___
 $code.=<<___;
 	mov	%rax,%rbp
-	vzeroall
 	mov	%rdx, $np			# reassigned argument
 	sub	\$$FrameSize, %rsp
 	mov	$np, $tmp
@@ -171,6 +183,7 @@ $code.=<<___;
 	and	\$4095, $tmp			# see if $np crosses page
 	add	\$32*10, $tmp
 	shr	\$12, $tmp
+	vpxor	$ACC9,$ACC9,$ACC9
 	jz	.Lsqr_1024_no_n_copy
 
 	# unaligned 256-bit load that crosses page boundary can
@@ -198,7 +211,7 @@ $code.=<<___;
 	vmovdqu		$ACC6, 32*6-128($np)
 	vmovdqu		$ACC7, 32*7-128($np)
 	vmovdqu		$ACC8, 32*8-128($np)
-	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
+	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
 
 .Lsqr_1024_no_n_copy:
 	and		\$-1024, %rsp
@@ -876,17 +889,18 @@ rsaz_1024_mul_avx2:
 	push	%r15
 ___
 $code.=<<___ if ($win64);
+	vzeroupper
 	lea	-0xa8(%rsp),%rsp
-	movaps  %xmm6,-0xd8(%rax)
-	movaps  %xmm7,-0xc8(%rax)
-	movaps  %xmm8,-0xb8(%rax)
-	movaps  %xmm9,-0xa8(%rax)
-	movaps  %xmm10,-0x98(%rax)
-	movaps  %xmm11,-0x88(%rax)
-	movaps  %xmm12,-0x78(%rax)
-	movaps  %xmm13,-0x68(%rax)
-	movaps  %xmm14,-0x58(%rax)
-	movaps  %xmm15,-0x48(%rax)
+	vmovaps	%xmm6,-0xd8(%rax)
+	vmovaps	%xmm7,-0xc8(%rax)
+	vmovaps	%xmm8,-0xb8(%rax)
+	vmovaps	%xmm9,-0xa8(%rax)
+	vmovaps	%xmm10,-0x98(%rax)
+	vmovaps	%xmm11,-0x88(%rax)
+	vmovaps	%xmm12,-0x78(%rax)
+	vmovaps	%xmm13,-0x68(%rax)
+	vmovaps	%xmm14,-0x58(%rax)
+	vmovaps	%xmm15,-0x48(%rax)
 .Lmul_1024_body:
 ___
 $code.=<<___;
@@ -900,6 +914,7 @@ $code.=<<___;
 	# cross page boundary, swap it with $bp [meaning that caller
 	# is advised to lay down $ap and $bp next to each other, so
 	# that only one can cross page boundary].
+	.byte	0x67,0x67
 	mov	$ap, $tmp
 	and	\$4095, $tmp
 	add	\$32*10, $tmp
@@ -915,6 +930,7 @@ $code.=<<___;
 
 	and	\$4095, $tmp	# see if $np crosses page
 	add	\$32*10, $tmp
+	.byte	0x67,0x67
 	shr	\$12, $tmp
 	jz	.Lmul_1024_no_n_copy
 
@@ -960,6 +976,7 @@ $code.=<<___;
 	vpbroadcastq ($bp), $Bi
 	vmovdqu	$ACC0, (%rsp)			# clear top of stack
 	xor	$r0, $r0
+	.byte	0x67
 	xor	$r1, $r1
 	xor	$r2, $r2
 	xor	$r3, $r3
@@ -1564,22 +1581,22 @@ rsaz_1024_gather5_avx2:
 ___
 $code.=<<___ if ($win64);
 	lea	-0x88(%rsp),%rax
+	vzeroupper
 .LSEH_begin_rsaz_1024_gather5:
 	# I can't trust assembler to use specific encoding:-(
 	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
-	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
-	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
-	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
-	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
-	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
-	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
-	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
-	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
-	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
-	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6,-0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7,-0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8,0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9,0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10,0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11,0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12,0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13,0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14,0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15,0x70(%rax)
 ___
 $code.=<<___;
-	vzeroupper
 	lea	.Lgather_table(%rip),%r11
 	mov	$power,%eax
 	and	\$3,$power
@@ -1596,25 +1613,25 @@ $code.=<<___;
 	vpbroadcastb	2(%r11,%rax), %xmm14
 	vpbroadcastb	1(%r11,%rax), %xmm15
 
-	lea	($inp,$power),$inp
+	lea	64($inp,$power),$inp
 	mov	\$64,%r11			# size optimization
 	mov	\$9,%eax
 	jmp	.Loop_gather_1024
 
 .align	32
 .Loop_gather_1024:
-	vpand		($inp),			%xmm8,%xmm0
-	vpand		($inp,%r11),		%xmm9,%xmm1
-	vpand		($inp,%r11,2),		%xmm10,%xmm2
-	vpand		64($inp,%r11,2),	%xmm11,%xmm3
+	vpand		-64($inp),		%xmm8,%xmm0
+	vpand		($inp),			%xmm9,%xmm1
+	vpand		64($inp),		%xmm10,%xmm2
+	vpand		($inp,%r11,2),		%xmm11,%xmm3
 	 vpor					%xmm0,%xmm1,%xmm1
-	vpand		($inp,%r11,4),		%xmm12,%xmm4
+	vpand		64($inp,%r11,2),	%xmm12,%xmm4
 	 vpor					%xmm2,%xmm3,%xmm3
-	vpand		64($inp,%r11,4),	%xmm13,%xmm5
+	vpand		($inp,%r11,4),		%xmm13,%xmm5
 	 vpor					%xmm1,%xmm3,%xmm3
-	vpand		-128($inp,%r11,8),	%xmm14,%xmm6
+	vpand		64($inp,%r11,4),	%xmm14,%xmm6
 	 vpor					%xmm4,%xmm5,%xmm5
-	vpand		-64($inp,%r11,8),	%xmm15,%xmm2
+	vpand		-128($inp,%r11,8),	%xmm15,%xmm2
 	lea		($inp,%r11,8),$inp
 	 vpor					%xmm3,%xmm5,%xmm5
 	 vpor					%xmm2,%xmm6,%xmm6
@@ -1798,16 +1815,16 @@ rsaz_se_handler:
 	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
 .LSEH_info_rsaz_1024_gather5:
-	.byte	0x01,0x33,0x16,0x00
-	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
-	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
-	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
-	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
-	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
-	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
-	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
-	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
-	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
-	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x01,0x36,0x16,0x00	#prologue is 4+10*5=0x36 bytes with vmovaps
+	.byte	0x36,0xf8,0x09,0x00	#vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	#vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	#vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	#vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	#vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	#vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	#vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	#vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	#vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	#vmovaps 0x00(rsp),xmm6
 	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
 ___
 }
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index cef1d0cc77..b55421b7db 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -1,48 +1,60 @@
 #!/usr/bin/env perl
 
-#******************************************************************************#
-#* Copyright(c) 2012, Intel Corp.                                             *#
-#* Developers and authors:                                                    *#
-#* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *#
-#* (1) Intel Architecture Group, Microprocessor and Chipset Development,      *#
-#*     Israel Development Center, Haifa, Israel                               *#
-#* (2) University of Haifa                                                    *#
-#******************************************************************************#
-#* This submission to OpenSSL is to be made available under the OpenSSL       *#
-#* license, and only to the OpenSSL project, in order to allow integration    *#
-#* into the publicly distributed code. ?                                      *#
-#* The use of this code, or portions of this code, or concepts embedded in    *#
-#* this code, or modification of this code and/or algorithm(s) in it, or the  *#
-#* use of this code for any other purpose than stated above, requires special *#
-#* licensing.                                                                 *#
-#******************************************************************************#
-#******************************************************************************#
-#* DISCLAIMER:                                                                *#
-#* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     *#
-#* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
-#* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
-#* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
-#* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
-#* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    *#
-#* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   *#
-#* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    *#
-#* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    *#
-#* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
-#* POSSIBILITY OF SUCH DAMAGE.                                                *#
-#******************************************************************************#
-#* Reference:                                                                 *#
-#* [1] S. Gueron, "Efficient Software Implementations of Modular              *#
-#*     Exponentiation", http://eprint.iacr.org/2011/239                       *#
-#* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             *#
-#*     IEEE Proceedings of 9th International Conference on Information        *#
-#*     Technology: New Generations (ITNG 2012), 821-823 (2012).               *#
-#* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
-#*     Journal of Cryptographic Engineering 2:31-43 (2012).                   *#
-#* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    *#
-#*     resistant 512-bit and 1024-bit modular exponentiation for optimizing   *#
-#*     RSA1024 and RSA2048 on x86_64 platforms",                              *#
-#*     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
-################################################################################
+##############################################################################
+#                                                                            #
+#  Copyright (c) 2012, Intel Corporation                                     #
+#                                                                            #
+#  All rights reserved.                                                      #
+#                                                                            #
+#  Redistribution and use in source and binary forms, with or without        #
+#  modification, are permitted provided that the following conditions are    #
+#  met:                                                                      #
+#                                                                            #
+#  *  Redistributions of source code must retain the above copyright         #
+#     notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+#  *  Redistributions in binary form must reproduce the above copyright      #
+#     notice, this list of conditions and the following disclaimer in the    #
+#     documentation and/or other materials provided with the                 #
+#     distribution.                                                          #
+#                                                                            #
+#  *  Neither the name of the Intel Corporation nor the names of its         #
+#     contributors may be used to endorse or promote products derived from   #
+#     this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
+#     Israel Development Center, Haifa, Israel                               #
+# (2) University of Haifa                                                    #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, "Efficient Software Implementations of Modular              #
+#     Exponentiation", http://eprint.iacr.org/2011/239                       #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
+#     IEEE Proceedings of 9th International Conference on Information        #
+#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
+#     RSA1024 and RSA2048 on x86_64 platforms",                              #
+#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################
 
 # While original submission covers 512- and 1024-bit exponentiation,
 # this module is limited to 512-bit version only (and as such
@@ -1812,33 +1824,33 @@ $code.=<<___;
 .align	32
 __rsaz_512_mulx:
 	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
-	xor	$zero, $zero		# cf=0,of=0
+	mov	\$-6, %rcx
 
 	mulx	8($ap), %rax, %r9
 	movq	%rbx, 8(%rsp)
 
 	mulx	16($ap), %rbx, %r10
-	adcx	%rax, %r8
+	adc	%rax, %r8
 
 	mulx	24($ap), %rax, %r11
-	adcx	%rbx, %r9
+	adc	%rbx, %r9
 
-	.byte	0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rbx, %r12
-	adcx	%rax, %r10
+	mulx	32($ap), %rbx, %r12
+	adc	%rax, %r10
 
 	mulx	40($ap), %rax, %r13
-	adcx	%rbx, %r11
+	adc	%rbx, %r11
 
 	mulx	48($ap), %rbx, %r14
-	adcx	%rax, %r12
+	adc	%rax, %r12
 
 	mulx	56($ap), %rax, %r15
 	 mov	8($bp), %rdx
-	adcx	%rbx, %r13
-	adcx	%rax, %r14
-	adcx	$zero, %r15		# cf=0
+	adc	%rbx, %r13
+	adc	%rax, %r14
+	adc	\$0, %r15
 
-	mov	\$-6, %rcx
+	xor	$zero, $zero		# cf=0,of=0
 	jmp	.Loop_mulx
 
 .align	32
diff --git a/crypto/bn/rsaz_exp.c b/crypto/bn/rsaz_exp.c
index 57591b8db3..2cdb02a4f0 100644
--- a/crypto/bn/rsaz_exp.c
+++ b/crypto/bn/rsaz_exp.c
@@ -1,306 +1,318 @@
-/******************************************************************************
-* Copyright(c) 2012, Intel Corp.                                             
-* Developers and authors:                                                    
-* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   
-* (1) Intel Corporation, Israel Development Center, Haifa, Israel                               
-* (2) University of Haifa, Israel                                              
-******************************************************************************
-* LICENSE:                                                                
-* This submission to OpenSSL is to be made available under the OpenSSL  
-* license, and only to the OpenSSL project, in order to allow integration    
-* into the publicly distributed code. 
-* The use of this code, or portions of this code, or concepts embedded in
-* this code, or modification of this code and/or algorithm(s) in it, or the
-* use of this code for any other purpose than stated above, requires special
-* licensing.                                                                  
-******************************************************************************
-* DISCLAIMER:                                                                
-* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     
-* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
-* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
-* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
-* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
-* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    
-* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   
-* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    
-* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    
-* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
-* POSSIBILITY OF SUCH DAMAGE.                                                
-******************************************************************************/
-
-#include "rsaz_exp.h"
-
-/*
- * See crypto/bn/asm/rsaz-avx2.pl for further details.
- */
-void rsaz_1024_norm2red_avx2(void *red,const void *norm);
-void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,unsigned long k);
-void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,unsigned long k,int cnt);
-void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
-void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
-void rsaz_1024_red2norm_avx2(void *norm,const void *red);
-
-#if defined(__GNUC__)
-# define ALIGN64	__attribute__((aligned(64)))
-#elif defined(_MSC_VER)
-# define ALIGN64	__declspec(align(64))
-#elif defined(__SUNPRO_C)
-# define ALIGN64
-# pragma align 64(one,two80)
-#else
-# define ALIGN64	/* not fatal, might hurt performance a little */
-#endif
-
-ALIGN64 static const unsigned long one[40] =
-	{1,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-ALIGN64 static const unsigned long two80[40] =
-	{0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
-	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
-	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
-{
-	unsigned char	 storage[320*3+32*9*16+64];	/* 5.5KB */
-	unsigned char	*p_str = storage + (64-((size_t)storage%64));
-	unsigned char	*a_inv, *m, *result,
-			*table_s = p_str+320*3,
-			*R2      = table_s;	/* borrow */
-	int index;
-	int wvalue;
-
-	if ((((size_t)p_str&4095)+320)>>12) {
-		result = p_str;
-		a_inv = p_str + 320;
-		m = p_str + 320*2;	/* should not cross page */
-	} else {
-		m = p_str;		/* should not cross page */
-		result = p_str + 320;
-		a_inv = p_str + 320*2;
-	}
-
-	rsaz_1024_norm2red_avx2(m, m_norm);
-	rsaz_1024_norm2red_avx2(a_inv, base_norm);
-	rsaz_1024_norm2red_avx2(R2, RR);
-
-	rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
-	rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
-
-	/* table[0] = 1 */
-	rsaz_1024_mul_avx2(result, R2, one, m, k0);
-	/* table[1] = a_inv^1 */
-	rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
-
-	rsaz_1024_scatter5_avx2(table_s,result,0);
-	rsaz_1024_scatter5_avx2(table_s,a_inv,1);
-
-	/* table[2] = a_inv^2 */
-	rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,2);
-#if 0
-	/* this is almost 2x smaller and less than 1% slower */
-	for (index=3; index<32; index++) {
-		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-		rsaz_1024_scatter5_avx2(table_s,result,index);
-	}
-#else
-	/* table[4] = a_inv^4 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,4);
-	/* table[8] = a_inv^8 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,8);
-	/* table[16] = a_inv^16 */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,16);
-	/* table[17] = a_inv^17 */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,17);
-
-	/* table[3] */
-	rsaz_1024_gather5_avx2(result,table_s,2);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,3);
-	/* table[6] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,6);
-	/* table[12] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,12);
- 	/* table[24] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,24);
-	/* table[25] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,25);
-
-	/* table[5] */
-	rsaz_1024_gather5_avx2(result,table_s,4);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,5);
-	/* table[10] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,10);
-	/* table[20] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,20);
-	/* table[21] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,21);
-
-	/* table[7] */
-	rsaz_1024_gather5_avx2(result,table_s,6);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,7);
-	/* table[14] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,14);
-	/* table[28] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,28);
-	/* table[29] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,29);
-
-	/* table[9] */
-	rsaz_1024_gather5_avx2(result,table_s,8);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,9);
-	/* table[18] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,18);
-	/* table[19] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,19);
-
-	/* table[11] */
-	rsaz_1024_gather5_avx2(result,table_s,10);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,11);
-	/* table[22] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,22);
-	/* table[23] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,23);
-
-	/* table[13] */
-	rsaz_1024_gather5_avx2(result,table_s,12);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,13);
-	/* table[26] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,26);
-	/* table[27] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,27);
-
-	/* table[15] */
-	rsaz_1024_gather5_avx2(result,table_s,14);
-	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
-	rsaz_1024_scatter5_avx2(table_s,result,15);
-	/* table[30] */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
-	rsaz_1024_scatter5_avx2(table_s,result,30);
-	/* table[31] */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	rsaz_1024_scatter5_avx2(table_s,result,31);
-#endif
-
-	/* load first window */
-	p_str = (unsigned char*)exponent;
-	wvalue = p_str[127] >> 3;
-	rsaz_1024_gather5_avx2(result,table_s,wvalue);
-
-	index = 1014;
-
-	while(index > -1) {	/* loop for the remaining 127 windows */
-
-		rsaz_1024_sqr_avx2(result, result, m, k0, 5);
-
-		wvalue = *((unsigned short*)&p_str[index/8]);
-		wvalue = (wvalue>> (index%8)) & 31;
-		index-=5;
-
-		rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
-		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-	}
-
-	/* square four times */
-	rsaz_1024_sqr_avx2(result, result, m, k0, 4);
-
-	wvalue = p_str[0] & 15;
-
-	rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
-	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
-
-	/* from Montgomery */
-	rsaz_1024_mul_avx2(result, result, one, m, k0);
-
-	rsaz_1024_red2norm_avx2(result_norm, result);
-
-	OPENSSL_cleanse(storage,sizeof(storage));
-}
-
-/*
- * See crypto/bn/rsaz-x86_64.pl for further details.
- */
-void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,unsigned long k);
-void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,unsigned long k,const void *tbl,unsigned int power);
-void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,unsigned long k,unsigned int power);
-void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,unsigned long k);
-void rsaz_512_sqr(void *ret,const void *a,const void *n,unsigned long k,int cnt);
-void rsaz_512_scatter4(void *tbl, const unsigned long *val, int power);
-void rsaz_512_gather4(unsigned long *val, const void *tbl, int power);
-
-void RSAZ_512_mod_exp(BN_ULONG result[8],
-	const BN_ULONG base[8], const BN_ULONG exponent[8],
-	const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
-{
-	unsigned char	 storage[16*8*8+64*2+64];	/* 1.2KB */
-	unsigned char	*table = storage + (64-((size_t)storage%64));
-	unsigned long	*a_inv = (unsigned long *)(table+16*8*8),
-			*temp  = (unsigned long *)(table+16*8*8+8*8);
-	unsigned char	*p_str = (unsigned char*)exponent;
-	int index;
-	unsigned int wvalue;
-
-	/* table[0] = 1_inv */
-	temp[0] = 0-m[0];	temp[1] = ~m[1];
-	temp[2] = ~m[2];	temp[3] = ~m[3];
-	temp[4] = ~m[4];	temp[5] = ~m[5];
-	temp[6] = ~m[6];	temp[7] = ~m[7];
-	rsaz_512_scatter4(table, temp, 0);
-
-	/* table [1] = a_inv^1 */
-	rsaz_512_mul(a_inv, base, RR, m, k0);
-	rsaz_512_scatter4(table, a_inv, 1);
-
-	/* table [2] = a_inv^2 */
-	rsaz_512_sqr(temp, a_inv, m, k0, 1);
-	rsaz_512_scatter4(table, temp, 2);
-
-	for (index=3; index<16; index++)
-		rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
-
-	/* load first window */
-	wvalue = p_str[63];
-
-	rsaz_512_gather4(temp, table, wvalue>>4);
-	rsaz_512_sqr(temp, temp, m, k0, 4);
-	rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);
-
-	for (index=62; index>=0; index--) {
-		wvalue = p_str[index];
-
-		rsaz_512_sqr(temp, temp, m, k0, 4);
-		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);
-
-		rsaz_512_sqr(temp, temp, m, k0, 4);
-		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
-	}
-
-	/* from Montgomery */
-	rsaz_512_mul_by_one(result, temp, m, k0);
-
-	OPENSSL_cleanse(storage,sizeof(storage));
-}
+/*****************************************************************************
+*                                                                            *
+*  Copyright (c) 2012, Intel Corporation                                     *
+*                                                                            *
+*  All rights reserved.                                                      *
+*                                                                            *
+*  Redistribution and use in source and binary forms, with or without        *
+*  modification, are permitted provided that the following conditions are    *
+*  met:                                                                      *
+*                                                                            *
+*  *  Redistributions of source code must retain the above copyright         *
+*     notice, this list of conditions and the following disclaimer.          *
+*                                                                            *
+*  *  Redistributions in binary form must reproduce the above copyright      *
+*     notice, this list of conditions and the following disclaimer in the    *
+*     documentation and/or other materials provided with the                 *
+*     distribution.                                                          *
+*                                                                            *
+*  *  Neither the name of the Intel Corporation nor the names of its         *
+*     contributors may be used to endorse or promote products derived from   *
+*     this software without specific prior written permission.               *
+*                                                                            *
+*                                                                            *
+*  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
+*  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
+*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
+*  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
+*  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+*                                                                            *
+******************************************************************************
+* Developers and authors:                                                    *
+* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
+* (2) University of Haifa, Israel                                            *
+*****************************************************************************/
+
+#include "rsaz_exp.h"
+
+/*
+ * See crypto/bn/asm/rsaz-avx2.pl for further details.
+ */
+void rsaz_1024_norm2red_avx2(void *red,const void *norm);
+void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,unsigned long k);
+void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,unsigned long k,int cnt);
+void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
+void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
+void rsaz_1024_red2norm_avx2(void *norm,const void *red);
+
+#if defined(__GNUC__)
+# define ALIGN64	__attribute__((aligned(64)))
+#elif defined(_MSC_VER)
+# define ALIGN64	__declspec(align(64))
+#elif defined(__SUNPRO_C)
+# define ALIGN64
+# pragma align 64(one,two80)
+#else
+# define ALIGN64	/* not fatal, might hurt performance a little */
+#endif
+/* NOTE(review): 40-word operands for rsaz_1024_mul_avx2; two80 is presumably 2^80 with 29-bit digits (2*29+22) -- confirm; unsigned long is 32-bit on LLP64 */
+ALIGN64 static const unsigned long one[40] =
+	{1,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ALIGN64 static const unsigned long two80[40] =
+	{0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
+	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
+	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
+{
+	unsigned char	 storage[320*3+32*9*16+64];	/* 5.5KB: 3x320 operands + 32*9*16 table + 64 align slack */
+	unsigned char	*p_str = storage + (64-((size_t)storage%64));
+	unsigned char	*a_inv, *m, *result,
+			*table_s = p_str+320*3,
+			*R2      = table_s;	/* borrow table space until the table is filled */
+	int index;
+	int wvalue;
+	/* fixed 5-bit-window exponentiation; first place m in a 320-byte slot that stays inside one 4KB page */
+	if ((((size_t)p_str&4095)+320)>>12) {
+		result = p_str;
+		a_inv = p_str + 320;
+		m = p_str + 320*2;	/* should not cross page */
+	} else {
+		m = p_str;		/* should not cross page */
+		result = p_str + 320;
+		a_inv = p_str + 320*2;
+	}
+	/* load modulus, base and RR into the working buffers */
+	rsaz_1024_norm2red_avx2(m, m_norm);
+	rsaz_1024_norm2red_avx2(a_inv, base_norm);
+	rsaz_1024_norm2red_avx2(R2, RR);
+	/* square R2, then multiply it by the two80 constant */
+	rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
+	rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+
+	/* table[0] = 1 */
+	rsaz_1024_mul_avx2(result, R2, one, m, k0);
+	/* table[1] = a_inv^1 */
+	rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
+	/* R2 is no longer needed from here on, so its space can be used by the table */
+	rsaz_1024_scatter5_avx2(table_s,result,0);
+	rsaz_1024_scatter5_avx2(table_s,a_inv,1);
+
+	/* table[2] = a_inv^2 */
+	rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,2);
+#if 0
+	/* this is almost 2x smaller and less than 1% slower */
+	for (index=3; index<32; index++) {
+		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+		rsaz_1024_scatter5_avx2(table_s,result,index);
+	}
+#else
+	/* table[4] = a_inv^4 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,4);
+	/* table[8] = a_inv^8 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,8);
+	/* table[16] = a_inv^16 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,16);
+	/* table[17] = a_inv^17 */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,17);
+
+	/* table[3] */
+	rsaz_1024_gather5_avx2(result,table_s,2);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,3);
+	/* table[6] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,6);
+	/* table[12] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,12);
+	/* table[24] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,24);
+	/* table[25] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,25);
+
+	/* table[5] */
+	rsaz_1024_gather5_avx2(result,table_s,4);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,5);
+	/* table[10] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,10);
+	/* table[20] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,20);
+	/* table[21] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,21);
+
+	/* table[7] */
+	rsaz_1024_gather5_avx2(result,table_s,6);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,7);
+	/* table[14] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,14);
+	/* table[28] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,28);
+	/* table[29] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,29);
+
+	/* table[9] */
+	rsaz_1024_gather5_avx2(result,table_s,8);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,9);
+	/* table[18] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,18);
+	/* table[19] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,19);
+
+	/* table[11] */
+	rsaz_1024_gather5_avx2(result,table_s,10);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,11);
+	/* table[22] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,22);
+	/* table[23] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,23);
+
+	/* table[13] */
+	rsaz_1024_gather5_avx2(result,table_s,12);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,13);
+	/* table[26] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,26);
+	/* table[27] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,27);
+
+	/* table[15] */
+	rsaz_1024_gather5_avx2(result,table_s,14);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,15);
+	/* table[30] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,30);
+	/* table[31] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,31);
+#endif
+
+	/* load first window: upper 5 bits of exponent byte 127 */
+	p_str = (unsigned char*)exponent;
+	wvalue = p_str[127] >> 3;
+	rsaz_1024_gather5_avx2(result,table_s,wvalue);
+	/* index is the bit offset of the next 5-bit window */
+	index = 1014;
+
+	while(index > -1) {	/* 203 more 5-bit windows: index 1014, 1009, ..., 4 */
+
+		rsaz_1024_sqr_avx2(result, result, m, k0, 5);
+		/* assemble the 16 bits bytewise: no unaligned or type-punned unsigned short load */
+		wvalue = p_str[index/8] | (p_str[index/8+1]<<8);
+		wvalue = (wvalue>> (index%8)) & 31;
+		index-=5;
+
+		rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
+		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	}
+
+	/* square four times: the last window is only 4 bits wide */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 4);
+
+	wvalue = p_str[0] & 15;
+
+	rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+
+	/* from Montgomery */
+	rsaz_1024_mul_avx2(result, result, one, m, k0);
+
+	rsaz_1024_red2norm_avx2(result_norm, result);
+	/* storage held the operands and the table of powers; wipe it before returning */
+	OPENSSL_cleanse(storage,sizeof(storage));
+}
+
+/*
+ * See crypto/bn/asm/rsaz-x86_64.pl for further details.
+ */
+void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,unsigned long k);
+void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,unsigned long k,const void *tbl,unsigned int power);
+void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,unsigned long k,unsigned int power);
+void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,unsigned long k);
+void rsaz_512_sqr(void *ret,const void *a,const void *n,unsigned long k,int cnt);
+void rsaz_512_scatter4(void *tbl, const unsigned long *val, int power);
+void rsaz_512_gather4(unsigned long *val, const void *tbl, int power);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+	const BN_ULONG base[8], const BN_ULONG exponent[8],
+	const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
+{
+	unsigned char	 storage[16*8*8+64*2+64];	/* 1.2KB: 16*8*8 table + 2x64 operands + 64 align slack */
+	unsigned char	*table = storage + (64-((size_t)storage%64));
+	unsigned long	*a_inv = (unsigned long *)(table+16*8*8),
+			*temp  = (unsigned long *)(table+16*8*8+8*8);
+	unsigned char	*p_str = (unsigned char*)exponent;
+	int index;
+	unsigned int wvalue;
+	/* fixed 4-bit-window exponentiation over a 16-entry table of powers of the base */
+	/* table[0] = 1_inv, formed as -m over 8 words (exact when m[0] != 0) */
+	temp[0] = 0-m[0];	temp[1] = ~m[1];
+	temp[2] = ~m[2];	temp[3] = ~m[3];
+	temp[4] = ~m[4];	temp[5] = ~m[5];
+	temp[6] = ~m[6];	temp[7] = ~m[7];
+	rsaz_512_scatter4(table, temp, 0);
+
+	/* table [1] = a_inv^1 */
+	rsaz_512_mul(a_inv, base, RR, m, k0);
+	rsaz_512_scatter4(table, a_inv, 1);
+
+	/* table [2] = a_inv^2 */
+	rsaz_512_sqr(temp, a_inv, m, k0, 1);
+	rsaz_512_scatter4(table, temp, 2);
+	/* table[3..15]: mul_scatter4 presumably multiplies temp by a_inv and stores it at slot index -- confirm in asm */
+	for (index=3; index<16; index++)
+		rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
+
+	/* load first window: exponent byte 63, high nibble then low nibble */
+	wvalue = p_str[63];
+
+	rsaz_512_gather4(temp, table, wvalue>>4);
+	rsaz_512_sqr(temp, temp, m, k0, 4);
+	rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);
+	/* remaining exponent bytes 62..0, two 4-bit windows each */
+	for (index=62; index>=0; index--) {
+		wvalue = p_str[index];
+
+		rsaz_512_sqr(temp, temp, m, k0, 4);
+		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);
+
+		rsaz_512_sqr(temp, temp, m, k0, 4);
+		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
+	}
+
+	/* from Montgomery */
+	rsaz_512_mul_by_one(result, temp, m, k0);
+	/* storage held the table of powers and both temporaries; wipe it before returning */
+	OPENSSL_cleanse(storage,sizeof(storage));
+}
-- 
2.34.1