diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index 16fea48817..5c8bb0fbcc 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -21,21 +28,26 @@
 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
 # subroutine:
 #
-#		   AES-128/-192/-256+SHA256	this(**)gain
-# Sandy Bridge	   5.05/6.05/7.05+11.6		13.0	+28%/36%/43%
-# Ivy Bridge	   5.05/6.05/7.05+10.3		11.6	+32%/41%/50%
-# Haswell	   4.43/5.29/6.19+7.80		8.79	+39%/49%/59%
-# Bulldozer	   5.77/6.89/8.00+13.7		13.7	+42%/50%/58%
+#		   AES-128/-192/-256+SHA256	this(**)	gain
+# Sandy Bridge	   5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
+# Ivy Bridge	   5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
+# Haswell	   4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
+# Skylake	   2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
+# Bulldozer	   5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
+# Ryzen(***)	   2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
+# Goldmont(***)	   3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
 #
-# (*)	there are XOP, AVX1 and AVX2 code pathes, meaning that
+# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
 #	Westmere is omitted from loop, this is because gain was not
 #	estimated high enough to justify the effort;
 # (**)	these are EVP-free results, results obtained with 'speed
 #	-evp aes-256-cbc-hmac-sha256' will vary by percent or two;
+# (***)	these are SHAEXT results;
 
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ ||
		       $output =~ /\.asm$/);
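# A standalone sketch (not part of the patch) of the new argument handling
# introduced in the hunk above: the last command-line argument becomes
# $output only if it carries a file extension, and the first becomes
# $flavour only if it does not look like a file name.  The @argv invocation
# below is hypothetical; only the two regex tests mirror the patched code.
use strict;
use warnings;
my @argv    = ("elf", "aesni-sha256-x86_64.s");	# e.g. "perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s"
my $output  = $#argv >= 0 && $argv[$#argv] =~ m|\.\w+$| ? pop(@argv)   : undef;
my $flavour = $#argv >= 0 && $argv[0] !~ m|\.|          ? shift(@argv) : undef;
printf("flavour=%s output=%s\n", $flavour // "(none)", $output // "(stdout)");
# prints "flavour=elf output=aesni-sha256-x86_64.s"; with no arguments both
# stay undef and the generated assembly goes to stdout.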
@@ -56,10 +68,18 @@ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 
 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
-	$avx = ($1>=10) + ($1>=11);
+	$avx = ($1>=10) + ($1>=12);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+$shaext=$avx;	### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 $func="aesni_cbc_sha256_enc";
@@ -94,7 +114,7 @@ $_key="16*$SZ+3*8(%rsp)";
 $_ivp="16*$SZ+4*8(%rsp)";
 $_ctx="16*$SZ+5*8(%rsp)";
 $_in0="16*$SZ+6*8(%rsp)";
-$_rsp="16*$SZ+7*8(%rsp)";
+$_rsp="`16*$SZ+7*8`(%rsp)";
 $framesz=16*$SZ+8*8;
 
 $code=<<___;
@@ -105,15 +125,24 @@ $code=<<___;
 .type	$func,\@abi-omnipotent
 .align	16
 $func:
+.cfi_startproc
 ___
-$code.=<<___ if ($avx);
+						if ($avx) {
+$code.=<<___;
 	lea	OPENSSL_ia32cap_P(%rip),%r11
 	mov	\$1,%eax
 	cmp	\$0,`$win64?"%rcx":"%rdi"`
 	je	.Lprobe
 	mov	0(%r11),%eax
-	mov	4(%r11),%r10d
-	mov	8(%r11),%r11d
+	mov	4(%r11),%r10
+___
+$code.=<<___ if ($shaext);
+	bt	\$61,%r10			# check for SHA
+	jc	${func}_shaext
+___
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$32,%r11
 
 	test	\$`1<<11`,%r10d		# check for XOP
 	jnz	${func}_xop
@@ -123,14 +152,12 @@ $code.=<<___ if ($avx>1);
 	cmp	\$`1<<8|1<<5|1<<3`,%r11d
 	je	${func}_avx2
 ___
-$code.=<<___ if ($avx);
-	and	\$`1<<30`,%eax		# mask "Intel CPU" bit
-	and	\$`1<<28|1<<9`,%r10d	# mask AVX+SSSE3 bits
-	or	%eax,%r10d
-	cmp	\$`1<<28|1<<9|1<<30`,%r10d
-	je	${func}_avx
+$code.=<<___;
+	and	\$`1<<28`,%r10d			# check for AVX
+	jnz	${func}_avx
 	ud2
 ___
+						}
 $code.=<<___;
 	xor	%eax,%eax
 	cmp	\$0,`$win64?"%rcx":"%rdi"`
@@ -138,8 +165,10 @@ $code.=<<___;
 	ud2
 .Lprobe:
 	ret
+.cfi_endproc
 .size	$func,.-$func
 
+.section .rodata align=64
 .align	64
 .type	$TABLE,\@object
 $TABLE:
@@ -182,6 +211,7 @@ $TABLE:
 	.long	0,0,0,0,   0,0,0,0
 	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 	.align	64
+.previous
 ___
 
 ######################################################################
@@ -318,15 +348,23 @@ $code.=<<___;
 .type	${func}_xop,\@function,6
 .align	64
 ${func}_xop:
+.cfi_startproc
 .Lxop_shortcut:
 	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
+	mov	%rsp,%rax		# copy %rsp
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
+.cfi_push	%r15
 
 	sub	\$`$framesz+$win64*16*10`,%rsp
 	and	\$-64,%rsp		# align stack frame
@@ -342,7 +380,8 @@ ${func}_xop:
 	mov	$ivp,$_ivp
 	mov	$ctx,$_ctx
 	mov	$in0,$_in0
-	mov	%r11,$_rsp
+	mov	%rax,$_rsp
+.cfi_cfa_expression	$_rsp,deref,+8
 ___
 $code.=<<___ if ($win64);
 	movaps	%xmm6,`$framesz+16*0`(%rsp)
@@ -580,6 +619,7 @@ $code.=<<___;
 
 	mov	$_ivp,$ivp
 	mov	$_rsp,%rsi
+.cfi_def_cfa	%rsi,8
 	vmovdqu	$iv,($ivp)		# output IV
 	vzeroall
 ___
@@ -596,15 +636,23 @@ $code.=<<___ if ($win64);
 	movaps	`$framesz+16*9`(%rsp),%xmm15
 ___
 $code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	mov	-8(%rsi),%rbx
+.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: ret +.cfi_endproc .size ${func}_xop,.-${func}_xop ___ ###################################################################### @@ -616,15 +664,23 @@ $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: +.cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -640,7 +696,8 @@ ${func}_avx: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -831,6 +888,7 @@ $code.=<<___; mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -847,15 +905,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size ${func}_avx,.-${func}_avx ___ @@ -863,7 +929,7 @@ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; @@ -912,15 +978,23 @@ $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: +.cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp @@ -937,7 +1011,8 @@ ${func}_avx2: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -1015,7 +1090,23 @@ $code.=<<___; vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) +___ +$code.=<<___ if (!$win64); +# temporarily use %rsi as frame pointer + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 +___ +$code.=<<___; lea -$PUSH8(%rsp),%rsp +___ +$code.=<<___ if (!$win64); +# the frame info is at $_rsp, but the stack is moving... 
+# so a second frame pointer is saved at -8(%rsp) +# that is in the red zone + mov %rsi,-8(%rsp) +.cfi_cfa_expression %rsp-8,deref,+8 +___ +$code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic @@ -1037,7 +1128,17 @@ my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; - &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + if (($j%2)==0) { + &lea ("%rsp","-$PUSH8(%rsp)"); +$code.=<<___ if (!$win64); +.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 +# copy secondary frame pointer to new location again at -8(%rsp) + pushq $PUSH8-8(%rsp) +.cfi_cfa_expression %rsp,deref,+8 + lea 8(%rsp),%rsp +.cfi_cfa_expression %rsp-8,deref,+8 +___ + } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); @@ -1163,50 +1264,347 @@ $code.=<<___; jbe .Loop_avx2 lea (%rsp),$Tbl +# temporarily use $Tbl as index to $_rsp +# this avoids the need to save a secondary frame pointer at -8(%rsp) +.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 .Ldone_avx2: - lea ($Tbl),%rsp - mov $_ivp,$ivp - mov $_rsp,%rsi + mov 16*$SZ+4*8($Tbl),$ivp + mov 16*$SZ+7*8($Tbl),%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); - movaps `$framesz+16*0`(%rsp),%xmm6 - movaps `$framesz+16*1`(%rsp),%xmm7 - movaps `$framesz+16*2`(%rsp),%xmm8 - movaps `$framesz+16*3`(%rsp),%xmm9 - movaps `$framesz+16*4`(%rsp),%xmm10 - movaps `$framesz+16*5`(%rsp),%xmm11 - movaps `$framesz+16*6`(%rsp),%xmm12 - movaps `$framesz+16*7`(%rsp),%xmm13 - movaps `$framesz+16*8`(%rsp),%xmm14 - movaps `$framesz+16*9`(%rsp),%xmm15 + movaps `$framesz+16*0`($Tbl),%xmm6 + movaps `$framesz+16*1`($Tbl),%xmm7 + movaps `$framesz+16*2`($Tbl),%xmm8 + movaps `$framesz+16*3`($Tbl),%xmm9 + movaps `$framesz+16*4`($Tbl),%xmm10 + movaps `$framesz+16*5`($Tbl),%xmm11 + movaps `$framesz+16*6`($Tbl),%xmm12 + movaps `$framesz+16*7`($Tbl),%xmm13 + movaps `$framesz+16*8`($Tbl),%xmm14 + movaps `$framesz+16*9`($Tbl),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} +}} +{{ +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my ($rounds,$Tbl)=("%r11d","%rbx"); + +my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); +my @rndkey=("%xmm4","%xmm5"); +my $r=0; +my $sn=0; + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); +my @MSG=map("%xmm$_",(10..13)); + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + movups `16*$n`($in0),$in # load input + xorps $rndkey0,$in +___ + $code.=<<___ if ($n); + movups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + xorps $in,$iv + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Laesenclast$sn + movups `32+16*($k+0)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+1)-112`($key),$rndkey[0] + aesenc $rndkey[1],$iv + je .Laesenclast$sn + movups `32+16*($k+2)-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+3)-112`($key),$rndkey[0] + aesenc 
$rndkey[1],$iv +.Laesenclast$sn: + aesenclast $rndkey[0],$iv + movups 16-112($key),$rndkey[1] # forward reference + nop +___ + } else { + $code.=<<___; + movups `32+16*$k-112`($key),$rndkey[1] + aesenc $rndkey[0],$iv +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +if ($shaext) { +my $Tbl="%rax"; + +$code.=<<___; +.type ${func}_shaext,\@function,6 +.align 32 +${func}_shaext: +.cfi_startproc + mov `($win64?56:8)`(%rsp),$inp # load 7th argument +___ +$code.=<<___ if ($win64); + lea `-8-10*16`(%rsp),%rsp + movaps %xmm6,-8-10*16(%rax) + movaps %xmm7,-8-9*16(%rax) + movaps %xmm8,-8-8*16(%rax) + movaps %xmm9,-8-7*16(%rax) + movaps %xmm10,-8-6*16(%rax) + movaps %xmm11,-8-5*16(%rax) + movaps %xmm12,-8-4*16(%rax) + movaps %xmm13,-8-3*16(%rax) + movaps %xmm14,-8-2*16(%rax) + movaps %xmm15,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + mov 240($key),$rounds + sub $in0,$out + movups ($key),$rndkey0 # $key[0] + movups ($ivp),$iv # load IV + movups 16($key),$rndkey[0] # forward reference + lea 112($key),$key # size optimization + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + movdqa $ABEF,$ABEF_SAVE # offload +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + lea 0x40($inp),$inp +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256msg1 @MSG[1],@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + paddd $TMP,@MSG[0] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256msg1 @MSG[2],@MSG[1] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi +___ + &$aesenc(); +$code.=<<___; + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + paddd $TMP,@MSG[1] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { + &$aesenc() if (($r%10)==0); +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256msg1 @MSG[3],@MSG[2] +___ + &$aesenc(); +$code.=<<___; + sha256rnds2 $ABEF,$CDGH # 16-19... 
+	pshufd	\$0x0e,$Wi,$Wi
+	movdqa	@MSG[1],$TMP
+	palignr	\$4,@MSG[0],$TMP
+	paddd	$TMP,@MSG[2]
+___
+	&$aesenc();
+	&$aesenc() if ($r==19);
+$code.=<<___;
+	sha256rnds2	$CDGH,$ABEF
+___
+	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	movdqa	13*32-0x80($Tbl),$Wi
+	paddd	@MSG[0],$Wi
+	sha256msg2	@MSG[0],@MSG[1]
+	sha256msg1	@MSG[3],@MSG[2]
+___
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$ABEF,$CDGH		# 52-55
+	pshufd	\$0x0e,$Wi,$Wi
+	movdqa	@MSG[1],$TMP
+	palignr	\$4,@MSG[0],$TMP
+	paddd	$TMP,@MSG[2]
+___
+	&$aesenc();
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa	14*32-0x80($Tbl),$Wi
+	paddd	@MSG[1],$Wi
+	sha256msg2	@MSG[1],@MSG[2]
+	movdqa	$BSWAP,$TMP
+___
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$ABEF,$CDGH		# 56-59
+	pshufd	\$0x0e,$Wi,$Wi
+___
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$CDGH,$ABEF
+
+	movdqa	15*32-0x80($Tbl),$Wi
+	paddd	@MSG[2],$Wi
+___
+	&$aesenc();
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$ABEF,$CDGH		# 60-63
+	pshufd	\$0x0e,$Wi,$Wi
+___
+	&$aesenc();
+$code.=<<___;
+	sha256rnds2	$CDGH,$ABEF
+	#pxor	$CDGH,$rndkey0		# black magic
+___
+	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
+$code.=<<___;
+	#xorps	$CDGH,$rndkey0		# black magic
+	paddd	$CDGH_SAVE,$CDGH
+	paddd	$ABEF_SAVE,$ABEF
+
+	dec	$len
+	movups	$iv,48($out,$in0)	# write output
+	lea	64($in0),$in0
+	jnz	.Loop_shaext
+
+	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
+	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
+	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
+	punpckhqdq	$CDGH,$ABEF	# DCBA
+	palignr	\$8,$TMP,$CDGH		# HGFE
+
+	movups	$iv,($ivp)		# write IV
+	movdqu	$ABEF,($ctx)
+	movdqu	$CDGH,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	0*16(%rsp),%xmm6
+	movaps	1*16(%rsp),%xmm7
+	movaps	2*16(%rsp),%xmm8
+	movaps	3*16(%rsp),%xmm9
+	movaps	4*16(%rsp),%xmm10
+	movaps	5*16(%rsp),%xmm11
+	movaps	6*16(%rsp),%xmm12
+	movaps	7*16(%rsp),%xmm13
+	movaps	8*16(%rsp),%xmm14
+	movaps	9*16(%rsp),%xmm15
+	lea	8+10*16(%rsp),%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+	ret
+.cfi_endproc
+.size	${func}_shaext,.-${func}_shaext
+___
+}
 }}}}}
 
 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
+if ($win64 && $avx) {
 $rec="%rcx";
 $frame="%rdx";
 $context="%r8";
 $disp="%r9";
 
-$code.=<<___ if ($avx);
+$code.=<<___;
 .extern	__imp_RtlVirtualUnwind
 .type	se_handler,\@abi-omnipotent
 .align	16
@@ -1240,6 +1638,19 @@ se_handler:
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lin_prologue
 ___
+$code.=<<___ if ($shaext);
+	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
+	cmp	%r10,%rbx
+	jb	.Lnot_in_shaext
+
+	lea	(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	168(%rax),%rax		# adjust stack pointer
+	jmp	.Lin_prologue
+.Lnot_in_shaext:
+___
 $code.=<<___ if ($avx>1);
 	lea	.Lavx2_shortcut(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
 	jb	.Lnot_in_avx2
 
 	and	\$-256*$SZ,%rax
 	add	\$`2*$SZ*($rounds-8)`,%rax
 .Lnot_in_avx2:
 ___
 $code.=<<___;
 	mov	%rax,%rsi		# put aside Rsp
 	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
-	lea	48(%rax),%rax
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
 	mov	-32(%rax),%r13
 	mov	-40(%rax),%r14
 	mov	-48(%rax),%r15
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
 	mov	%r12,216($context)	# restore context->R12
 	mov	%r13,224($context)	# restore context->R13
 	mov	%r14,232($context)	# restore context->R14
 	mov	%r15,240($context)	# restore context->R15
 
-	lea	.Lepilogue(%rip),%r10
-	cmp	%r10,%rbx
-	jb	.Lin_prologue		# non-AVX code
-
 	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$20,%ecx
@@ -1330,7 +1736,12 @@ $code.=<<___ if ($avx>1);
 	.rva	.LSEH_end_${func}_avx2
 	.rva	.LSEH_info_${func}_avx2
 ___
-$code.=<<___ if ($avx);
+$code.=<<___ if ($shaext);
+	.rva	.LSEH_begin_${func}_shaext
+	.rva	.LSEH_end_${func}_shaext
+	.rva	.LSEH_info_${func}_shaext
+___
+$code.=<<___;
 .section	.xdata
 .align	8
 .LSEH_info_${func}_xop:
@@ -1349,8 +1760,47 @@ $code.=<<___ if ($avx>1);
 	.rva	se_handler
 	.rva	.Lprologue_avx2,.Lepilogue_avx2	# HandlerData[]
 ___
+$code.=<<___ if ($shaext);
+.LSEH_info_${func}_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ +} + +#################################################################### +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + unshift @opcode,$rex|0x40 if($rex); +} + +{ + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + sub sha256op38 { + my $instr = shift; + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x0f,0x38); + rex(\@opcode,$2,$1); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } + } } $code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; print $code; -close STDOUT; +close STDOUT or die "error closing STDOUT: $!";
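# A standalone sketch (not part of the patch) of the byte-level encoding
# that rex() and sha256op38() above perform.  The SHA extension instructions
# (sha256rnds2/sha256msg1/sha256msg2, opcodes 0f 38 cb/cc/cd) predate many
# assemblers, so the generator emits them as raw .byte sequences: an optional
# REX prefix for %xmm8..%xmm15, the opcode bytes, and a register-form ModR/M
# byte.  Names here are local to the sketch; only the opcode table and the
# bit layout mirror the code above.
use strict;
use warnings;

my %opcodelet = (
	"sha256rnds2" => 0xcb,
	"sha256msg1"  => 0xcc,
	"sha256msg2"  => 0xcd );

sub encode_sha256op38 {
    my ($instr, $args) = @_;
    # pass through anything that is not a hand-encoded SHA instruction
    return "$instr\t$args" if (!defined($opcodelet{$instr}));
    my ($src, $dst) = $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
	or return "$instr\t$args";
    my @opcode = (0x0f, 0x38);
    my $rex = 0;
    $rex |= 0x04 if ($dst >= 8);	# REX.R extends the ModR/M reg field
    $rex |= 0x01 if ($src >= 8);	# REX.B extends the ModR/M r/m field
    unshift(@opcode, $rex|0x40) if ($rex);
    push(@opcode, $opcodelet{$instr});
    push(@opcode, 0xc0|($src&7)|(($dst&7)<<3));	# mod=11: register operands
    return ".byte\t".join(',', map { sprintf("0x%02x", $_) } @opcode);
}

# AT&T operand order is source,destination, and %xmm0 is an implicit third
# operand of sha256rnds2.  This prints ".byte 0x0f,0x38,0xcb,0xfc".
print encode_sha256op38("sha256rnds2", "%xmm4,%xmm7"), "\n";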