+if ($shaext) {{{ # emit this routine only when $shaext (set outside this block) is true
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+# The emitted code loads the 20-byte state at $ctx, processes $num 64-byte blocks from $inp, and stores the state back; the loop tests $num only after a block is done, so $num must be non-zero on entry.
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); # state pointer, input pointer, count of 64-byte blocks
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); # xmm0-3: state words, E, alternate E, shuffle mask; xmm8-9: copies of the state taken per block for the final add
+my @MSG=map("%xmm$_",(4..7)); # xmm4-7: the four 16-byte quarters of the current block, reused for the message schedule
+
+$code.=<<___; # declare the 3-argument function plus the extra entry label _shaext_shortcut
+.type sha1_block_data_order_shaext,\@function,3
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64); # Win64 only: reserve stack and save %xmm6-%xmm9. NOTE(review): slots are addressed via %rax, which this block never sets -- presumably it holds the entry %rsp; confirm
+ lea `-8-4*16`(%rsp),%rsp
+ movaps %xmm6,-8-4*16(%rax)
+ movaps %xmm7,-8-3*16(%rax)
+ movaps %xmm8,-8-2*16(%rax)
+ movaps %xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___; # load state and first block, byte-swap the block, then the loop head: decrement $num and advance $inp only if blocks remain
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
+
+ movdqu ($inp),@MSG[0]
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ movdqu 0x10($inp),@MSG[1]
+ pshufd \$0b00011011,$E,$E # flip word order
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[1]
+ pshufb $BSWAP,@MSG[2]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[3]
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ dec $num
+ lea 0x40($inp),%r8 # next input block
+ paddd @MSG[0],$E
+ cmovne %r8,$inp
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) { # 8 passes, 2 sha1rnds4 each (rounds 0-63); the immediate int(n/5) steps from 0 to 3, changing every five sha1rnds4
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
+ sha1nexte @MSG[1],$E_
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+ sha1msg2 @MSG[3],@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
+ sha1nexte @MSG[2],$E
+ pxor @MSG[3],@MSG[1]
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); # rotate @MSG by two so the next pass starts at the old @MSG[2]
+}
+$code.=<<___; # rounds 64-79 interleaved with loading/byte-swapping the next block (on the last block $inp was not advanced, so the same block is re-read); jnz relies on the flags left by "dec" at the loop head
+ movdqu ($inp),@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[1],$E_
+ movdqu 0x10($inp),@MSG[1]
+ pshufb $BSWAP,@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[2],$E
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[1]
+
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[3],$E_
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[2]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $E_SAVE,$E
+ pshufb $BSWAP,@MSG[3]
+
+ paddd $ABCD_SAVE,$ABCD
+ movdqa $E,$E_SAVE # offload $E
+
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64); # Win64 only: restore %xmm6-%xmm9 from the slots written in the prologue and reload %rsp from %rax
+ movaps -8-4*16(%rax),%xmm6
+ movaps -8-3*16(%rax),%xmm7
+ movaps -8-2*16(%rax),%xmm8
+ movaps -8-1*16(%rax),%xmm9
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___; # return to the caller and record the symbol size
+ ret
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}} # end of if ($shaext)