$code=".text\n";
+%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
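The power-of-2 constraint on $PADLOCK_CHUNK is stated only in the comment. A minimal sanity check, not part of the original module, could enforce it when the script runs:

die "PADLOCK_CHUNK must be a power of 2 between 32 and 2^20"
    if ($PADLOCK_CHUNK<32 || $PADLOCK_CHUNK>2**20
        || ($PADLOCK_CHUNK&($PADLOCK_CHUNK-1)));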
$ctx="%rdx";
.type padlock_sha1_oneshot,\@function,3
.align 16
padlock_sha1_oneshot:
- xor %rax,%rax
mov %rdx,%rcx
+ mov %rdi,%rdx # put aside %rdi
+ movups (%rdi),%xmm0 # copy-in context
+ sub \$128+8,%rsp # 128-byte staging area; +8 re-aligns %rsp to 16
+ mov 16(%rdi),%eax
+ movaps %xmm0,(%rsp)
+ mov %rsp,%rdi
+ mov %eax,16(%rsp)
+ xor %rax,%rax # %rax=0 denotes one-shot operation
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
+ movaps (%rsp),%xmm0
+ mov 16(%rsp),%eax
+ add \$128+8,%rsp
+ movups %xmm0,(%rdx) # copy-out context
+ mov %eax,16(%rdx)
ret
.size padlock_sha1_oneshot,.-padlock_sha1_oneshot
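The .byte sequences in these routines hand-assemble PadLock's two-byte-opcode extensions, which contemporary assemblers could not encode by mnemonic. A hypothetical helper, not present in this module, that reproduces the encodings from the ModR/M bytes appearing in this file (0xc8, 0xd0, 0xe0):

sub padlock_sha_insn {
    my $mnemonic = shift;
    my %opcodelet = (xsha1=>0xc8, xsha256=>0xd0, xsha512=>0xe0);
    sprintf(".byte 0xf3,0x0f,0xa6,0x%02x # rep %s",
            $opcodelet{$mnemonic}, $mnemonic);
}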
.type padlock_sha1_blocks,\@function,3
.align 16
padlock_sha1_blocks:
- mov \$-1,%rax
mov %rdx,%rcx
+ mov %rdi,%rdx # put aside %rdi
+ movups (%rdi),%xmm0 # copy-in context
+ sub \$128+8,%rsp
+ mov 16(%rdi),%eax
+ movaps %xmm0,(%rsp)
+ mov %rsp,%rdi
+ mov %eax,16(%rsp)
+ mov \$-1,%rax # %rax=-1 denotes multi-block operation
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
+ movaps (%rsp),%xmm0
+ mov 16(%rsp),%eax
+ add \$128+8,%rsp
+ movups %xmm0,(%rdx) # copy-out context
+ mov %eax,16(%rdx)
ret
.size padlock_sha1_blocks,.-padlock_sha1_blocks
.type padlock_sha256_oneshot,\@function,3
.align 16
padlock_sha256_oneshot:
- xor %rax,%rax
mov %rdx,%rcx
+ mov %rdi,%rdx # put aside %rdi
+ movups (%rdi),%xmm0 # copy-in context
+ sub \$128+8,%rsp
+ movups 16(%rdi),%xmm1
+ movaps %xmm0,(%rsp)
+ mov %rsp,%rdi
+ movaps %xmm1,16(%rsp)
+ xor %rax,%rax
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
+ movaps (%rsp),%xmm0
+ movaps 16(%rsp),%xmm1
+ add \$128+8,%rsp
+ movups %xmm0,(%rdx) # copy-out context
+ movups %xmm1,16(%rdx)
ret
.size padlock_sha256_oneshot,.-padlock_sha256_oneshot
.type padlock_sha256_blocks,\@function,3
.align 16
padlock_sha256_blocks:
- mov \$-1,%rax
mov %rdx,%rcx
+ mov %rdi,%rdx # put aside %rdi
+ movups (%rdi),%xmm0 # copy-in context
+ sub \$128+8,%rsp
+ movups 16(%rdi),%xmm1
+ movaps %xmm0,(%rsp)
+ mov %rsp,%rdi
+ movaps %xmm1,16(%rsp)
+ mov \$-1,%rax
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
+ movaps (%rsp),%xmm0
+ movaps 16(%rsp),%xmm1
+ add \$128+8,%rsp
+ movups %xmm0,(%rdx) # copy-out context
+ movups %xmm1,16(%rdx)
ret
.size padlock_sha256_blocks,.-padlock_sha256_blocks
.type padlock_sha512_blocks,\@function,3
.align 16
padlock_sha512_blocks:
mov %rdx,%rcx
+ mov %rdi,%rdx # put aside %rdi
+ movups (%rdi),%xmm0 # copy-in context
+ sub \$128+8,%rsp
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm2
+ movups 48(%rdi),%xmm3
+ movaps %xmm0,(%rsp)
+ mov %rsp,%rdi
+ movaps %xmm1,16(%rsp)
+ movaps %xmm2,32(%rsp)
+ movaps %xmm3,48(%rsp)
.byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
+ movaps (%rsp),%xmm0
+ movaps 16(%rsp),%xmm1
+ movaps 32(%rsp),%xmm2
+ movaps 48(%rsp),%xmm3
+ add \$128+8,%rsp
+ movups %xmm0,(%rdx) # copy-out context
+ movups %xmm1,16(%rdx)
+ movups %xmm2,32(%rdx)
+ movups %xmm3,48(%rdx)
ret
.size padlock_sha512_blocks,.-padlock_sha512_blocks
___
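The four routines above are instances of one pattern: stage the caller's context on the stack (20 bytes for SHA-1, 32 for SHA-256, 64 for SHA-512), execute the PadLock instruction, copy the result back. A hypothetical generator sketch of the copy-in half, assuming a size that is a multiple of 16 plus an optional 4-byte tail:

sub copy_in_ctx {
    my $bytes = shift;	# 20, 32 or 64
    my $code = "";
    for (my $i=0; $i+16<=$bytes; $i+=16) {
        my $x = $i/16;
        $code .= " movups $i(%rdi),%xmm$x\n";	# unaligned load from caller
        $code .= " movaps %xmm$x,$i(%rsp)\n";	# aligned store to the frame
    }
    $code .= " mov ".($bytes&~15)."(%rdi),%eax\n".
             " mov %eax,".($bytes&~15)."(%rsp)\n" if ($bytes%16);
    $code;
}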
lea 16($ctx),$ctx # control word
xor %eax,%eax
xor %ebx,%ebx
+___
+# Formally speaking the correct condition is $len<=$margin and $inp+$margin
+# crossing a page boundary [with the next page unreadable]. But $inp can
+# be unaligned, in which case data can be copied to $out if the latter is
+# aligned, and then $out+$margin has to be checked instead. Covering all
+# cases appears more complicated than just copying short input...
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+ cmp \$$PADLOCK_MARGIN{$mode},$len
+ jbe .L${mode}_short
+___
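The condition described in the comment hinges on whether a $margin-byte read starting at a given pointer spills onto the next page. A minimal Perl model of that test, assuming 4KB pages (the page size is an assumption, not taken from this file):

sub crosses_page {
    my ($ptr, $margin) = @_;
    (($ptr & 0xfff) + $margin) > 0x1000;	# reaches into the following page
}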
+$code.=<<___;
testl \$`1<<5`,($ctx) # align bit in control word
jnz .L${mode}_aligned
test \$0x0f,$out
lea (%rax,%rbp),%rsp
___
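Per the comment above, a set bit 5 in the control word means the hardware copes with misaligned buffers itself, so the copy-and-realign path can be skipped. A trivial model of that probe (nothing beyond bit 5 is assumed about the layout):

sub hw_handles_misaligned {
    my $cword = shift;	# 32-bit control word at 16($ctx)
    ($cword >> 5) & 1;
}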
$code.=<<___ if ($mode eq "ctr32");
+.L${mode}_reenter:
mov -4($ctx),%eax # pull 32-bit counter
bswap %eax
neg %eax
mov \$$PADLOCK_CHUNK,$chunk
jnz .L${mode}_loop
- test \$0x0f,$out
- jz .L${mode}_done
+ cmp %rsp,%rbp # was a temporary frame allocated?
+ je .L${mode}_done
+
+ pxor %xmm0,%xmm0
+ lea (%rsp),%rax
+.L${mode}_bzero: # wipe staged data, 16 bytes at a time
+ movaps %xmm0,(%rax)
+ lea 16(%rax),%rax
+ cmp %rax,%rbp
+ ja .L${mode}_bzero
- mov %rbp,$len
- mov %rsp,$out
- sub %rsp,$len
- xor %rax,%rax
- shr \$3,$len
- .byte 0xf3,0x48,0xab # rep stosq
.L${mode}_done:
lea (%rbp),%rsp
jmp .L${mode}_exit
-
+___
+$code.=<<___ if ($PADLOCK_MARGIN{$mode});
+.align 16
+.L${mode}_short:
+ mov %rsp,%rbp
+ sub $len,%rsp
+ xor $chunk,$chunk
+.L${mode}_short_copy:
+ movups ($inp,$chunk),%xmm0
+ lea 16($chunk),$chunk
+ cmp $chunk,$len
+ movaps %xmm0,-16(%rsp,$chunk)
+ ja .L${mode}_short_copy
+ mov %rsp,$inp
+ mov $len,$chunk
+ jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
+___
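The short-input copy above always moves whole 16-byte chunks, so it executes ceil($len/16) iterations and may read up to 15 bytes past the input tail. A trivial model of the iteration count, for illustration only:

sub short_copy_iters {
    my $len = shift;
    int(($len + 15) / 16);	# one movups/movaps pair per chunk
}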
+$code.=<<___;
.align 16
.L${mode}_aligned:
___