# September 2011
#
-# Assembler helpers for Padlock engine.
+# Assembler helpers for Padlock engine.  See e_padlock-x86.pl for
+# details.
$flavour = shift;
$output = shift;
$code=".text\n";
-$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
+$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
$ctx="%rdx";
$out="%rdi";
ret
.size padlock_sha1_oneshot,.-padlock_sha1_oneshot
-.globl padlock_sha1
-.type padlock_sha1,\@function,3
+.globl padlock_sha1_blocks
+.type padlock_sha1_blocks,\@function,3
+# int padlock_sha1_blocks(void *ctx, const void *inp, size_t blocks)
+# SysV AMD64: rdi = ctx (SHA-1 state), rsi = inp, rdx = count.
+# rdx is copied to rcx as the repeat count for "rep xsha1"; %rax is
+# preloaded with -1, which distinguishes this multi-block entry from
+# the _oneshot variant above.  NOTE(review): exact %rax semantics are
+# per the VIA PadLock Programming Guide -- confirm there.
.align 16
-padlock_sha1:
+padlock_sha1_blocks:
mov \$-1,%rax
mov %rdx,%rcx
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
ret
-.size padlock_sha1,.-padlock_sha1
+.size padlock_sha1_blocks,.-padlock_sha1_blocks
.globl padlock_sha256_oneshot
.type padlock_sha256_oneshot,\@function,3
ret
.size padlock_sha256_oneshot,.-padlock_sha256_oneshot
-.globl padlock_sha256
-.type padlock_sha256,\@function,3
+.globl padlock_sha256_blocks
+.type padlock_sha256_blocks,\@function,3
+# int padlock_sha256_blocks(void *ctx, const void *inp, size_t blocks)
+# SysV AMD64: rdi = ctx (SHA-256 state), rsi = inp, rdx = count.
+# rdx is copied to rcx as the repeat count for "rep xsha256"; %rax is
+# preloaded with -1, mirroring padlock_sha1_blocks.  NOTE(review):
+# exact %rax semantics are per the VIA PadLock Programming Guide.
.align 16
-padlock_sha256:
+padlock_sha256_blocks:
mov \$-1,%rax
mov %rdx,%rcx
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
ret
-.size padlock_sha256,.-padlock_sha256
+.size padlock_sha256_blocks,.-padlock_sha256_blocks
+
+# int padlock_sha512_blocks(void *ctx, const void *inp, size_t blocks)
+# SysV AMD64: rdi = ctx (SHA-512 state), rsi = inp, rdx = count.
+# NOTE(review): unlike the sha1/sha256 _blocks variants above, %rax is
+# not preloaded with -1 here -- confirm against the PadLock Programming
+# Guide that XSHA512 does not consume %rax the same way.
+.globl padlock_sha512_blocks
+.type padlock_sha512_blocks,\@function,3
+.align 16
+padlock_sha512_blocks:
+ mov %rdx,%rcx # rcx = repeat count for rep-prefixed insn
+ .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
+ ret
+.size padlock_sha512_blocks,.-padlock_sha512_blocks
___
sub generate_mode {
xor %eax,%eax
xor %ebx,%ebx
testl \$`1<<5`,($ctx) # align bit in control word
+ jnz .L${mode}_aligned
test \$0x0f,$out
setz %al # !out_misaligned
test \$0x0f,$inp
neg %rax
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
lea (%rax,%rbp),%rsp
+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ neg %eax
+ and \$`$PADLOCK_CHUNK/16-1`,%eax
+ jz .L${mode}_loop
+ shl \$4,%eax
+ cmp %rax,$len
+ cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
+___
+$code.=<<___;
jmp .L${mode}_loop
.align 16
.L${mode}_loop:
+ cmp $len,$chunk # ctr32 artefact
+ cmova $len,$chunk # ctr32 artefact
mov $out,%r8 # save parameters
mov $inp,%r9
mov $len,%r10
movdqa (%rax),%xmm0
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ test \$0xffff0000,%eax
+ jnz .L${mode}_no_corr
+ bswap %eax
+ add \$0x10000,%eax
+ bswap %eax
+ mov %eax,-4($ctx)
+.L${mode}_no_corr:
+___
$code.=<<___;
mov %r8,$out # restore paramters
mov %r11,$chunk
.align 16
.L${mode}_aligned:
+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ mov \$`16*0x10000`,$chunk
+ bswap %eax
+ cmp $len,$chunk
+ cmova $len,$chunk
+ neg %eax
+ and \$0xffff,%eax
+ jz .L${mode}_aligned_loop
+ shl \$4,%eax
+ cmp %rax,$len
+ cmova %rax,$chunk # don't let counter cross 2^16
+ jmp .L${mode}_aligned_loop
+.align 16
+.L${mode}_aligned_loop:
+ cmp $len,$chunk
+ cmova $len,$chunk
+ mov $len,%r10 # save parameters
+ mov $chunk,$len
+ mov $chunk,%r11
+___
+$code.=<<___;
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
shr \$4,$len # len/=AES_BLOCK_SIZE
movdqa (%rax),%xmm0
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ add \$0x10000,%eax
+ bswap %eax
+ mov %eax,-4($ctx)
+
+ mov %r11,$chunk # restore parameters
+ mov %r10,$len
+ sub $chunk,$len
+ mov \$`16*0x10000`,$chunk
+ jnz .L${mode}_aligned_loop
+___
$code.=<<___;
.L${mode}_exit:
mov \$1,%eax
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
-&generate_mode("ctr16",0xd8);
+&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
$code.=<<___;
.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"