+___
+$code.=<<___ if ($mode eq "ctr32");
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ neg %eax
+ and \$0xffff,%eax
+ mov \$`16*0x10000`,$chunk
+ shl \$4,%eax
+ cmovz $chunk,%rax
+ cmp %rax,$len
+ cmova %rax,$chunk # don't let counter cross 2^16
+ cmovbe $len,$chunk
+ jbe .L${mode}_aligned_skip
+
+.L${mode}_aligned_loop:
+ mov $len,%r10 # save parameters
+ mov $chunk,$len
+ mov $chunk,%r11
+
+ lea -16($ctx),%rax # ivp
+ lea 16($ctx),%rbx # key
+ shr \$4,$len # len/=AES_BLOCK_SIZE
+ .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
+
+ mov -4($ctx),%eax # pull 32-bit counter
+ bswap %eax
+ add \$0x10000,%eax
+ bswap %eax
+ mov %eax,-4($ctx)
+
+ mov %r10,$len # restore parameters
+ sub %r11,$len
+ mov \$`16*0x10000`,$chunk
+ jz .L${mode}_exit
+ cmp $chunk,$len
+ jae .L${mode}_aligned_loop
+
+.L${mode}_aligned_skip:
+___
+$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
+ lea ($inp,$len),%rbp
+ neg %rbp
+ and \$0xfff,%rbp # distance to page boundary
+ xor %eax,%eax
+ cmp \$$PADLOCK_PREFETCH{$mode},%rbp
+ mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
+ cmovae %rax,%rbp
+ and $len,%rbp # remainder
+ sub %rbp,$len
+ jz .L${mode}_aligned_tail
+___
+$code.=<<___;