# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+#   "528B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# *native* byte order on current platform. See gcm128.c for working
# example...
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
-$output=shift;
-open STDOUT,">$output";
-
sub Zsmash() {
my $i=12;
my @args=@_;
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
- add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
- ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
eorpl $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
ldr $len,[sp,#32] @ re-load $len/end
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
.Loop2:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2
___
&Zsmash();