# ====================================================================
#
# June 2015
-#
+#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
# Denver 4.50/+82% 2.63 2.67(*)
# X-Gene 9.50/+46% 8.82 8.89(*)
# Mongoose 8.00/+44% 3.64 3.25
+# Kryo 8.17/+50% 4.83 4.65
#
# (*) it's expected that doubling interleave factor doesn't help
# all processors, only those with higher NEON latency and
b.ne ChaCha20_neon
.Lshort:
+ .inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
mov $ctr,#10
subs $len,$len,#64
.Loop:
- sub $ctr,$ctr,#1
+ sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+ .inst 0xd50323bf // autiasp
.Labort:
ret
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+ .inst 0xd50323bf // autiasp
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
+ .inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+ .inst 0xd50323bf // autiasp
ret
.Ltail_neon:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+ .inst 0xd50323bf // autiasp
ret
.size ChaCha20_neon,.-ChaCha20_neon
___
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
+ .inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+ .inst 0xd50323bf // autiasp
ret
.size ChaCha20_512_neon,.-ChaCha20_512_neon
___