X-Git-Url: https://git.openssl.org/?a=blobdiff_plain;f=crypto%2Fec%2Fcurve448%2Fp448%2Farch_neon%2Ff_impl.c;fp=crypto%2Fec%2Fcurve448%2Fp448%2Farch_neon%2Ff_impl.c;h=0000000000000000000000000000000000000000;hb=abcd22bf621b25e5db724b0ad9bcb4bcc189b1d3;hp=5e998f9f371081d8baebb4185d01c1b57a44218c;hpb=7324473f893ef135693cf983b415f19a0366d539;p=openssl.git

diff --git a/crypto/ec/curve448/p448/arch_neon/f_impl.c b/crypto/ec/curve448/p448/arch_neon/f_impl.c
deleted file mode 100644
index 5e998f9f37..0000000000
--- a/crypto/ec/curve448/p448/arch_neon/f_impl.c
+++ /dev/null
@@ -1,592 +0,0 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#include "f_field.h"
-
-static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused))
-xx_vaddup_u64(uint64x2_t x) {
-    __asm__ ("vadd.s64 %f0, %e0" : "+w"(x));
-    return x;
-}
-
-static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused))
-vrev128_s64(int64x2_t x) {
-    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
-    return x;
-}
-
-static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline))
-vrev128_u64(uint64x2_t x) {
-    __asm__ ("vswp.s64 %e0, %f0" : "+w"(x));
-    return x;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smlal (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smlal2 (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smull (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
-}
-
-static inline void __attribute__((gnu_inline,always_inline,unused))
-smull2 (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
-}
-
-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
-    #define _bl0 "q0"
-    #define _bl0_0 "d0"
-    #define _bl0_1 "d1"
-    #define _bh0 "q1"
-    #define _bh0_0 "d2"
-    #define _bh0_1 "d3"
-    #define _bs0 "q2"
-    #define _bs0_0 "d4"
-    #define _bs0_1 "d5"
-    #define _bl2 "q3"
-    #define _bl2_0 "d6"
-    #define _bl2_1 "d7"
-    #define _bh2 "q4"
-    #define _bh2_0 "d8"
-    #define _bh2_1 "d9"
-    #define _bs2 "q5"
-    #define _bs2_0 "d10"
-    #define _bs2_1 "d11"
-
-    #define _as0 "q6"
-    #define _as0_0 "d12"
-    #define _as0_1 "d13"
-    #define _as2 "q7"
-    #define _as2_0 "d14"
-    #define _as2_1 "d15"
-    #define _al0 "q8"
-    #define _al0_0 "d16"
-    #define _al0_1 "d17"
-    #define _ah0 "q9"
-    #define _ah0_0 "d18"
-    #define _ah0_1 "d19"
-    #define _al2 "q10"
-    #define _al2_0 "d20"
-    #define _al2_1 "d21"
-    #define _ah2 "q11"
-    #define _ah2_0 "d22"
-    #define _ah2_1 "d23"
-
-    #define _a0a "q12"
-    #define _a0a_0 "d24"
-    #define _a0a_1 "d25"
-    #define _a0b "q13"
-    #define _a0b_0 "d26"
-    #define _a0b_1 "d27"
-    #define _a1a "q14"
-    #define _a1a_0 "d28"
-    #define _a1a_1 "d29"
-    #define _a1b "q15"
-    #define _a1b_0 "d30"
-    #define _a1b_1 "d31"
-    #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t"
-    #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t"
-    #define VOP2(op,result,a) #op" "result", "a"\n\t"
-
-    int32x2_t *vc = (int32x2_t*) cs->limb;
-
-    __asm__ __volatile__(
-
-        "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t"
-        VOP3(vadd.i32,_as0,_al0,_ah0)
-
-        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
-        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1)
-        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0)
-
-        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
-        VOP3(vadd.i32,_bs2,_bl2,_bh2)
-
-        "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" "\n\t"
-        VOP3(vadd.i32,_as2,_al2,_ah2)
-
-        VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0)
-        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0)
-        VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0)
-        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0)
-
-        VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1)
-        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1)
-        VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1)
-        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1)
-
-        VOP2(vmov,_a0a,_a0b)
-        VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0)
-        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0)
-        VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0)
-        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0)
-
-        VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0)
-        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0)
-        VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0)
-        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1)
-        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1)
-        VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1)
-        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1)
-        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1)
-        VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1)
-        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1)
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-
-        VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0)
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0)
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0)
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1)
-        VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1)
-        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1)
-        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1)
-
-        VOP2(vmov,_a0b_1,_a0a_1)
-        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0)
-        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0)
-        VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0)
-        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0)
-
-        VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0)
-        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0)
-        VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0)
-        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1)
-        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1)
-        VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1)
-        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1)
-        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1)
-        VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1)
-        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1)
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0)
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
-        VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1)
-        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
-        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1)
-
-        VOP2(vmov,_a0b_1,_a0a_1)
-        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0)
-        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0)
-        VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0)
-        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0)
-
-        VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0)
-        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0)
-        VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0)
-        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1)
-        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1)
-        VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1)
-        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1)
-        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1)
-        VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1)
-        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1)
-
-        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-        VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0)
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0)
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0)
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1)
-        VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1)
-        VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1)
-        VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1)
-
-        VOP2(vmov,_a0b_1,_a0a_1)
-        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0)
-        VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0)
-        VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0)
-        VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0)
-
-        VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0)
-        VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0)
-        VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0)
-        VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1)
-        VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1)
-        VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1)
-        VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1)
-        VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1)
-        VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1)
-        VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1)
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a0a,_a0a,_a1b)
-
-        VOP2(vmovn.i64,_a0b_1,_a0a)
-        VOP3(vsra.u64,_a1a,_a0a,"#28")
-
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-
-        VOP2(vswp,_a1a_0,_a1a_1)
-
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-        "sub %[c], #64" "\n\t"
-
-        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
-
-        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-        VOP2(vaddw.s32,_a1a,_a0a_0)
-        VOP2(vmovn.i64,_a0a_0,_a1a)
-        VOP2(vshr.s64,_a1a,"#28")
-
-        VOP2(vaddw.s32,_a1a,_a0a_1)
-        VOP2(vmovn.i64,_a0a_1,_a1a)
-        VOP2(vshr.s64,_a1a,"#28")
-
-        VOP2(vbic.i32,_a0a,"#0xf0000000")
-
-        VOP2(vaddw.s32,_a1a,_a0b_0)
-        VOP2(vmovn.i64,_a0b_0,_a1a)
-
-        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-
-        : [a]"+r"(as)
-        , [b]"+r"(bs)
-        , [c]"+r"(vc)
-
-        :: "q0","q1","q2","q3",
-           "q4","q5","q6","q7",
-           "q8","q9","q10","q11",
-           "q12","q13","q14","q15",
-           "memory"
-    );
-}
-
-void gf_sqr (gf_s *__restrict__ cs, const gf bs) {
-    int32x2_t *vc = (int32x2_t*) cs->limb;
-
-    __asm__ __volatile__ (
-        "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t"
-        VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */
-        VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */
-        VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */
-
-        "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t"
-        VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */
-        VOP2(vmov,_as2,_bs2)
-
-        VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */
-        VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */
-        VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */
-
-        VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */
-        VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */
-        VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 16 */
-
-        VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */
-        VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */
-        VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */
-        VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */
-
-        VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */
-        VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */
-        VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */
-        VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */
-        VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */
-        VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */
-        VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1)
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-
-        VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */
-        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */
-
-        VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */
-        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */
-        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */
-
-        VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */
-        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0)
-
-        VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */
-        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */
-        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */
-        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 14 */
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0)
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-        VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0)
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0)
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1)
-        VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1)
-        VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1)
-
-        VOP2(vmov,_a0b_1,_a0a_1)
-        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0)
-        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0)
-        VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0)
-
-        VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0)
-        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0)
-        VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1)
-        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1)
-        VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1)
-        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1)
-        VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1)
-
-        VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1)
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a1b,_a0a,_a1b)
-
-        VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0)
-        VOP2(vmovn.i64,_a0b_1,_a1b)
-        VOP3(vsra.u64,_a1a,_a1b,"#28")
-        VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0)
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-
-        VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1)
-        VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1)
-
-        VOP2(vmov,_a0b_1,_a0a_1)
-        VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0)
-        VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1)
-        VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0)
-        VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0)
-
-        VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0)
-        VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0)
-
-        VOP2(vmov,_a1a,_a1b)
-        VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1)
-        VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1)
-
-        VOP2(vswp,_a0b_1,_a0a_0)
-
-        VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1)
-        VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1)
-
-        VOP3(vsra.u64,_a0a,_a0b,"#28")
-        VOP2(vmovn.i64,_a0b_0,_a0b)
-
-        VOP2(vswp,_a1b_1,_a1a_0)
-        VOP3(vadd.i64,_a0a,_a0a,_a1b)
-
-        VOP2(vmovn.i64,_a0b_1,_a0a)
-        VOP3(vsra.u64,_a1a,_a0a,"#28")
-
-        VOP2(vbic.i32,_a0b,"#0xf0000000")
-
-        VOP2(vswp,_a1a_0,_a1a_1)
-
-        "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t"
-        "sub %[c], #64" "\n\t"
-
-        VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0)
-
-        "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-        VOP2(vaddw.s32,_a1a,_a0a_0)
-        VOP2(vmovn.i64,_a0a_0,_a1a)
-        VOP2(vshr.s64,_a1a,"#28")
-
-        VOP2(vaddw.s32,_a1a,_a0a_1)
-        VOP2(vmovn.i64,_a0a_1,_a1a)
-        VOP2(vshr.s64,_a1a,"#28")
-
-        VOP2(vbic.i32,_a0a,"#0xf0000000")
-
-        VOP2(vaddw.s32,_a1a,_a0b_0)
-        VOP2(vmovn.i64,_a0b_0,_a1a)
-
-        "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t"
-
-        : [b]"+r"(bs)
-        , [c]"+r"(vc)
-
-        :: "q0","q1","q2","q3",
-           "q4","q5","q6","q7",
-           "q12","q13","q14","q15",
-           "memory"
-    );
-}
-
-void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
-    uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1};
-    assert(b<(1<<28));
-
-    uint64x2_t accum;
-    const uint32x2_t *va = (const uint32x2_t *) as->limb;
-    uint32x2_t *vo = (uint32x2_t *) cs->limb;
-    uint32x2_t vc, vn;
-    uint32x2_t vb = {b, 0};
-
-    vc = va[0];
-    accum = vmull_lane_u32(vc, vb, 0);
-    vo[0] = vmovn_u64(accum) & vmask;
-    accum = vshrq_n_u64(accum,28);
-
-    /* PERF: the right way to do this is to reduce behind, i.e.
-     * vmull + vmlal round 0
-     * vmull + vmlal round 1
-     * vmull + vmlal round 2
-     * vsraq round 0, 1
-     * vmull + vmlal round 3
-     * vsraq round 1, 2
-     * ...
-     */
-
-    int i;
-    for (i=1; i<8; i++) {
-        vn = va[i];
-        accum = vmlal_lane_u32(accum, vn, vb, 0);
-        vo[i] = vmovn_u64(accum) & vmask;
-        accum = vshrq_n_u64(accum,28);
-        vc = vn;
-    }
-
-    accum = xx_vaddup_u64(vrev128_u64(accum));
-    accum = vaddw_u32(accum, vo[0]);
-    vo[0] = vmovn_u64(accum) & vmask;
-
-    accum = vshrq_n_u64(accum,28);
-    vo[1] += vmovn_u64(accum);
-}
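
The loop at the end of gf_mulw_unsigned multiplies one pair of 28-bit limbs per iteration by the scalar b, masks each partial result back to 28 bits (the "& vmask" / "vbic #0xf0000000" pattern seen throughout the file) and carries the excess into the next pair, then folds the final carry back around using the shape of the prime, 2^448 = 2^224 + 1 (mod p). A minimal scalar sketch of that pattern follows. It is an editorial illustration only, not part of the deleted file, and it assumes a flat little-endian array of sixteen 28-bit limbs, whereas the NEON code keeps the limbs interleaved in vector registers; mulw_sketch, NLIMBS, LIMB_BITS and LIMB_MASK are invented names.

/*
 * Editorial sketch (assumption): scalar view of the multiply-by-small-scalar
 * pattern used by gf_mulw_unsigned, for a flat array of 16 x 28-bit limbs.
 */
#include <stdint.h>

#define NLIMBS    16
#define LIMB_BITS 28
#define LIMB_MASK ((1u << LIMB_BITS) - 1)

static void mulw_sketch(uint32_t r[NLIMBS], const uint32_t a[NLIMBS], uint32_t b)
{
    uint64_t accum = 0;                         /* running carry, like the accum vector */
    int i;

    for (i = 0; i < NLIMBS; i++) {
        accum += (uint64_t)a[i] * b;            /* vmull_lane_u32 / vmlal_lane_u32 */
        r[i] = (uint32_t)accum & LIMB_MASK;     /* vmovn_u64(accum) & vmask        */
        accum >>= LIMB_BITS;                    /* vshrq_n_u64(accum, 28)          */
    }
    /* 2^448 == 2^224 + 1 (mod p448), so the top carry wraps into limbs 0 and 8,
     * leaving a partially reduced result just as the NEON code does. */
    r[0] += (uint32_t)accum;
    r[8] += (uint32_t)accum;
}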