From 4b8736a22e758c371bc2f8b3534dc0c274acf42c Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 29 Mar 2016 10:02:45 +0200 Subject: [PATCH 1/1] crypto/poly1305: don't break carry chains. RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte --- crypto/poly1305/asm/poly1305-armv4.pl | 34 +++------- crypto/poly1305/asm/poly1305-armv8.pl | 18 +++-- crypto/poly1305/asm/poly1305-c64xplus.pl | 15 +++-- crypto/poly1305/asm/poly1305-ppc.pl | 11 ++-- crypto/poly1305/asm/poly1305-ppcfp.pl | 4 +- crypto/poly1305/asm/poly1305-s390x.pl | 7 +- crypto/poly1305/asm/poly1305-sparcv9.pl | 15 +++-- crypto/poly1305/asm/poly1305-x86.pl | 10 +-- crypto/poly1305/asm/poly1305-x86_64.pl | 24 ++++--- crypto/poly1305/poly1305.c | 84 ++++++++++++++++++++++-- 10 files changed, 146 insertions(+), 76 deletions(-) diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl index 4c4b417fcd..aa3f2280c6 100755 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ b/crypto/poly1305/asm/poly1305-armv4.pl @@ -10,10 +10,10 @@ # IALU(*)/gcc-4.4 NEON # # ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.30/+130% 2.96 +# Cortex-A5 6.35/+130% 2.96 # Cortex-A8 6.25/+115% 2.36 # Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.79/+85% 1.25(**) +# Cortex-A15 3.85/+85% 1.25(**) # Snapdragon S4 5.70/+100% 1.48(**) # # (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; @@ -313,7 +313,8 @@ poly1305_blocks: adds $h0,$h0,r1 adcs $h1,$h1,#0 adcs $h2,$h2,#0 - adc $h3,$h3,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 cmp r0,lr @ done yet? bhi .Loop @@ -735,9 +736,7 @@ poly1305_blocks_neon: .align 4 .Leven: subs $len,$len,#64 -# ifdef __thumb2__ it lo -# endif movlo $in2,$zeros vmov.i32 $H4,#1<<24 @ padbit, yes, always @@ -745,9 +744,7 @@ poly1305_blocks_neon: add $inp,$inp,#64 vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) add $in2,$in2,#64 -# ifdef __thumb2__ itt hi -# endif addhi $tbl1,$ctx,#(48+1*9*4) addhi $tbl0,$ctx,#(48+3*9*4) @@ -817,9 +814,7 @@ poly1305_blocks_neon: vmull.u32 $D4,$H4#hi,${R0}[1] subs $len,$len,#64 vmlal.u32 $D0,$H4#hi,${S1}[1] -# ifdef __thumb2__ it lo -# endif movlo $in2,$zeros vmlal.u32 $D3,$H2#hi,${R1}[1] vld1.32 ${S4}[1],[$tbl1,:32] @@ -946,9 +941,7 @@ poly1305_blocks_neon: add $tbl1,$ctx,#(48+0*9*4) add $tbl0,$ctx,#(48+1*9*4) adds $len,$len,#32 -# ifdef __thumb2__ it ne -# endif movne $len,#0 bne .Long_tail @@ -990,14 +983,10 @@ poly1305_blocks_neon: vmlal.u32 $D2,$H0#hi,$R2 vmlal.u32 $D3,$H0#hi,$R3 -# ifdef __thumb2__ - it ne -# endif + it ne addne $tbl1,$ctx,#(48+2*9*4) vmlal.u32 $D0,$H2#hi,$S3 -# ifdef __thumb2__ - it ne -# endif + it ne addne $tbl0,$ctx,#(48+3*9*4) vmlal.u32 $D4,$H1#hi,$R3 vmlal.u32 $D1,$H3#hi,$S3 @@ -1138,7 +1127,8 @@ poly1305_emit_neon: adds $h0,$h0,$g0 adcs $h1,$h1,#0 adcs $h2,$h2,#0 - adc $h3,$h3,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 adds $g0,$h0,#5 @ compare to modulus adcs $g1,$h1,#0 @@ -1147,24 +1137,16 @@ poly1305_emit_neon: adc $g4,$h4,#0 tst $g4,#4 @ did it carry/borrow? 
-# ifdef __thumb2__ it ne -# endif movne $h0,$g0 ldr $g0,[$nonce,#0] -# ifdef __thumb2__ it ne -# endif movne $h1,$g1 ldr $g1,[$nonce,#4] -# ifdef __thumb2__ it ne -# endif movne $h2,$g2 ldr $g2,[$nonce,#8] -# ifdef __thumb2__ it ne -# endif movne $h3,$g3 ldr $g3,[$nonce,#12] diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index f1359fd44a..2e1dae3df2 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -16,10 +16,10 @@ # IALU/gcc-4.9 NEON # # Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.63/+58% 1.47 +# Cortex-A53 2.69/+58% 1.47 # Cortex-A57 2.70/+7% 1.14 -# Denver 1.39/+50% 1.18(*) -# X-Gene 2.00/+68% 2.19 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.19 # # (*) estimate based on resources availability is less than 1.0, # i.e. measured result is worse than expected, presumably binary @@ -151,7 +151,8 @@ poly1305_blocks: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 - adc $h1,$d1,xzr + adcs $h1,$d1,xzr + adc $h2,$h2,xzr cbnz $len,.Loop @@ -235,7 +236,8 @@ poly1305_mult: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 - adc $h1,$d1,xzr + adcs $h1,$d1,xzr + adc $h2,$h2,xzr ret .size poly1305_mult,.-poly1305_mult @@ -310,7 +312,8 @@ poly1305_blocks_neon: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$h0,$t0 - adc $h1,$h1,xzr + adcs $h1,$h1,xzr + adc $h2,$h2,xzr #ifdef __ARMEB__ rev $d0,$d0 @@ -870,7 +873,8 @@ poly1305_emit_neon: add $d0,$d0,$h2,lsr#2 and $h2,$h2,#3 adds $h0,$h0,$d0 - adc $h1,$h1,xzr + adcs $h1,$h1,xzr + adc $h2,$h2,xzr adds $d0,$h0,#5 // compare to modulus adcs $d1,$h1,xzr diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl index f750a6e5eb..a7cf47d5f0 100755 --- a/crypto/poly1305/asm/poly1305-c64xplus.pl +++ b/crypto/poly1305/asm/poly1305-c64xplus.pl @@ -11,7 +11,7 @@ # # October 2015 # -# Performance is [incredible for a 32-bit processor] 1.76 cycles per +# Performance is [incredible for a 32-bit processor] 1.82 cycles per # processed byte. Comparison to compiler-generated code is problematic, # because results were observed to vary from 2.1 to 7.6 cpb depending # on compiler's ability to inline small functions. Compiler also @@ -128,7 +128,7 @@ _poly1305_blocks: || SWAP2 $D1,$D1 ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] -|| ADD $D0,B24,B31 ; B-copy of h0+inp[0] +|| ADD $D0,B24,B27 ; B-copy of h0+inp[0] || SWAP4 $D1,$D1 ADDU $D1,B25,$D1:$H1 ; h1+=inp[1] || MVK 3,$THREE @@ -140,12 +140,12 @@ _poly1305_blocks: loop?: MPY32U $H0,$R0,A17:A16 -|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16 +|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16 || ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1 || ADDU $D2,B28,$D2:$H2 ; h2+=inp[2] || SWAP2 $D3,$D3 MPY32U $H0,$R2,A19:A18 -|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18 +|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18 || ADD $D0,$H1,A24 ; A-copy of B24 || SWAP4 $D3,$D3 || [A2] SUB A2,1,A2 ; decrement loop counter @@ -227,8 +227,8 @@ loop?: SHRU $H4,2,B16 ; last reduction step || AND $H4,$THREE,$H4 -|| [A2] BNOP loop? ADDAW B16,B16,B16 ; 5*(h4>>2) +|| [A2] BNOP loop? ADDU B24,B16,B25:B24 ; B24 is h0 || [A2] SWAP2 $D2,$D2 @@ -236,8 +236,9 @@ loop?: || [A2] SWAP4 $D2,$D2 ADDU B28,B27,B29:B28 ; B28 is h2 || [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] -|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0] - ADD B30,B29,B30 ; B30 is h3 +|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0] + ADDU B30,B29,B31:B30 ; B30 is h3 + ADD B31,$H4,$H4 || [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1] ;;===== branch to loop? 
is taken here diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl index 46130c9327..07da9d10b6 100755 --- a/crypto/poly1305/asm/poly1305-ppc.pl +++ b/crypto/poly1305/asm/poly1305-ppc.pl @@ -17,11 +17,10 @@ # -m32 -m64 # # Freescale e300 14.8/+80% - -# PPC74x0 7.40/+60% - -# PPC970 7.20/+114% 3.51/+205% -# POWER6 3.96/+250% 2.02/+170% -# POWER7 3.67/+260% 1.87/+100% -# POWER8 - 2.13/+200% +# PPC74x0 7.60/+60% - +# PPC970 7.00/+114% 3.51/+205% +# POWER7 3.75/+260% 1.93/+100% +# POWER8 - 2.03/+200% # # Do we need floating-point implementation for PPC? Results presented # in poly1305_ieee754.c are tricky to compare to, because they are for @@ -212,6 +211,7 @@ $code.=<<___; add $t0,$t0,$t1 addc $h0,$d0,$t0 addze $h1,$d1 + addze $h2,$h2 bdnz Loop @@ -518,6 +518,7 @@ $code.=<<___; addze $h1,$h1 addze $h2,$h2 addze $h3,$h3 + addze $h4,$h4 bdnz Loop diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl index 061a556377..c8636a46ed 100755 --- a/crypto/poly1305/asm/poly1305-ppcfp.pl +++ b/crypto/poly1305/asm/poly1305-ppcfp.pl @@ -15,8 +15,8 @@ # and improvement coefficients relative to gcc-generated code. # # Freescale e300 9.78/+30% -# PPC74x0 7.08/+50% -# PPC970 6.24/+80% +# PPC74x0 6.92/+50% +# PPC970 6.03/+80% # POWER7 3.50/+30% # POWER8 3.75/+10% diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl index 49b3f79f1d..141ba8d0bd 100755 --- a/crypto/poly1305/asm/poly1305-s390x.pl +++ b/crypto/poly1305/asm/poly1305-s390x.pl @@ -11,7 +11,7 @@ # # June 2015 # -# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated +# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated # code. For older compiler improvement coefficient is >3x, because # then base 2^64 and base 2^32 implementations are compared. # @@ -138,11 +138,12 @@ poly1305_blocks: ngr $h0,$h2 srlg $t0,$h2,2 algr $h0,$t0 + lghi $t1,3 + ngr $h2,$t1 algr $h0,$d0lo - lghi $t1,3 alcgr $h1,$d1hi # $d1hi is still zero - ngr $h2,$t1 + alcgr $h2,$d1hi # $d1hi is still zero brct$g $len,.Loop diff --git a/crypto/poly1305/asm/poly1305-sparcv9.pl b/crypto/poly1305/asm/poly1305-sparcv9.pl index 5452887981..497e27097d 100755 --- a/crypto/poly1305/asm/poly1305-sparcv9.pl +++ b/crypto/poly1305/asm/poly1305-sparcv9.pl @@ -16,10 +16,10 @@ # # IALU(*) FMA # -# UltraSPARC III 11.9(**) -# SPARC T3 7.85 -# SPARC T4 1.67(***) 6.55 -# SPARC64 X 5.54 3.64 +# UltraSPARC III 12.3(**) +# SPARC T3 7.92 +# SPARC T4 1.70(***) 6.55 +# SPARC64 X 5.60 3.64 # # (*) Comparison to compiler-generated code is really problematic, # because latter's performance varies too much depending on too @@ -251,8 +251,9 @@ poly1305_blocks: addcc $t0,$d0,$h0 addccc %g0,$h1,$h1 addccc %g0,$h2,$h2 + addccc %g0,$h3,$h3 brnz,pt $len,.Loop - addc %g0,$h3,$h3 + addc %g0,$h4,$h4 st $h1,[$ctx+0] ! store hash value st $h0,[$ctx+4] @@ -295,6 +296,7 @@ poly1305_blocks_vis3: neg $shr,$shl srlx $R1,2,$S1 + b .Loop_vis3 add $R1,$S1,$S1 .Loop_vis3: @@ -342,8 +344,9 @@ poly1305_blocks_vis3: add $T1,$T0,$T0 addcc $T0,$D0,$H0 + addxccc %g0,$D1,$H1 brnz,pt $len,.Loop_vis3 - addxc %g0,$D1,$H1 + addxc %g0,$H2,$H2 stx $H0,[$ctx+0] ! 
store hash value stx $H1,[$ctx+8] diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl index 01c3cbcda9..97d0a81bea 100755 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ b/crypto/poly1305/asm/poly1305-x86.pl @@ -299,6 +299,7 @@ if ($sse2) { &adc ("ebx",0); &adc ("ecx",0); &adc ("esi",0); + &adc ("edi",0); &cmp ("ebp",&wparam(2)); # done yet? &jne (&label("loop")); @@ -1166,11 +1167,12 @@ my $addr = shift; &shr ("edi",2); &lea ("ebp",&DWP(0,"edi","edi",4)); # *5 &mov ("edi",&wparam(1)); # output - add ("eax","ebp"); + &add ("eax","ebp"); &mov ("ebp",&wparam(2)); # key - adc ("ebx",0); - adc ("ecx",0); - adc ("edx",0); + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("edx",0); + &adc ("esi",0); &movd ($D0,"eax"); # offload original hash value &add ("eax",5); # compare to modulus diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index 8977d563a2..7d676119a2 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -15,16 +15,16 @@ # measured with rdtsc at fixed clock frequency. # # IALU/gcc-4.8(*) AVX(**) AVX2 -# P4 4.90/+120% - -# Core 2 2.39/+90% - -# Westmere 1.86/+120% - +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.10/+175% 1.11 0.65 -# Skylake 1.12/+120% 0.96 0.51 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake 1.13/+120% 0.96 0.51 # Silvermont 2.83/+95% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - -# Bulldozer 2.21/+130% 0.97 +# Bulldozer 2.30/+130% 0.97 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors, in both cases we are comparing to @@ -114,6 +114,7 @@ $code.=<<___; add $d3,%rax add %rax,$h0 adc \$0,$h1 + adc \$0,$h2 ___ } @@ -184,8 +185,8 @@ $code.=<<___; .align 32 poly1305_blocks: .Lblocks: - sub \$16,$len # too short? - jc .Lno_data + shr \$4,$len + jz .Lno_data # too short push %rbx push %rbp @@ -220,8 +221,8 @@ ___ &poly1305_iteration(); $code.=<<___; mov $r1,%rax - sub \$16,%r15 # len-=16 - jnc .Loop + dec %r15 # len-=16 + jnz .Loop mov $h0,0($ctx) # store hash value mov $h1,8($ctx) @@ -521,6 +522,7 @@ poly1305_blocks_avx: add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 + adc \$0,$h2 mov $s1,$r1 mov $s1,%rax @@ -1315,6 +1317,7 @@ poly1305_emit_avx: add %rcx,%rax add %rax,%r8 adc \$0,%r9 + adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus @@ -1407,6 +1410,7 @@ poly1305_blocks_avx2: add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 + adc \$0,$h2 mov $s1,$r1 mov $s1,%rax diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c index b500f2e7cb..6bec8b30f8 100644 --- a/crypto/poly1305/poly1305.c +++ b/crypto/poly1305/poly1305.c @@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) c = (h2 >> 2) + (h2 & ~3UL); h2 &= 3; h0 += c; - h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */ + h1 += (c = CONSTANT_TIME_CARRY(h0,c)); + h2 += CONSTANT_TIME_CARRY(h1,c); + /* + * Occasional overflows to 3rd bit of h2 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. 
+ */ inp += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; @@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16], h1 = st->h[1]; h2 = st->h[2]; - /* compute h + -p */ + /* compare to modulus by computing h + -p */ g0 = (u64)(t = (u128)h0 + 5); g1 = (u64)(t = (u128)h1 + (t >> 64)); g2 = h2 + (u64)(t >> 64); - /* if there was carry into 130th bit, h1:h0 = g1:g0 */ + /* if there was carry into 131st bit, h1:h0 = g1:g0 */ mask = 0 - (g2 >> 2); g0 &= mask; g1 &= mask; @@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) h0 += c; h1 += (c = CONSTANT_TIME_CARRY(h0,c)); h2 += (c = CONSTANT_TIME_CARRY(h1,c)); - h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */ + h3 += (c = CONSTANT_TIME_CARRY(h2,c)); + h4 += CONSTANT_TIME_CARRY(h3,c); + /* + * Occasional overflows to 3rd bit of h4 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ inp += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; @@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16], h3 = st->h[3]; h4 = st->h[4]; - /* compute h + -p */ + /* compare to modulus by computing h + -p */ g0 = (u32)(t = (u64)h0 + 5); g1 = (u32)(t = (u64)h1 + (t >> 32)); g2 = (u32)(t = (u64)h2 + (t >> 32)); g3 = (u32)(t = (u64)h3 + (t >> 32)); g4 = h4 + (u32)(t >> 32); - /* if there was carry into 130th bit, h3:h0 = g3:g0 */ + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ mask = 0 - (g4 >> 2); g0 &= mask; g1 &= mask; @@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = { "99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546", "2637408fe13086ea73f971e3425e2820" }, + /* + * test vectors from Hanno Böck + */ + { + "cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc" + "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc" + "ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc" + "cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc" + "ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc" + "ccccfffffff50000000000000000000000000000000000000000000000000000" + "00ffffffe7000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000719205a8521d" + "fc", + "7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc", + "8559b876eceed66eb37798c0457baff9" + }, + { + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000" + "00000000800264", + "e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "00bd1258978e205444c9aaaa82006fed" + }, + { + "02fc", + "0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c", + "06120c0c0c0c0c0c0c0c0c0c0c0c0c0c" + }, + { + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + 
"7b6e7b001300000000b300000000000000000000000000000000000000000000" + "f20000000000000000000000000000000000002000efff000900000000000000" + "0000000000100000000009000000640000000000000000000000001300000000" + "b300000000000000000000000000000000000000000000f20000000000000000" + "000000000000000000002000efff00090000000000000000007a000010000000" + "000900000064000000000000000000000000000000000000000000000000fc", + "00ff0000000000000000000000000000""00000000001e00000000000000007b7b", + "33205bbf9e9f8f7212ab9e2ab9b7e4a5" + }, + { + "7777777777777777777777777777777777777777777777777777777777777777" + "7777777777777777777777777777777777777777777777777777777777777777" + "777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac" + "ec0100acacac2caca2acacacacacacacacacacac64f2", + "0000007f0000007f0100002000000000""0000cf77777777777777777777777777", + "02ee7c8c546ddeb1a467e4c3981158b9" + }, /* * test vectors from Andrew Moon */ -- 2.34.1