From 4b8736a22e758c371bc2f8b3534dc0c274acf42c Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 29 Mar 2016 10:02:45 +0200 Subject: [PATCH 1/1] crypto/poly1305: don't break carry chains. RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte --- crypto/poly1305/asm/poly1305-armv4.pl | 34 +++------- crypto/poly1305/asm/poly1305-armv8.pl | 18 +++-- crypto/poly1305/asm/poly1305-c64xplus.pl | 15 +++-- crypto/poly1305/asm/poly1305-ppc.pl | 11 ++-- crypto/poly1305/asm/poly1305-ppcfp.pl | 4 +- crypto/poly1305/asm/poly1305-s390x.pl | 7 +- crypto/poly1305/asm/poly1305-sparcv9.pl | 15 +++-- crypto/poly1305/asm/poly1305-x86.pl | 10 +-- crypto/poly1305/asm/poly1305-x86_64.pl | 24 ++++--- crypto/poly1305/poly1305.c | 84 ++++++++++++++++++++++-- 10 files changed, 146 insertions(+), 76 deletions(-) diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl index 4c4b417fcd..aa3f2280c6 100755 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ b/crypto/poly1305/asm/poly1305-armv4.pl @@ -10,10 +10,10 @@ # IALU(*)/gcc-4.4 NEON # # ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.30/+130% 2.96 +# Cortex-A5 6.35/+130% 2.96 # Cortex-A8 6.25/+115% 2.36 # Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.79/+85% 1.25(**) +# Cortex-A15 3.85/+85% 1.25(**) # Snapdragon S4 5.70/+100% 1.48(**) # # (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; @@ -313,7 +313,8 @@ poly1305_blocks: adds $h0,$h0,r1 adcs $h1,$h1,#0 adcs $h2,$h2,#0 - adc $h3,$h3,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 cmp r0,lr @ done yet? bhi .Loop @@ -735,9 +736,7 @@ poly1305_blocks_neon: .align 4 .Leven: subs $len,$len,#64 -# ifdef __thumb2__ it lo -# endif movlo $in2,$zeros vmov.i32 $H4,#1<<24 @ padbit, yes, always @@ -745,9 +744,7 @@ poly1305_blocks_neon: add $inp,$inp,#64 vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) add $in2,$in2,#64 -# ifdef __thumb2__ itt hi -# endif addhi $tbl1,$ctx,#(48+1*9*4) addhi $tbl0,$ctx,#(48+3*9*4) @@ -817,9 +814,7 @@ poly1305_blocks_neon: vmull.u32 $D4,$H4#hi,${R0}[1] subs $len,$len,#64 vmlal.u32 $D0,$H4#hi,${S1}[1] -# ifdef __thumb2__ it lo -# endif movlo $in2,$zeros vmlal.u32 $D3,$H2#hi,${R1}[1] vld1.32 ${S4}[1],[$tbl1,:32] @@ -946,9 +941,7 @@ poly1305_blocks_neon: add $tbl1,$ctx,#(48+0*9*4) add $tbl0,$ctx,#(48+1*9*4) adds $len,$len,#32 -# ifdef __thumb2__ it ne -# endif movne $len,#0 bne .Long_tail @@ -990,14 +983,10 @@ poly1305_blocks_neon: vmlal.u32 $D2,$H0#hi,$R2 vmlal.u32 $D3,$H0#hi,$R3 -# ifdef __thumb2__ - it ne -# endif + it ne addne $tbl1,$ctx,#(48+2*9*4) vmlal.u32 $D0,$H2#hi,$S3 -# ifdef __thumb2__ - it ne -# endif + it ne addne $tbl0,$ctx,#(48+3*9*4) vmlal.u32 $D4,$H1#hi,$R3 vmlal.u32 $D1,$H3#hi,$S3 @@ -1138,7 +1127,8 @@ poly1305_emit_neon: adds $h0,$h0,$g0 adcs $h1,$h1,#0 adcs $h2,$h2,#0 - adc $h3,$h3,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 adds $g0,$h0,#5 @ compare to modulus adcs $g1,$h1,#0 @@ -1147,24 +1137,16 @@ poly1305_emit_neon: adc $g4,$h4,#0 tst $g4,#4 @ did it carry/borrow? 
-# ifdef __thumb2__ it ne -# endif movne $h0,$g0 ldr $g0,[$nonce,#0] -# ifdef __thumb2__ it ne -# endif movne $h1,$g1 ldr $g1,[$nonce,#4] -# ifdef __thumb2__ it ne -# endif movne $h2,$g2 ldr $g2,[$nonce,#8] -# ifdef __thumb2__ it ne -# endif movne $h3,$g3 ldr $g3,[$nonce,#12] diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index f1359fd44a..2e1dae3df2 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -16,10 +16,10 @@ # IALU/gcc-4.9 NEON # # Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.63/+58% 1.47 +# Cortex-A53 2.69/+58% 1.47 # Cortex-A57 2.70/+7% 1.14 -# Denver 1.39/+50% 1.18(*) -# X-Gene 2.00/+68% 2.19 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.19 # # (*) estimate based on resources availability is less than 1.0, # i.e. measured result is worse than expected, presumably binary @@ -151,7 +151,8 @@ poly1305_blocks: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 - adc $h1,$d1,xzr + adcs $h1,$d1,xzr + adc $h2,$h2,xzr cbnz $len,.Loop @@ -235,7 +236,8 @@ poly1305_mult: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 - adc $h1,$d1,xzr + adcs $h1,$d1,xzr + adc $h2,$h2,xzr ret .size poly1305_mult,.-poly1305_mult @@ -310,7 +312,8 @@ poly1305_blocks_neon: and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$h0,$t0 - adc $h1,$h1,xzr + adcs $h1,$h1,xzr + adc $h2,$h2,xzr #ifdef __ARMEB__ rev $d0,$d0 @@ -870,7 +873,8 @@ poly1305_emit_neon: add $d0,$d0,$h2,lsr#2 and $h2,$h2,#3 adds $h0,$h0,$d0 - adc $h1,$h1,xzr + adcs $h1,$h1,xzr + adc $h2,$h2,xzr adds $d0,$h0,#5 // compare to modulus adcs $d1,$h1,xzr diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl index f750a6e5eb..a7cf47d5f0 100755 --- a/crypto/poly1305/asm/poly1305-c64xplus.pl +++ b/crypto/poly1305/asm/poly1305-c64xplus.pl @@ -11,7 +11,7 @@ # # October 2015 # -# Performance is [incredible for a 32-bit processor] 1.76 cycles per +# Performance is [incredible for a 32-bit processor] 1.82 cycles per # processed byte. Comparison to compiler-generated code is problematic, # because results were observed to vary from 2.1 to 7.6 cpb depending # on compiler's ability to inline small functions. Compiler also @@ -128,7 +128,7 @@ _poly1305_blocks: || SWAP2 $D1,$D1 ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] -|| ADD $D0,B24,B31 ; B-copy of h0+inp[0] +|| ADD $D0,B24,B27 ; B-copy of h0+inp[0] || SWAP4 $D1,$D1 ADDU $D1,B25,$D1:$H1 ; h1+=inp[1] || MVK 3,$THREE @@ -140,12 +140,12 @@ _poly1305_blocks: loop?: MPY32U $H0,$R0,A17:A16 -|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16 +|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16 || ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1 || ADDU $D2,B28,$D2:$H2 ; h2+=inp[2] || SWAP2 $D3,$D3 MPY32U $H0,$R2,A19:A18 -|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18 +|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18 || ADD $D0,$H1,A24 ; A-copy of B24 || SWAP4 $D3,$D3 || [A2] SUB A2,1,A2 ; decrement loop counter @@ -227,8 +227,8 @@ loop?: SHRU $H4,2,B16 ; last reduction step || AND $H4,$THREE,$H4 -|| [A2] BNOP loop? ADDAW B16,B16,B16 ; 5*(h4>>2) +|| [A2] BNOP loop? ADDU B24,B16,B25:B24 ; B24 is h0 || [A2] SWAP2 $D2,$D2 @@ -236,8 +236,9 @@ loop?: || [A2] SWAP4 $D2,$D2 ADDU B28,B27,B29:B28 ; B28 is h2 || [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0] -|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0] - ADD B30,B29,B30 ; B30 is h3 +|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0] + ADDU B30,B29,B31:B30 ; B30 is h3 + ADD B31,$H4,$H4 || [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1] ;;===== branch to loop? 
is taken here diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl index 46130c9327..07da9d10b6 100755 --- a/crypto/poly1305/asm/poly1305-ppc.pl +++ b/crypto/poly1305/asm/poly1305-ppc.pl @@ -17,11 +17,10 @@ # -m32 -m64 # # Freescale e300 14.8/+80% - -# PPC74x0 7.40/+60% - -# PPC970 7.20/+114% 3.51/+205% -# POWER6 3.96/+250% 2.02/+170% -# POWER7 3.67/+260% 1.87/+100% -# POWER8 - 2.13/+200% +# PPC74x0 7.60/+60% - +# PPC970 7.00/+114% 3.51/+205% +# POWER7 3.75/+260% 1.93/+100% +# POWER8 - 2.03/+200% # # Do we need floating-point implementation for PPC? Results presented # in poly1305_ieee754.c are tricky to compare to, because they are for @@ -212,6 +211,7 @@ $code.=<<___; add $t0,$t0,$t1 addc $h0,$d0,$t0 addze $h1,$d1 + addze $h2,$h2 bdnz Loop @@ -518,6 +518,7 @@ $code.=<<___; addze $h1,$h1 addze $h2,$h2 addze $h3,$h3 + addze $h4,$h4 bdnz Loop diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl index 061a556377..c8636a46ed 100755 --- a/crypto/poly1305/asm/poly1305-ppcfp.pl +++ b/crypto/poly1305/asm/poly1305-ppcfp.pl @@ -15,8 +15,8 @@ # and improvement coefficients relative to gcc-generated code. # # Freescale e300 9.78/+30% -# PPC74x0 7.08/+50% -# PPC970 6.24/+80% +# PPC74x0 6.92/+50% +# PPC970 6.03/+80% # POWER7 3.50/+30% # POWER8 3.75/+10% diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl index 49b3f79f1d..141ba8d0bd 100755 --- a/crypto/poly1305/asm/poly1305-s390x.pl +++ b/crypto/poly1305/asm/poly1305-s390x.pl @@ -11,7 +11,7 @@ # # June 2015 # -# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated +# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated # code. For older compiler improvement coefficient is >3x, because # then base 2^64 and base 2^32 implementations are compared. # @@ -138,11 +138,12 @@ poly1305_blocks: ngr $h0,$h2 srlg $t0,$h2,2 algr $h0,$t0 + lghi $t1,3 + ngr $h2,$t1 algr $h0,$d0lo - lghi $t1,3 alcgr $h1,$d1hi # $d1hi is still zero - ngr $h2,$t1 + alcgr $h2,$d1hi # $d1hi is still zero brct$g $len,.Loop diff --git a/crypto/poly1305/asm/poly1305-sparcv9.pl b/crypto/poly1305/asm/poly1305-sparcv9.pl index 5452887981..497e27097d 100755 --- a/crypto/poly1305/asm/poly1305-sparcv9.pl +++ b/crypto/poly1305/asm/poly1305-sparcv9.pl @@ -16,10 +16,10 @@ # # IALU(*) FMA # -# UltraSPARC III 11.9(**) -# SPARC T3 7.85 -# SPARC T4 1.67(***) 6.55 -# SPARC64 X 5.54 3.64 +# UltraSPARC III 12.3(**) +# SPARC T3 7.92 +# SPARC T4 1.70(***) 6.55 +# SPARC64 X 5.60 3.64 # # (*) Comparison to compiler-generated code is really problematic, # because latter's performance varies too much depending on too @@ -251,8 +251,9 @@ poly1305_blocks: addcc $t0,$d0,$h0 addccc %g0,$h1,$h1 addccc %g0,$h2,$h2 + addccc %g0,$h3,$h3 brnz,pt $len,.Loop - addc %g0,$h3,$h3 + addc %g0,$h4,$h4 st $h1,[$ctx+0] ! store hash value st $h0,[$ctx+4] @@ -295,6 +296,7 @@ poly1305_blocks_vis3: neg $shr,$shl srlx $R1,2,$S1 + b .Loop_vis3 add $R1,$S1,$S1 .Loop_vis3: @@ -342,8 +344,9 @@ poly1305_blocks_vis3: add $T1,$T0,$T0 addcc $T0,$D0,$H0 + addxccc %g0,$D1,$H1 brnz,pt $len,.Loop_vis3 - addxc %g0,$D1,$H1 + addxc %g0,$H2,$H2 stx $H0,[$ctx+0] ! 
store hash value stx $H1,[$ctx+8] diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl index 01c3cbcda9..97d0a81bea 100755 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ b/crypto/poly1305/asm/poly1305-x86.pl @@ -299,6 +299,7 @@ if ($sse2) { &adc ("ebx",0); &adc ("ecx",0); &adc ("esi",0); + &adc ("edi",0); &cmp ("ebp",&wparam(2)); # done yet? &jne (&label("loop")); @@ -1166,11 +1167,12 @@ my $addr = shift; &shr ("edi",2); &lea ("ebp",&DWP(0,"edi","edi",4)); # *5 &mov ("edi",&wparam(1)); # output - add ("eax","ebp"); + &add ("eax","ebp"); &mov ("ebp",&wparam(2)); # key - adc ("ebx",0); - adc ("ecx",0); - adc ("edx",0); + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("edx",0); + &adc ("esi",0); &movd ($D0,"eax"); # offload original hash value &add ("eax",5); # compare to modulus diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index 8977d563a2..7d676119a2 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -15,16 +15,16 @@ # measured with rdtsc at fixed clock frequency. # # IALU/gcc-4.8(*) AVX(**) AVX2 -# P4 4.90/+120% - -# Core 2 2.39/+90% - -# Westmere 1.86/+120% - +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.10/+175% 1.11 0.65 -# Skylake 1.12/+120% 0.96 0.51 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake 1.13/+120% 0.96 0.51 # Silvermont 2.83/+95% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - -# Bulldozer 2.21/+130% 0.97 +# Bulldozer 2.30/+130% 0.97 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors, in both cases we are comparing to @@ -114,6 +114,7 @@ $code.=<<___; add $d3,%rax add %rax,$h0 adc \$0,$h1 + adc \$0,$h2 ___ } @@ -184,8 +185,8 @@ $code.=<<___; .align 32 poly1305_blocks: .Lblocks: - sub \$16,$len # too short? - jc .Lno_data + shr \$4,$len + jz .Lno_data # too short push %rbx push %rbp @@ -220,8 +221,8 @@ ___ &poly1305_iteration(); $code.=<<___; mov $r1,%rax - sub \$16,%r15 # len-=16 - jnc .Loop + dec %r15 # len-=16 + jnz .Loop mov $h0,0($ctx) # store hash value mov $h1,8($ctx) @@ -521,6 +522,7 @@ poly1305_blocks_avx: add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 + adc \$0,$h2 mov $s1,$r1 mov $s1,%rax @@ -1315,6 +1317,7 @@ poly1305_emit_avx: add %rcx,%rax add %rax,%r8 adc \$0,%r9 + adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus @@ -1407,6 +1410,7 @@ poly1305_blocks_avx2: add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 + adc \$0,$h2 mov $s1,$r1 mov $s1,%rax diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c index b500f2e7cb..6bec8b30f8 100644 --- a/crypto/poly1305/poly1305.c +++ b/crypto/poly1305/poly1305.c @@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) c = (h2 >> 2) + (h2 & ~3UL); h2 &= 3; h0 += c; - h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */ + h1 += (c = CONSTANT_TIME_CARRY(h0,c)); + h2 += CONSTANT_TIME_CARRY(h1,c); + /* + * Occasional overflows to 3rd bit of h2 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. 
+ */ inp += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; @@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16], h1 = st->h[1]; h2 = st->h[2]; - /* compute h + -p */ + /* compare to modulus by computing h + -p */ g0 = (u64)(t = (u128)h0 + 5); g1 = (u64)(t = (u128)h1 + (t >> 64)); g2 = h2 + (u64)(t >> 64); - /* if there was carry into 130th bit, h1:h0 = g1:g0 */ + /* if there was carry into 131st bit, h1:h0 = g1:g0 */ mask = 0 - (g2 >> 2); g0 &= mask; g1 &= mask; @@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) h0 += c; h1 += (c = CONSTANT_TIME_CARRY(h0,c)); h2 += (c = CONSTANT_TIME_CARRY(h1,c)); - h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */ + h3 += (c = CONSTANT_TIME_CARRY(h2,c)); + h4 += CONSTANT_TIME_CARRY(h3,c); + /* + * Occasional overflows to 3rd bit of h4 are taken care of + * "naturally". If after this point we end up at the top of + * this loop, then the overflow bit will be accounted for + * in next iteration. If we end up in poly1305_emit, then + * comparison to modulus below will still count as "carry + * into 131st bit", so that properly reduced value will be + * picked in conditional move. + */ inp += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; @@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16], h3 = st->h[3]; h4 = st->h[4]; - /* compute h + -p */ + /* compare to modulus by computing h + -p */ g0 = (u32)(t = (u64)h0 + 5); g1 = (u32)(t = (u64)h1 + (t >> 32)); g2 = (u32)(t = (u64)h2 + (t >> 32)); g3 = (u32)(t = (u64)h3 + (t >> 32)); g4 = h4 + (u32)(t >> 32); - /* if there was carry into 130th bit, h3:h0 = g3:g0 */ + /* if there was carry into 131st bit, h3:h0 = g3:g0 */ mask = 0 - (g4 >> 2); g0 &= mask; g1 &= mask; @@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = { "99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546", "2637408fe13086ea73f971e3425e2820" }, + /* + * test vectors from Hanno Böck + */ + { + "cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc" + "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc" + "ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc" + "cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc" + "ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc" + "ccccfffffff50000000000000000000000000000000000000000000000000000" + "00ffffffe7000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000719205a8521d" + "fc", + "7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc", + "8559b876eceed66eb37798c0457baff9" + }, + { + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000" + "00000000800264", + "e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "00bd1258978e205444c9aaaa82006fed" + }, + { + "02fc", + "0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c", + "06120c0c0c0c0c0c0c0c0c0c0c0c0c0c" + }, + { + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b" + 
"7b6e7b001300000000b300000000000000000000000000000000000000000000" + "f20000000000000000000000000000000000002000efff000900000000000000" + "0000000000100000000009000000640000000000000000000000001300000000" + "b300000000000000000000000000000000000000000000f20000000000000000" + "000000000000000000002000efff00090000000000000000007a000010000000" + "000900000064000000000000000000000000000000000000000000000000fc", + "00ff0000000000000000000000000000""00000000001e00000000000000007b7b", + "33205bbf9e9f8f7212ab9e2ab9b7e4a5" + }, + { + "7777777777777777777777777777777777777777777777777777777777777777" + "7777777777777777777777777777777777777777777777777777777777777777" + "777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac" + "ec0100acacac2caca2acacacacacacacacacacac64f2", + "0000007f0000007f0100002000000000""0000cf77777777777777777777777777", + "02ee7c8c546ddeb1a467e4c3981158b9" + }, /* * test vectors from Andrew Moon */ -- 2.34.1