Add ECP_NISTZ256 by Shay Gueron, Intel Corp.
authorAndy Polyakov <appro@openssl.org>
Thu, 11 Sep 2014 22:37:41 +0000 (00:37 +0200)
committerAndy Polyakov <appro@openssl.org>
Thu, 11 Sep 2014 22:37:41 +0000 (00:37 +0200)
RT: 3149

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/ec/Makefile
crypto/ec/asm/ecp_nistz256-avx2.pl [new file with mode: 0755]
crypto/ec/asm/ecp_nistz256-x86_64.pl [new file with mode: 0755]
crypto/ec/ec.h
crypto/ec/ec_curve.c
crypto/ec/ec_err.c
crypto/ec/ecp_nistz256.c [new file with mode: 0644]
crypto/ec/ecp_nistz256_table.c [new file with mode: 0644]

index 50cda66..898e43d 100644 (file)
@@ -48,6 +48,12 @@ lib: $(LIBOBJ)
        $(RANLIB) $(LIB) || echo Never mind.
        @touch lib
 
+ecp_nistz256-x86_64.s: asm/ecp_nistz256-x86_64.pl
+       $(PERL) asm/ecp_nistz256-x86_64.pl $(PERLASM_SCHEME) > $@
+
+ecp_nistz256-avx2.s:   asm/ecp_nistz256-avx2.pl
+       $(PERL) asm/ecp_nistz256-avx2.pl $(PERLASM_SCHEME) > $@
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/ec/asm/ecp_nistz256-avx2.pl b/crypto/ec/asm/ecp_nistz256-avx2.pl
new file mode 100755 (executable)
index 0000000..4c220aa
--- /dev/null
@@ -0,0 +1,2093 @@
+#!/usr/bin/env perl
+
+##############################################################################
+#                                                                            #
+# Copyright 2014 Intel Corporation                                           #
+#                                                                            #
+# Licensed under the Apache License, Version 2.0 (the "License");            #
+# you may not use this file except in compliance with the License.           #
+# You may obtain a copy of the License at                                    #
+#                                                                            #
+#    http://www.apache.org/licenses/LICENSE-2.0                              #
+#                                                                            #
+# Unless required by applicable law or agreed to in writing, software        #
+# distributed under the License is distributed on an "AS IS" BASIS,          #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
+# See the License for the specific language governing permissions and        #
+# limitations under the License.                                             #
+#                                                                            #
+##############################################################################
+#                                                                            #
+#  Developers and authors:                                                   #
+#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
+#  (1) Intel Corporation, Israel Development Center                          #
+#  (2) University of Haifa                                                   #
+#  Reference:                                                                #
+#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
+#                           256 Bit Primes"                                  #
+#                                                                            #
+##############################################################################
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+       $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+       $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=11);
+       $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+       my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
+       $avx = ($ver>=3.0) + ($ver>=3.01);
+       $addx = ($ver>=3.03);
+}
+
+if ($avx>=2) {{
+$digit_size = "\$29";
+$n_digits = "\$9";
+
+$code.=<<___;
+.text
+
+.align 64
+.LAVX2_AND_MASK:
+.LAVX2_POLY:
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00040000, 0x00040000, 0x00040000, 0x00040000
+.quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000
+.quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
+
+.LAVX2_POLY_x2:
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC
+.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
+.quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE
+.quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE
+.quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE
+.quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC
+
+.LAVX2_POLY_x8:
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8
+.quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC
+.quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC
+.quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC
+.quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8
+
+.LONE:
+.quad 0x00000020, 0x00000020, 0x00000020, 0x00000020
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff
+.quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+
+# RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL
+# Montgomery form (*2^256) to our format (*2^261)
+
+.LTO_MONT_AVX2:
+.quad 0x00000400, 0x00000400, 0x00000400, 0x00000400
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x00000003, 0x00000003, 0x00000003, 0x00000003
+
+.LFROM_MONT_AVX2:
+.quad 0x00000001, 0x00000001, 0x00000001, 0x00000001
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+.quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff
+.quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff
+.quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff
+.quad 0x00000000, 0x00000000, 0x00000000, 0x00000000
+
+.LIntOne:
+.long 1,1,1,1,1,1,1,1
+___
+
+{
+# This function receives a pointer to an array of four affine points
+# (X, Y, <1>) and rearranges the data for AVX2 execution, while
+# converting it to 2^29 radix redundant form
+
+my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3,
+    $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15));
+
+$code.=<<___;
+.globl ecp_nistz256_avx2_transpose_convert
+.type  ecp_nistz256_avx2_transpose_convert,\@function,2
+.align 64
+ecp_nistz256_avx2_transpose_convert:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -8-16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       # Load the data
+       vmovdqa         32*0(%rsi), $X0
+       lea             112(%rsi), %rax         # size optimization
+       vmovdqa         32*1(%rsi), $Y0
+       lea             .LAVX2_AND_MASK(%rip), %rdx
+       vmovdqa         32*2(%rsi), $X1
+       vmovdqa         32*3(%rsi), $Y1
+       vmovdqa         32*4-112(%rax), $X2
+       vmovdqa         32*5-112(%rax), $Y2
+       vmovdqa         32*6-112(%rax), $X3
+       vmovdqa         32*7-112(%rax), $Y3
+
+       # Transpose X and Y independently
+       vpunpcklqdq     $X1, $X0, $T0           # T0 = [B2 A2 B0 A0]
+       vpunpcklqdq     $X3, $X2, $T1           # T1 = [D2 C2 D0 C0]
+       vpunpckhqdq     $X1, $X0, $T2           # T2 = [B3 A3 B1 A1]
+       vpunpckhqdq     $X3, $X2, $T3           # T3 = [D3 C3 D1 C1]
+
+       vpunpcklqdq     $Y1, $Y0, $T4
+       vpunpcklqdq     $Y3, $Y2, $T5
+       vpunpckhqdq     $Y1, $Y0, $T6
+       vpunpckhqdq     $Y3, $Y2, $T7
+
+       vperm2i128      \$0x20, $T1, $T0, $X0   # X0 = [D0 C0 B0 A0]
+       vperm2i128      \$0x20, $T3, $T2, $X1   # X1 = [D1 C1 B1 A1]
+       vperm2i128      \$0x31, $T1, $T0, $X2   # X2 = [D2 C2 B2 A2]
+       vperm2i128      \$0x31, $T3, $T2, $X3   # X3 = [D3 C3 B3 A3]
+
+       vperm2i128      \$0x20, $T5, $T4, $Y0
+       vperm2i128      \$0x20, $T7, $T6, $Y1
+       vperm2i128      \$0x31, $T5, $T4, $Y2
+       vperm2i128      \$0x31, $T7, $T6, $Y3
+       vmovdqa         (%rdx), $T7
+
+       vpand           (%rdx), $X0, $T0        # out[0] = in[0] & mask;
+       vpsrlq          \$29, $X0, $X0
+       vpand           $T7, $X0, $T1           # out[1] = (in[0] >> shift) & mask;
+       vpsrlq          \$29, $X0, $X0
+       vpsllq          \$6, $X1, $T2
+       vpxor           $X0, $T2, $T2
+       vpand           $T7, $T2, $T2           # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
+       vpsrlq          \$23, $X1, $X1
+       vpand           $T7, $X1, $T3           # out[3] = (in[1] >> ((shift*3)%64)) & mask;
+       vpsrlq          \$29, $X1, $X1
+       vpsllq          \$12, $X2, $T4
+       vpxor           $X1, $T4, $T4
+       vpand           $T7, $T4, $T4           # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
+       vpsrlq          \$17, $X2, $X2
+       vpand           $T7, $X2, $T5           # out[5] = (in[2] >> ((shift*5)%64)) & mask;
+       vpsrlq          \$29, $X2, $X2
+       vpsllq          \$18, $X3, $T6
+       vpxor           $X2, $T6, $T6
+       vpand           $T7, $T6, $T6           # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
+       vpsrlq          \$11, $X3, $X3
+        vmovdqa        $T0, 32*0(%rdi)
+        lea            112(%rdi), %rax         # size optimization
+       vpand           $T7, $X3, $T0           # out[7] = (in[3] >> ((shift*7)%64)) & mask;
+       vpsrlq          \$29, $X3, $X3          # out[8] = (in[3] >> ((shift*8)%64)) & mask;
+
+       vmovdqa         $T1, 32*1(%rdi)
+       vmovdqa         $T2, 32*2(%rdi)
+       vmovdqa         $T3, 32*3(%rdi)
+       vmovdqa         $T4, 32*4-112(%rax)
+       vmovdqa         $T5, 32*5-112(%rax)
+       vmovdqa         $T6, 32*6-112(%rax)
+       vmovdqa         $T0, 32*7-112(%rax)
+       vmovdqa         $X3, 32*8-112(%rax)
+       lea             448(%rdi), %rax         # size optimization
+
+       vpand           $T7, $Y0, $T0           # out[0] = in[0] & mask;
+       vpsrlq          \$29, $Y0, $Y0
+       vpand           $T7, $Y0, $T1           # out[1] = (in[0] >> shift) & mask;
+       vpsrlq          \$29, $Y0, $Y0
+       vpsllq          \$6, $Y1, $T2
+       vpxor           $Y0, $T2, $T2
+       vpand           $T7, $T2, $T2           # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask;
+       vpsrlq          \$23, $Y1, $Y1
+       vpand           $T7, $Y1, $T3           # out[3] = (in[1] >> ((shift*3)%64)) & mask;
+       vpsrlq          \$29, $Y1, $Y1
+       vpsllq          \$12, $Y2, $T4
+       vpxor           $Y1, $T4, $T4
+       vpand           $T7, $T4, $T4           # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask;
+       vpsrlq          \$17, $Y2, $Y2
+       vpand           $T7, $Y2, $T5           # out[5] = (in[2] >> ((shift*5)%64)) & mask;
+       vpsrlq          \$29, $Y2, $Y2
+       vpsllq          \$18, $Y3, $T6
+       vpxor           $Y2, $T6, $T6
+       vpand           $T7, $T6, $T6           # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask;
+       vpsrlq          \$11, $Y3, $Y3
+        vmovdqa        $T0, 32*9-448(%rax)
+       vpand           $T7, $Y3, $T0           # out[7] = (in[3] >> ((shift*7)%64)) & mask;
+       vpsrlq          \$29, $Y3, $Y3          # out[8] = (in[3] >> ((shift*8)%64)) & mask;
+
+       vmovdqa         $T1, 32*10-448(%rax)
+       vmovdqa         $T2, 32*11-448(%rax)
+       vmovdqa         $T3, 32*12-448(%rax)
+       vmovdqa         $T4, 32*13-448(%rax)
+       vmovdqa         $T5, 32*14-448(%rax)
+       vmovdqa         $T6, 32*15-448(%rax)
+       vmovdqa         $T0, 32*16-448(%rax)
+       vmovdqa         $Y3, 32*17-448(%rax)
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  16*0(%rsp), %xmm6
+       movaps  16*1(%rsp), %xmm7
+       movaps  16*2(%rsp), %xmm8
+       movaps  16*3(%rsp), %xmm9
+       movaps  16*4(%rsp), %xmm10
+       movaps  16*5(%rsp), %xmm11
+       movaps  16*6(%rsp), %xmm12
+       movaps  16*7(%rsp), %xmm13
+       movaps  16*8(%rsp), %xmm14
+       movaps  16*9(%rsp), %xmm15
+       lea     8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert
+___
+}
+{
+################################################################################
+# This function receives a pointer to an array of four AVX2 formatted points
+# (X, Y, Z), converts the data to normal representation, and rearranges the data
+
+my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8));
+my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15));
+
+$code.=<<___;
+
+.globl ecp_nistz256_avx2_convert_transpose_back
+.type  ecp_nistz256_avx2_convert_transpose_back,\@function,2
+.align 32
+ecp_nistz256_avx2_convert_transpose_back:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -8-16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       mov     \$3, %ecx
+
+.Lconv_loop:
+       vmovdqa         32*0(%rsi), $D0
+       lea             160(%rsi), %rax         # size optimization
+       vmovdqa         32*1(%rsi), $D1
+       vmovdqa         32*2(%rsi), $D2
+       vmovdqa         32*3(%rsi), $D3
+       vmovdqa         32*4-160(%rax), $D4
+       vmovdqa         32*5-160(%rax), $D5
+       vmovdqa         32*6-160(%rax), $D6
+       vmovdqa         32*7-160(%rax), $D7
+       vmovdqa         32*8-160(%rax), $D8
+
+       vpsllq          \$29, $D1, $D1
+       vpsllq          \$58, $D2, $T0
+       vpaddq          $D1, $D0, $D0
+       vpaddq          $T0, $D0, $D0           # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2);
+
+       vpsrlq          \$6, $D2, $D2
+       vpsllq          \$23, $D3, $D3
+       vpsllq          \$52, $D4, $T1
+       vpaddq          $D2, $D3, $D3
+       vpaddq          $D3, $T1, $D1           # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64);
+
+       vpsrlq          \$12, $D4, $D4
+       vpsllq          \$17, $D5, $D5
+       vpsllq          \$46, $D6, $T2
+       vpaddq          $D4, $D5, $D5
+       vpaddq          $D5, $T2, $D2           # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64);
+
+       vpsrlq          \$18, $D6, $D6
+       vpsllq          \$11, $D7, $D7
+       vpsllq          \$40, $D8, $T3
+       vpaddq          $D6, $D7, $D7
+       vpaddq          $D7, $T3, $D3           # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64);
+
+       vpunpcklqdq     $D1, $D0, $T0           # T0 = [B2 A2 B0 A0]
+       vpunpcklqdq     $D3, $D2, $T1           # T1 = [D2 C2 D0 C0]
+       vpunpckhqdq     $D1, $D0, $T2           # T2 = [B3 A3 B1 A1]
+       vpunpckhqdq     $D3, $D2, $T3           # T3 = [D3 C3 D1 C1]
+
+       vperm2i128      \$0x20, $T1, $T0, $D0   # X0 = [D0 C0 B0 A0]
+       vperm2i128      \$0x20, $T3, $T2, $D1   # X1 = [D1 C1 B1 A1]
+       vperm2i128      \$0x31, $T1, $T0, $D2   # X2 = [D2 C2 B2 A2]
+       vperm2i128      \$0x31, $T3, $T2, $D3   # X3 = [D3 C3 B3 A3]
+
+       vmovdqa         $D0, 32*0(%rdi)
+       vmovdqa         $D1, 32*3(%rdi)
+       vmovdqa         $D2, 32*6(%rdi)
+       vmovdqa         $D3, 32*9(%rdi)
+
+       lea             32*9(%rsi), %rsi
+       lea             32*1(%rdi), %rdi
+
+       dec     %ecx
+       jnz     .Lconv_loop
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  16*0(%rsp), %xmm6
+       movaps  16*1(%rsp), %xmm7
+       movaps  16*2(%rsp), %xmm8
+       movaps  16*3(%rsp), %xmm9
+       movaps  16*4(%rsp), %xmm10
+       movaps  16*5(%rsp), %xmm11
+       movaps  16*6(%rsp), %xmm12
+       movaps  16*7(%rsp), %xmm13
+       movaps  16*8(%rsp), %xmm14
+       movaps  16*9(%rsp), %xmm15
+       lea     8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back
+___
+}
+{
+my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx");
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8));
+my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13));
+
+sub NORMALIZE {
+my $ret=<<___;
+       vpsrlq          $digit_size, $ACC0, $T0
+       vpand           $AND_MASK, $ACC0, $ACC0
+       vpaddq          $T0, $ACC1, $ACC1
+
+       vpsrlq          $digit_size, $ACC1, $T0
+       vpand           $AND_MASK, $ACC1, $ACC1
+       vpaddq          $T0, $ACC2, $ACC2
+
+       vpsrlq          $digit_size, $ACC2, $T0
+       vpand           $AND_MASK, $ACC2, $ACC2
+       vpaddq          $T0, $ACC3, $ACC3
+
+       vpsrlq          $digit_size, $ACC3, $T0
+       vpand           $AND_MASK, $ACC3, $ACC3
+       vpaddq          $T0, $ACC4, $ACC4
+
+       vpsrlq          $digit_size, $ACC4, $T0
+       vpand           $AND_MASK, $ACC4, $ACC4
+       vpaddq          $T0, $ACC5, $ACC5
+
+       vpsrlq          $digit_size, $ACC5, $T0
+       vpand           $AND_MASK, $ACC5, $ACC5
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpsrlq          $digit_size, $ACC6, $T0
+       vpand           $AND_MASK, $ACC6, $ACC6
+       vpaddq          $T0, $ACC7, $ACC7
+
+       vpsrlq          $digit_size, $ACC7, $T0
+       vpand           $AND_MASK, $ACC7, $ACC7
+       vpaddq          $T0, $ACC8, $ACC8
+       #vpand          $AND_MASK, $ACC8, $ACC8
+___
+    $ret;
+}
+
+sub STORE {
+my $ret=<<___;
+       vmovdqa         $ACC0, 32*0(%rdi)
+       lea             160(%rdi), %rax         # size optimization
+       vmovdqa         $ACC1, 32*1(%rdi)
+       vmovdqa         $ACC2, 32*2(%rdi)
+       vmovdqa         $ACC3, 32*3(%rdi)
+       vmovdqa         $ACC4, 32*4-160(%rax)
+       vmovdqa         $ACC5, 32*5-160(%rax)
+       vmovdqa         $ACC6, 32*6-160(%rax)
+       vmovdqa         $ACC7, 32*7-160(%rax)
+       vmovdqa         $ACC8, 32*8-160(%rax)
+___
+    $ret;
+}
+
+$code.=<<___;
+.type  avx2_normalize,\@abi-omnipotent
+.align 32
+avx2_normalize:
+       vpsrlq          $digit_size, $ACC0, $T0
+       vpand           $AND_MASK, $ACC0, $ACC0
+       vpaddq          $T0, $ACC1, $ACC1
+
+       vpsrlq          $digit_size, $ACC1, $T0
+       vpand           $AND_MASK, $ACC1, $ACC1
+       vpaddq          $T0, $ACC2, $ACC2
+
+       vpsrlq          $digit_size, $ACC2, $T0
+       vpand           $AND_MASK, $ACC2, $ACC2
+       vpaddq          $T0, $ACC3, $ACC3
+
+       vpsrlq          $digit_size, $ACC3, $T0
+       vpand           $AND_MASK, $ACC3, $ACC3
+       vpaddq          $T0, $ACC4, $ACC4
+
+       vpsrlq          $digit_size, $ACC4, $T0
+       vpand           $AND_MASK, $ACC4, $ACC4
+       vpaddq          $T0, $ACC5, $ACC5
+
+       vpsrlq          $digit_size, $ACC5, $T0
+       vpand           $AND_MASK, $ACC5, $ACC5
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpsrlq          $digit_size, $ACC6, $T0
+       vpand           $AND_MASK, $ACC6, $ACC6
+       vpaddq          $T0, $ACC7, $ACC7
+
+       vpsrlq          $digit_size, $ACC7, $T0
+       vpand           $AND_MASK, $ACC7, $ACC7
+       vpaddq          $T0, $ACC8, $ACC8
+       #vpand          $AND_MASK, $ACC8, $ACC8
+
+       ret
+.size  avx2_normalize,.-avx2_normalize
+
+.type  avx2_normalize_n_store,\@abi-omnipotent
+.align 32
+avx2_normalize_n_store:
+       vpsrlq          $digit_size, $ACC0, $T0
+       vpand           $AND_MASK, $ACC0, $ACC0
+       vpaddq          $T0, $ACC1, $ACC1
+
+       vpsrlq          $digit_size, $ACC1, $T0
+       vpand           $AND_MASK, $ACC1, $ACC1
+        vmovdqa        $ACC0, 32*0(%rdi)
+        lea            160(%rdi), %rax         # size optimization
+       vpaddq          $T0, $ACC2, $ACC2
+
+       vpsrlq          $digit_size, $ACC2, $T0
+       vpand           $AND_MASK, $ACC2, $ACC2
+        vmovdqa        $ACC1, 32*1(%rdi)
+       vpaddq          $T0, $ACC3, $ACC3
+
+       vpsrlq          $digit_size, $ACC3, $T0
+       vpand           $AND_MASK, $ACC3, $ACC3
+        vmovdqa        $ACC2, 32*2(%rdi)
+       vpaddq          $T0, $ACC4, $ACC4
+
+       vpsrlq          $digit_size, $ACC4, $T0
+       vpand           $AND_MASK, $ACC4, $ACC4
+        vmovdqa        $ACC3, 32*3(%rdi)
+       vpaddq          $T0, $ACC5, $ACC5
+
+       vpsrlq          $digit_size, $ACC5, $T0
+       vpand           $AND_MASK, $ACC5, $ACC5
+        vmovdqa        $ACC4, 32*4-160(%rax)
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpsrlq          $digit_size, $ACC6, $T0
+       vpand           $AND_MASK, $ACC6, $ACC6
+        vmovdqa        $ACC5, 32*5-160(%rax)
+       vpaddq          $T0, $ACC7, $ACC7
+
+       vpsrlq          $digit_size, $ACC7, $T0
+       vpand           $AND_MASK, $ACC7, $ACC7
+        vmovdqa        $ACC6, 32*6-160(%rax)
+       vpaddq          $T0, $ACC8, $ACC8
+       #vpand          $AND_MASK, $ACC8, $ACC8
+        vmovdqa        $ACC7, 32*7-160(%rax)
+        vmovdqa        $ACC8, 32*8-160(%rax)
+
+       ret
+.size  avx2_normalize_n_store,.-avx2_normalize_n_store
+
+################################################################################
+# void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.type  avx2_mul_x4,\@abi-omnipotent
+.align 32
+avx2_mul_x4:
+       lea     .LAVX2_POLY(%rip), %rax
+
+       vpxor   $ACC0, $ACC0, $ACC0
+       vpxor   $ACC1, $ACC1, $ACC1
+       vpxor   $ACC2, $ACC2, $ACC2
+       vpxor   $ACC3, $ACC3, $ACC3
+       vpxor   $ACC4, $ACC4, $ACC4
+       vpxor   $ACC5, $ACC5, $ACC5
+       vpxor   $ACC6, $ACC6, $ACC6
+       vpxor   $ACC7, $ACC7, $ACC7
+
+       vmovdqa 32*7(%rax), %ymm14
+       vmovdqa 32*8(%rax), %ymm15
+
+       mov     $n_digits, $itr
+       lea     -512($a_ptr), $a_ptr    # strategic bias to control u-op density
+       jmp     .Lavx2_mul_x4_loop
+
+.align 32
+.Lavx2_mul_x4_loop:
+       vmovdqa         32*0($b_ptr), $B
+       lea             32*1($b_ptr), $b_ptr
+
+       vpmuludq        32*0+512($a_ptr), $B, $T0
+       vpmuludq        32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW
+       vpaddq          $T0, $ACC0, $ACC0
+       vpmuludq        32*2+512($a_ptr), $B, $T0
+       vpaddq          $OVERFLOW, $ACC1, $ACC1
+        vpand          $AND_MASK, $ACC0, $Y
+       vpmuludq        32*3+512($a_ptr), $B, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC2
+       vpmuludq        32*4+512($a_ptr), $B, $T0
+       vpaddq          $OVERFLOW, $ACC3, $ACC3
+       vpmuludq        32*5+512($a_ptr), $B, $OVERFLOW
+       vpaddq          $T0, $ACC4, $ACC4
+       vpmuludq        32*6+512($a_ptr), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*7+512($a_ptr), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       # Skip some multiplications, optimizing for the constant poly
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*8+512($a_ptr), $B, $ACC8
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       .byte           0x67
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $OVERFLOW
+       .byte           0x67
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $T0
+       vpaddq          $OVERFLOW, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC7, $ACC6
+       vpaddq          $OVERFLOW, $ACC8, $ACC7
+
+       dec     $itr
+       jnz     .Lavx2_mul_x4_loop
+
+       vpxor   $ACC8, $ACC8, $ACC8
+
+       ret
+.size  avx2_mul_x4,.-avx2_mul_x4
+
+# Function optimized for the constant 1
+################################################################################
+# void avx2_mul_by1_x4(void* RESULTx4, void *Ax4);
+.type  avx2_mul_by1_x4,\@abi-omnipotent
+.align 32
+avx2_mul_by1_x4:
+       lea     .LAVX2_POLY(%rip), %rax
+
+       vpxor   $ACC0, $ACC0, $ACC0
+       vpxor   $ACC1, $ACC1, $ACC1
+       vpxor   $ACC2, $ACC2, $ACC2
+       vpxor   $ACC3, $ACC3, $ACC3
+       vpxor   $ACC4, $ACC4, $ACC4
+       vpxor   $ACC5, $ACC5, $ACC5
+       vpxor   $ACC6, $ACC6, $ACC6
+       vpxor   $ACC7, $ACC7, $ACC7
+       vpxor   $ACC8, $ACC8, $ACC8
+
+       vmovdqa 32*3+.LONE(%rip), %ymm14
+       vmovdqa 32*7+.LONE(%rip), %ymm15
+
+       mov     $n_digits, $itr
+       jmp     .Lavx2_mul_by1_x4_loop
+
+.align 32
+.Lavx2_mul_by1_x4_loop:
+       vmovdqa         32*0($a_ptr), $B
+       .byte           0x48,0x8d,0xb6,0x20,0,0,0       # lea   32*1($a_ptr), $a_ptr
+
+       vpsllq          \$5, $B, $OVERFLOW
+       vpmuludq        %ymm14, $B, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC3
+       .byte           0x67
+       vpmuludq        $AND_MASK, $B, $T0
+       vpand           $AND_MASK, $ACC0, $Y
+       vpaddq          $T0, $ACC4, $ACC4
+       vpaddq          $T0, $ACC5, $ACC5
+       vpaddq          $T0, $ACC6, $ACC6
+       vpsllq          \$23, $B, $T0
+
+       .byte           0x67,0x67
+       vpmuludq        %ymm15, $B, $OVERFLOW
+       vpsubq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+       vpaddq          $OVERFLOW, $ACC7, $ACC7
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       .byte           0x67,0x67
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $OVERFLOW
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        32*7(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC6, $ACC5
+       vpaddq          $T0, $ACC7, $ACC6
+       vpmuludq        32*8(%rax), $Y, $ACC7
+
+       dec     $itr
+       jnz     .Lavx2_mul_by1_x4_loop
+
+       ret
+.size  avx2_mul_by1_x4,.-avx2_mul_by1_x4
+
+################################################################################
+# void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.type  avx2_sqr_x4,\@abi-omnipotent
+.align 32
+avx2_sqr_x4:
+       lea             .LAVX2_POLY(%rip), %rax
+
+       vmovdqa         32*7(%rax), %ymm14
+       vmovdqa         32*8(%rax), %ymm15
+
+       vmovdqa         32*0($a_ptr), $B
+       vmovdqa         32*1($a_ptr), $ACC1
+       vmovdqa         32*2($a_ptr), $ACC2
+       vmovdqa         32*3($a_ptr), $ACC3
+       vmovdqa         32*4($a_ptr), $ACC4
+       vmovdqa         32*5($a_ptr), $ACC5
+       vmovdqa         32*6($a_ptr), $ACC6
+       vmovdqa         32*7($a_ptr), $ACC7
+       vpaddq          $ACC1, $ACC1, $ACC1     # 2*$ACC0..7
+       vmovdqa         32*8($a_ptr), $ACC8
+       vpaddq          $ACC2, $ACC2, $ACC2
+       vmovdqa         $ACC1, 32*0(%rcx)
+       vpaddq          $ACC3, $ACC3, $ACC3
+       vmovdqa         $ACC2, 32*1(%rcx)
+       vpaddq          $ACC4, $ACC4, $ACC4
+       vmovdqa         $ACC3, 32*2(%rcx)
+       vpaddq          $ACC5, $ACC5, $ACC5
+       vmovdqa         $ACC4, 32*3(%rcx)
+       vpaddq          $ACC6, $ACC6, $ACC6
+       vmovdqa         $ACC5, 32*4(%rcx)
+       vpaddq          $ACC7, $ACC7, $ACC7
+       vmovdqa         $ACC6, 32*5(%rcx)
+       vpaddq          $ACC8, $ACC8, $ACC8
+       vmovdqa         $ACC7, 32*6(%rcx)
+       vmovdqa         $ACC8, 32*7(%rcx)
+
+       #itr            1
+       vpmuludq        $B, $B, $ACC0
+       vpmuludq        $B, $ACC1, $ACC1
+        vpand          $AND_MASK, $ACC0, $Y
+       vpmuludq        $B, $ACC2, $ACC2
+       vpmuludq        $B, $ACC3, $ACC3
+       vpmuludq        $B, $ACC4, $ACC4
+       vpmuludq        $B, $ACC5, $ACC5
+       vpmuludq        $B, $ACC6, $ACC6
+        vpmuludq       $AND_MASK, $Y, $T0
+       vpmuludq        $B, $ACC7, $ACC7
+       vpmuludq        $B, $ACC8, $ACC8
+        vmovdqa        32*1($a_ptr), $B
+
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            2
+       vpmuludq        $B, $B, $OVERFLOW
+        vpand          $AND_MASK, $ACC0, $Y
+       vpmuludq        32*1(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC1, $ACC1
+       vpmuludq        32*2(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC2
+       vpmuludq        32*3(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC3, $ACC3
+       vpmuludq        32*4(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC4, $ACC4
+       vpmuludq        32*5(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*2($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            3
+       vpmuludq        $B, $B, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpmuludq        32*2(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC2
+       vpmuludq        32*3(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC3, $ACC3
+       vpmuludq        32*4(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC4, $ACC4
+       vpmuludq        32*5(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*3($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            4
+       vpmuludq        $B, $B, $OVERFLOW
+       vpmuludq        32*3(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC3, $ACC3
+       vpmuludq        32*4(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC4, $ACC4
+       vpmuludq        32*5(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*4($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            5
+       vpmuludq        $B, $B, $T0
+       vpmuludq        32*4(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC4, $ACC4
+       vpmuludq        32*5(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*5($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3+.LAVX2_POLY(%rip), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            6
+       vpmuludq        $B, $B, $OVERFLOW
+       vpmuludq        32*5(%rcx), $B, $T0
+       vpaddq          $OVERFLOW, $ACC5, $ACC5
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*6($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            7
+       vpmuludq        $B, $B, $T0
+       vpmuludq        32*6(%rcx), $B, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC6
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*7($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            8
+       vpmuludq        $B, $B, $OVERFLOW
+
+       vpmuludq        $AND_MASK, $Y, $T0
+        vpaddq         $OVERFLOW, $ACC7, $ACC7
+        vpmuludq       32*7(%rcx), $B, $ACC8
+        vmovdqa        32*8($a_ptr), $B
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+        vpand          $AND_MASK, $ACC0, $Y
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       #itr            9
+       vpmuludq        $B, $B, $ACC8
+
+       vpmuludq        $AND_MASK, $Y, $T0
+       vpaddq          $T0, $ACC0, $OVERFLOW
+       vpsrlq          $digit_size, $OVERFLOW, $OVERFLOW
+       vpaddq          $T0, $ACC1, $ACC0
+       vpaddq          $T0, $ACC2, $ACC1
+       vpmuludq        32*3(%rax), $Y, $T0
+       vpaddq          $OVERFLOW, $ACC0, $ACC0
+       vpaddq          $T0, $ACC3, $ACC2
+       vmovdqa         $ACC4, $ACC3
+       vpsllq          \$18, $Y, $T0
+       vmovdqa         $ACC5, $ACC4
+       vpmuludq        %ymm14, $Y, $OVERFLOW
+       vpaddq          $T0, $ACC6, $ACC5
+       vpmuludq        %ymm15, $Y, $T0
+       vpaddq          $OVERFLOW, $ACC7, $ACC6
+       vpaddq          $T0, $ACC8, $ACC7
+
+       vpxor           $ACC8, $ACC8, $ACC8
+
+       ret
+.size  avx2_sqr_x4,.-avx2_sqr_x4
+
+################################################################################
+# void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4);
+#
+# Four-way vectorized subtraction of field elements kept in redundant digit
+# form: ACC = A + 8*P - B.  Eight times the modulus (.LAVX2_POLY_x8) is added
+# first so that no per-digit difference can go negative.  The result is left
+# in $ACC0..$ACC8 only -- nothing is written to memory here; callers follow
+# up with avx2_normalize_n_store (or store the registers themselves).
+.type  avx2_sub_x4,\@abi-omnipotent
+.align 32
+avx2_sub_x4:
+       vmovdqa 32*0($a_ptr), $ACC0
+       lea     160($a_ptr), $a_ptr            # bias pointers by 160/128 so the
+       lea     .LAVX2_POLY_x8+128(%rip), %rax # remaining displacements fit in a
+       lea     128($b_ptr), $b_ptr            # signed byte (size optimization)
+       vmovdqa 32*1-160($a_ptr), $ACC1
+       vmovdqa 32*2-160($a_ptr), $ACC2
+       vmovdqa 32*3-160($a_ptr), $ACC3
+       vmovdqa 32*4-160($a_ptr), $ACC4
+       vmovdqa 32*5-160($a_ptr), $ACC5
+       vmovdqa 32*6-160($a_ptr), $ACC6
+       vmovdqa 32*7-160($a_ptr), $ACC7
+       vmovdqa 32*8-160($a_ptr), $ACC8
+
+       # ACC += 8*P (headroom so the following subtraction cannot underflow)
+       vpaddq  32*0-128(%rax), $ACC0, $ACC0
+       vpaddq  32*1-128(%rax), $ACC1, $ACC1
+       vpaddq  32*2-128(%rax), $ACC2, $ACC2
+       vpaddq  32*3-128(%rax), $ACC3, $ACC3
+       vpaddq  32*4-128(%rax), $ACC4, $ACC4
+       vpaddq  32*5-128(%rax), $ACC5, $ACC5
+       vpaddq  32*6-128(%rax), $ACC6, $ACC6
+       vpaddq  32*7-128(%rax), $ACC7, $ACC7
+       vpaddq  32*8-128(%rax), $ACC8, $ACC8
+
+       # ACC -= B
+       vpsubq  32*0-128($b_ptr), $ACC0, $ACC0
+       vpsubq  32*1-128($b_ptr), $ACC1, $ACC1
+       vpsubq  32*2-128($b_ptr), $ACC2, $ACC2
+       vpsubq  32*3-128($b_ptr), $ACC3, $ACC3
+       vpsubq  32*4-128($b_ptr), $ACC4, $ACC4
+       vpsubq  32*5-128($b_ptr), $ACC5, $ACC5
+       vpsubq  32*6-128($b_ptr), $ACC6, $ACC6
+       vpsubq  32*7-128($b_ptr), $ACC7, $ACC7
+       vpsubq  32*8-128($b_ptr), $ACC8, $ACC8
+
+       ret
+.size  avx2_sub_x4,.-avx2_sub_x4
+
+# Constant-time selection used by the point-add flows.  Two 32-byte masks were
+# stored by the caller at 32*9*8(%rsp) and 32*9*8+32(%rsp); each is all-ones
+# when the corresponding input point (A resp. B) was entirely zero --
+# presumably the all-zero encoding of the point at infinity.  (The extra "8+"
+# in the offsets below compensates for the return address pushed by `call`.)
+# The freshly computed result in $ACC0..$ACC8 is kept when neither mask is
+# set; the operand at (%rsi) is blended in when only A's mask is set, and the
+# operand at (%rdx) when B's mask is set.  Finishes by storing via the STORE
+# macro.  All selection is done with and/andn/xor, without branches.
+.type  avx2_select_n_store,\@abi-omnipotent
+.align 32
+avx2_select_n_store:
+       vmovdqa `8+32*9*8`(%rsp), $Y
+       vpor    `8+32*9*8+32`(%rsp), $Y, $Y     # Y = maskA | maskB
+
+       # zero the computed result wherever either input was degenerate
+       vpandn  $ACC0, $Y, $ACC0
+       vpandn  $ACC1, $Y, $ACC1
+       vpandn  $ACC2, $Y, $ACC2
+       vpandn  $ACC3, $Y, $ACC3
+       vpandn  $ACC4, $Y, $ACC4
+       vpandn  $ACC5, $Y, $ACC5
+       vpandn  $ACC6, $Y, $ACC6
+       vmovdqa `8+32*9*8+32`(%rsp), $B
+       vpandn  $ACC7, $Y, $ACC7
+       vpandn  `8+32*9*8`(%rsp), $B, $B        # B = maskA & ~maskB
+       vpandn  $ACC8, $Y, $ACC8
+
+       # blend in *(%rsi) where A (only) was zero
+       vpand   32*0(%rsi), $B, $T0
+       lea     160(%rsi), %rax                 # re-bias for byte displacements
+       vpand   32*1(%rsi), $B, $Y
+       vpxor   $T0, $ACC0, $ACC0
+       vpand   32*2(%rsi), $B, $T0
+       vpxor   $Y, $ACC1, $ACC1
+       vpand   32*3(%rsi), $B, $Y
+       vpxor   $T0, $ACC2, $ACC2
+       vpand   32*4-160(%rax), $B, $T0
+       vpxor   $Y, $ACC3, $ACC3
+       vpand   32*5-160(%rax), $B, $Y
+       vpxor   $T0, $ACC4, $ACC4
+       vpand   32*6-160(%rax), $B, $T0
+       vpxor   $Y, $ACC5, $ACC5
+       vpand   32*7-160(%rax), $B, $Y
+       vpxor   $T0, $ACC6, $ACC6
+       vpand   32*8-160(%rax), $B, $T0
+       vmovdqa `8+32*9*8+32`(%rsp), $B         # B = maskB
+       vpxor   $Y, $ACC7, $ACC7
+
+       # blend in *(%rdx) where B was zero
+       vpand   32*0(%rdx), $B, $Y
+       lea     160(%rdx), %rax
+       vpxor   $T0, $ACC8, $ACC8
+       vpand   32*1(%rdx), $B, $T0
+       vpxor   $Y, $ACC0, $ACC0
+       vpand   32*2(%rdx), $B, $Y
+       vpxor   $T0, $ACC1, $ACC1
+       vpand   32*3(%rdx), $B, $T0
+       vpxor   $Y, $ACC2, $ACC2
+       vpand   32*4-160(%rax), $B, $Y
+       vpxor   $T0, $ACC3, $ACC3
+       vpand   32*5-160(%rax), $B, $T0
+       vpxor   $Y, $ACC4, $ACC4
+       vpand   32*6-160(%rax), $B, $Y
+       vpxor   $T0, $ACC5, $ACC5
+       vpand   32*7-160(%rax), $B, $T0
+       vpxor   $Y, $ACC6, $ACC6
+       vpand   32*8-160(%rax), $B, $Y
+       vpxor   $T0, $ACC7, $ACC7
+       vpxor   $Y, $ACC8, $ACC8
+       `&STORE`
+
+       ret
+.size  avx2_select_n_store,.-avx2_select_n_store
+___
+___
+# NOTE: avx2_mul_by2_x4 is compiled out (guarded by "if (0)") -- the callers
+# inline the nine vpaddq doublings instead.  Kept here for reference only;
+# like avx2_sub_x4 it would leave its result in $ACC0..$ACC8 without storing.
+$code.=<<___   if (0);                         # inlined
+################################################################################
+# void avx2_mul_by2_x4(void* RESULTx4, void *Ax4);
+.type  avx2_mul_by2_x4,\@abi-omnipotent
+.align 32
+avx2_mul_by2_x4:
+       vmovdqa 32*0($a_ptr), $ACC0
+       lea     160($a_ptr), %rax               # bias pointer for byte displacements
+       vmovdqa 32*1($a_ptr), $ACC1
+       vmovdqa 32*2($a_ptr), $ACC2
+       vmovdqa 32*3($a_ptr), $ACC3
+       vmovdqa 32*4-160(%rax), $ACC4
+       vmovdqa 32*5-160(%rax), $ACC5
+       vmovdqa 32*6-160(%rax), $ACC6
+       vmovdqa 32*7-160(%rax), $ACC7
+       vmovdqa 32*8-160(%rax), $ACC8
+
+       # double every digit (no carry handling; caller normalizes)
+       vpaddq  $ACC0, $ACC0, $ACC0
+       vpaddq  $ACC1, $ACC1, $ACC1
+       vpaddq  $ACC2, $ACC2, $ACC2
+       vpaddq  $ACC3, $ACC3, $ACC3
+       vpaddq  $ACC4, $ACC4, $ACC4
+       vpaddq  $ACC5, $ACC5, $ACC5
+       vpaddq  $ACC6, $ACC6, $ACC6
+       vpaddq  $ACC7, $ACC7, $ACC7
+       vpaddq  $ACC8, $ACC8, $ACC8
+
+       ret
+.size  avx2_mul_by2_x4,.-avx2_mul_by2_x4
+___
+# The point-add entry points park the three user arguments in %r8/%r9/%r10 so
+# that %rdi/%rsi/%rdx/%rcx stay free as argument/scratch registers for the
+# internal avx2_* helpers (which take their operands in %rsi/%rdx/%rdi/%rcx).
+my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx");
+my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10");
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.globl ecp_nistz256_avx2_point_add_affine_x4
+.type  ecp_nistz256_avx2_point_add_affine_x4,\@function,3
+.align 32
+ecp_nistz256_avx2_point_add_affine_x4:
+       mov     %rsp, %rax                      # %rax = original %rsp, frame anchor
+       push    %rbp
+       vzeroupper
+___
+# Win64 ABI: %xmm6..%xmm15 are non-volatile; save them below the original
+# stack pointer (kept in %rax) so the epilogue can restore them via %rbp.
+$code.=<<___   if ($win64);
+       lea     -16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       lea     -8(%rax), %rbp                  # %rbp = original %rsp - 8; used to
+                                               # address the xmm save area and to
+                                               # restore %rsp on exit
+
+# Result + 32*0 = Result.X
+# Result + 32*9 = Result.Y
+# Result + 32*18 = Result.Z
+
+# A + 32*0 = A.X
+# A + 32*9 = A.Y
+# A + 32*18 = A.Z
+
+# B + 32*0 = B.X
+# B + 32*9 = B.Y
+
+       # scratch frame: 8 temporaries of 32*9 bytes, two 32-byte "is zero"
+       # masks, and a 32*8-byte temporary vector; 64-byte aligned
+       sub     \$`32*9*8+32*2+32*8`, %rsp
+       and     \$-64, %rsp
+
+       mov     $r_ptr_in, $r_ptr
+       mov     $a_ptr_in, $a_ptr
+       mov     $b_ptr_in, $b_ptr
+
+       # OR together all 18 limb vectors of A (X and Y) and compare with zero:
+       # the resulting all-ones/all-zero mask flags an all-zero input point
+       # (presumably the point-at-infinity encoding); stored at 32*9*8(%rsp)
+       vmovdqa 32*0($a_ptr_in), %ymm0
+       vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+       vpxor   %ymm1, %ymm1, %ymm1
+       lea     256($a_ptr_in), %rax            # size optimization
+       vpor    32*1($a_ptr_in), %ymm0, %ymm0
+       vpor    32*2($a_ptr_in), %ymm0, %ymm0
+       vpor    32*3($a_ptr_in), %ymm0, %ymm0
+       vpor    32*4-256(%rax), %ymm0, %ymm0
+       lea     256(%rax), %rcx                 # size optimization
+       vpor    32*5-256(%rax), %ymm0, %ymm0
+       vpor    32*6-256(%rax), %ymm0, %ymm0
+       vpor    32*7-256(%rax), %ymm0, %ymm0
+       vpor    32*8-256(%rax), %ymm0, %ymm0
+       vpor    32*9-256(%rax), %ymm0, %ymm0
+       vpor    32*10-256(%rax), %ymm0, %ymm0
+       vpor    32*11-256(%rax), %ymm0, %ymm0
+       vpor    32*12-512(%rcx), %ymm0, %ymm0
+       vpor    32*13-512(%rcx), %ymm0, %ymm0
+       vpor    32*14-512(%rcx), %ymm0, %ymm0
+       vpor    32*15-512(%rcx), %ymm0, %ymm0
+       vpor    32*16-512(%rcx), %ymm0, %ymm0
+       vpor    32*17-512(%rcx), %ymm0, %ymm0
+       vpcmpeqq %ymm1, %ymm0, %ymm0
+       vmovdqa %ymm0, `32*9*8`(%rsp)
+
+       # same zero test for input point B; mask stored at 32*9*8+32(%rsp)
+       vpxor   %ymm1, %ymm1, %ymm1
+       vmovdqa 32*0($b_ptr), %ymm0
+       lea     256($b_ptr), %rax               # size optimization
+       vpor    32*1($b_ptr), %ymm0, %ymm0
+       vpor    32*2($b_ptr), %ymm0, %ymm0
+       vpor    32*3($b_ptr), %ymm0, %ymm0
+       vpor    32*4-256(%rax), %ymm0, %ymm0
+       lea     256(%rax), %rcx                 # size optimization
+       vpor    32*5-256(%rax), %ymm0, %ymm0
+       vpor    32*6-256(%rax), %ymm0, %ymm0
+       vpor    32*7-256(%rax), %ymm0, %ymm0
+       vpor    32*8-256(%rax), %ymm0, %ymm0
+       vpor    32*9-256(%rax), %ymm0, %ymm0
+       vpor    32*10-256(%rax), %ymm0, %ymm0
+       vpor    32*11-256(%rax), %ymm0, %ymm0
+       vpor    32*12-512(%rcx), %ymm0, %ymm0
+       vpor    32*13-512(%rcx), %ymm0, %ymm0
+       vpor    32*14-512(%rcx), %ymm0, %ymm0
+       vpor    32*15-512(%rcx), %ymm0, %ymm0
+       vpor    32*16-512(%rcx), %ymm0, %ymm0
+       vpor    32*17-512(%rcx), %ymm0, %ymm0
+       vpcmpeqq %ymm1, %ymm0, %ymm0
+       vmovdqa %ymm0, `32*9*8+32`(%rsp)
+
+       # ------- mixed Jacobian + affine addition, four points in parallel -------
+       # Scratch slots on the stack are addressed as 32*9*k(%rsp), k = 0..7.
+
+       #       Z1^2 = Z1*Z1
+       lea     `32*9*2`($a_ptr), %rsi
+       lea     `32*9*2`(%rsp), %rdi
+       lea     `32*9*8+32*2`(%rsp), %rcx       # temporary vector
+       call    avx2_sqr_x4
+       call    avx2_normalize_n_store
+
+       #       U2 = X2*Z1^2
+       lea     `32*9*0`($b_ptr), %rsi
+       lea     `32*9*2`(%rsp), %rdx
+       lea     `32*9*0`(%rsp), %rdi
+       call    avx2_mul_x4
+       #call   avx2_normalize
+       `&STORE`
+
+       #       S2 = Z1*Z1^2 = Z1^3
+       lea     `32*9*2`($a_ptr), %rsi
+       lea     `32*9*2`(%rsp), %rdx
+       lea     `32*9*1`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       S2 = S2*Y2 = Y2*Z1^3
+       lea     `32*9*1`($b_ptr), %rsi
+       lea     `32*9*1`(%rsp), %rdx
+       lea     `32*9*1`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       H = U2 - U1 = U2 - X1
+       lea     `32*9*0`(%rsp), %rsi
+       lea     `32*9*0`($a_ptr), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       R = S2 - S1 = S2 - Y1
+       lea     `32*9*1`(%rsp), %rsi
+       lea     `32*9*1`($a_ptr), %rdx
+       lea     `32*9*4`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       Z3 = H*Z1*Z2
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*2`($a_ptr), %rdx
+       lea     `32*9*2`($r_ptr), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize
+
+       # constant-time fixup of Z3: fall back to 1 (.LONE) resp. Z1 when one
+       # of the inputs was flagged all-zero above
+       lea     .LONE(%rip), %rsi
+       lea     `32*9*2`($a_ptr), %rdx
+       call    avx2_select_n_store
+
+       #       R^2 = R^2
+       lea     `32*9*4`(%rsp), %rsi
+       lea     `32*9*6`(%rsp), %rdi
+       lea     `32*9*8+32*2`(%rsp), %rcx       # temporary vector
+       call    avx2_sqr_x4
+       call    avx2_normalize_n_store
+
+       #       H^2 = H^2
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*5`(%rsp), %rdi
+       call    avx2_sqr_x4
+       call    avx2_normalize_n_store
+
+       #       H^3 = H^2*H
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*5`(%rsp), %rdx
+       lea     `32*9*7`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       U2 = U1*H^2
+       lea     `32*9*0`($a_ptr), %rsi
+       lea     `32*9*5`(%rsp), %rdx
+       lea     `32*9*0`(%rsp), %rdi
+       call    avx2_mul_x4
+       #call   avx2_normalize
+       `&STORE`
+
+       #       Hsqr = U2*2
+       #lea    32*9*0(%rsp), %rsi
+       #lea    32*9*5(%rsp), %rdi
+       #call   avx2_mul_by2_x4
+
+       vpaddq  $ACC0, $ACC0, $ACC0     # inlined avx2_mul_by2_x4
+       lea     `32*9*5`(%rsp), %rdi
+       vpaddq  $ACC1, $ACC1, $ACC1
+       vpaddq  $ACC2, $ACC2, $ACC2
+       vpaddq  $ACC3, $ACC3, $ACC3
+       vpaddq  $ACC4, $ACC4, $ACC4
+       vpaddq  $ACC5, $ACC5, $ACC5
+       vpaddq  $ACC6, $ACC6, $ACC6
+       vpaddq  $ACC7, $ACC7, $ACC7
+       vpaddq  $ACC8, $ACC8, $ACC8
+       call    avx2_normalize_n_store
+
+       #       X3 = R^2 - H^3
+       #lea    32*9*6(%rsp), %rsi
+       #lea    32*9*7(%rsp), %rdx
+       #lea    32*9*5(%rsp), %rcx
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_sub_x4
+       #NORMALIZE
+       #STORE
+
+       #       X3 = X3 - U2*2
+       #lea    32*9*0($r_ptr), %rsi
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_sub_x4
+       #NORMALIZE
+       #STORE
+
+       # inlined double subtraction: X3 = R^2 + 2*P - H^3 - 2*U2, adding
+       # .LAVX2_POLY_x2 up front for per-digit underflow headroom (same idea
+       # as avx2_sub_x4); pointers biased by 128 for byte displacements
+       lea     `32*9*6+128`(%rsp), %rsi
+       lea     .LAVX2_POLY_x2+128(%rip), %rax
+       lea     `32*9*7+128`(%rsp), %rdx
+       lea     `32*9*5+128`(%rsp), %rcx
+       lea     `32*9*0`($r_ptr), %rdi
+
+       vmovdqa 32*0-128(%rsi), $ACC0
+       vmovdqa 32*1-128(%rsi), $ACC1
+       vmovdqa 32*2-128(%rsi), $ACC2
+       vmovdqa 32*3-128(%rsi), $ACC3
+       vmovdqa 32*4-128(%rsi), $ACC4
+       vmovdqa 32*5-128(%rsi), $ACC5
+       vmovdqa 32*6-128(%rsi), $ACC6
+       vmovdqa 32*7-128(%rsi), $ACC7
+       vmovdqa 32*8-128(%rsi), $ACC8
+
+       vpaddq  32*0-128(%rax), $ACC0, $ACC0
+       vpaddq  32*1-128(%rax), $ACC1, $ACC1
+       vpaddq  32*2-128(%rax), $ACC2, $ACC2
+       vpaddq  32*3-128(%rax), $ACC3, $ACC3
+       vpaddq  32*4-128(%rax), $ACC4, $ACC4
+       vpaddq  32*5-128(%rax), $ACC5, $ACC5
+       vpaddq  32*6-128(%rax), $ACC6, $ACC6
+       vpaddq  32*7-128(%rax), $ACC7, $ACC7
+       vpaddq  32*8-128(%rax), $ACC8, $ACC8
+
+       vpsubq  32*0-128(%rdx), $ACC0, $ACC0
+       vpsubq  32*1-128(%rdx), $ACC1, $ACC1
+       vpsubq  32*2-128(%rdx), $ACC2, $ACC2
+       vpsubq  32*3-128(%rdx), $ACC3, $ACC3
+       vpsubq  32*4-128(%rdx), $ACC4, $ACC4
+       vpsubq  32*5-128(%rdx), $ACC5, $ACC5
+       vpsubq  32*6-128(%rdx), $ACC6, $ACC6
+       vpsubq  32*7-128(%rdx), $ACC7, $ACC7
+       vpsubq  32*8-128(%rdx), $ACC8, $ACC8
+
+       vpsubq  32*0-128(%rcx), $ACC0, $ACC0
+       vpsubq  32*1-128(%rcx), $ACC1, $ACC1
+       vpsubq  32*2-128(%rcx), $ACC2, $ACC2
+       vpsubq  32*3-128(%rcx), $ACC3, $ACC3
+       vpsubq  32*4-128(%rcx), $ACC4, $ACC4
+       vpsubq  32*5-128(%rcx), $ACC5, $ACC5
+       vpsubq  32*6-128(%rcx), $ACC6, $ACC6
+       vpsubq  32*7-128(%rcx), $ACC7, $ACC7
+       vpsubq  32*8-128(%rcx), $ACC8, $ACC8
+       call    avx2_normalize
+
+       # X3 fixup: select B.X resp. A.X when an input was flagged all-zero
+       lea     32*0($b_ptr), %rsi
+       lea     32*0($a_ptr), %rdx
+       call    avx2_select_n_store
+
+       #       H = U2 - X3
+       lea     `32*9*0`(%rsp), %rsi
+       lea     `32*9*0`($r_ptr), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       (U2 - X3) * R
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*4`(%rsp), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       S1 * H^3
+       lea     `32*9*7`(%rsp), %rsi
+       lea     `32*9*1`($a_ptr), %rdx
+       lea     `32*9*1`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       Y3 = (U2 - X3)*R - S1*H^3
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*1`(%rsp), %rdx
+       lea     `32*9*1`($r_ptr), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize
+
+       # Y3 fixup: select B.Y resp. A.Y when an input was flagged all-zero
+       lea     32*9($b_ptr), %rsi
+       lea     32*9($a_ptr), %rdx
+       call    avx2_select_n_store
+
+       #lea    32*9*0($r_ptr), %rsi
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_mul_by1_x4
+       #NORMALIZE
+       #STORE
+
+       lea     `32*9*1`($r_ptr), %rsi
+       lea     `32*9*1`($r_ptr), %rdi
+       call    avx2_mul_by1_x4
+       call    avx2_normalize_n_store
+
+       vzeroupper
+___
+# Win64 epilogue: restore the non-volatile %xmm6..%xmm15 that the prologue
+# saved at -8-16*10(%rax)..-8-16*1(%rax) -- i.e. -16*10(%rbp)..-16*1(%rbp),
+# since %rbp = %rax-8.  BUGFIX: the original had the movaps operands reversed
+# (AT&T order is src,dst), which *stored* the live registers over the save
+# area instead of reloading them, returning with callee-saved XMM state
+# clobbered in violation of the Win64 ABI.
+$code.=<<___   if ($win64);
+       movaps  -16*10(%rbp), %xmm6
+       movaps  -16*9(%rbp), %xmm7
+       movaps  -16*8(%rbp), %xmm8
+       movaps  -16*7(%rbp), %xmm9
+       movaps  -16*6(%rbp), %xmm10
+       movaps  -16*5(%rbp), %xmm11
+       movaps  -16*4(%rbp), %xmm12
+       movaps  -16*3(%rbp), %xmm13
+       movaps  -16*2(%rbp), %xmm14
+       movaps  -16*1(%rbp), %xmm15
+___
+$code.=<<___;
+       mov     %rbp, %rsp
+       pop     %rbp
+       ret
+.size  ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4
+
+################################################################################
+# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
+.globl ecp_nistz256_avx2_point_add_affines_x4
+.type  ecp_nistz256_avx2_point_add_affines_x4,\@function,3
+.align 32
+ecp_nistz256_avx2_point_add_affines_x4:
+       mov     %rsp, %rax
+       push    %rbp
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       lea     -8(%rax), %rbp
+
+# Result + 32*0 = Result.X
+# Result + 32*9 = Result.Y
+# Result + 32*18 = Result.Z
+
+# A + 32*0 = A.X
+# A + 32*9 = A.Y
+
+# B + 32*0 = B.X
+# B + 32*9 = B.Y
+
+       sub     \$`32*9*8+32*2+32*8`, %rsp
+       and     \$-64, %rsp
+
+       mov     $r_ptr_in, $r_ptr
+       mov     $a_ptr_in, $a_ptr
+       mov     $b_ptr_in, $b_ptr
+
+       vmovdqa 32*0($a_ptr_in), %ymm0
+       vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+       vpxor   %ymm1, %ymm1, %ymm1
+       lea     256($a_ptr_in), %rax            # size optimization
+       vpor    32*1($a_ptr_in), %ymm0, %ymm0
+       vpor    32*2($a_ptr_in), %ymm0, %ymm0
+       vpor    32*3($a_ptr_in), %ymm0, %ymm0
+       vpor    32*4-256(%rax), %ymm0, %ymm0
+       lea     256(%rax), %rcx                 # size optimization
+       vpor    32*5-256(%rax), %ymm0, %ymm0
+       vpor    32*6-256(%rax), %ymm0, %ymm0
+       vpor    32*7-256(%rax), %ymm0, %ymm0
+       vpor    32*8-256(%rax), %ymm0, %ymm0
+       vpor    32*9-256(%rax), %ymm0, %ymm0
+       vpor    32*10-256(%rax), %ymm0, %ymm0
+       vpor    32*11-256(%rax), %ymm0, %ymm0
+       vpor    32*12-512(%rcx), %ymm0, %ymm0
+       vpor    32*13-512(%rcx), %ymm0, %ymm0
+       vpor    32*14-512(%rcx), %ymm0, %ymm0
+       vpor    32*15-512(%rcx), %ymm0, %ymm0
+       vpor    32*16-512(%rcx), %ymm0, %ymm0
+       vpor    32*17-512(%rcx), %ymm0, %ymm0
+       vpcmpeqq %ymm1, %ymm0, %ymm0
+       vmovdqa %ymm0, `32*9*8`(%rsp)
+
+       vpxor   %ymm1, %ymm1, %ymm1
+       vmovdqa 32*0($b_ptr), %ymm0
+       lea     256($b_ptr), %rax               # size optimization
+       vpor    32*1($b_ptr), %ymm0, %ymm0
+       vpor    32*2($b_ptr), %ymm0, %ymm0
+       vpor    32*3($b_ptr), %ymm0, %ymm0
+       vpor    32*4-256(%rax), %ymm0, %ymm0
+       lea     256(%rax), %rcx                 # size optimization
+       vpor    32*5-256(%rax), %ymm0, %ymm0
+       vpor    32*6-256(%rax), %ymm0, %ymm0
+       vpor    32*7-256(%rax), %ymm0, %ymm0
+       vpor    32*8-256(%rax), %ymm0, %ymm0
+       vpor    32*9-256(%rax), %ymm0, %ymm0
+       vpor    32*10-256(%rax), %ymm0, %ymm0
+       vpor    32*11-256(%rax), %ymm0, %ymm0
+       vpor    32*12-512(%rcx), %ymm0, %ymm0
+       vpor    32*13-512(%rcx), %ymm0, %ymm0
+       vpor    32*14-512(%rcx), %ymm0, %ymm0
+       vpor    32*15-512(%rcx), %ymm0, %ymm0
+       vpor    32*16-512(%rcx), %ymm0, %ymm0
+       vpor    32*17-512(%rcx), %ymm0, %ymm0
+       vpcmpeqq %ymm1, %ymm0, %ymm0
+       vmovdqa %ymm0, `32*9*8+32`(%rsp)
+
+       #       H = U2 - U1 = X2 - X1
+       lea     `32*9*0`($b_ptr), %rsi
+       lea     `32*9*0`($a_ptr), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       R = S2 - S1 = Y2 - Y1
+       lea     `32*9*1`($b_ptr), %rsi
+       lea     `32*9*1`($a_ptr), %rdx
+       lea     `32*9*4`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       Z3 = H*Z1*Z2 = H
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*2`($r_ptr), %rdi
+       call    avx2_mul_by1_x4
+       call    avx2_normalize
+
+       vmovdqa `32*9*8`(%rsp), $B
+       vpor    `32*9*8+32`(%rsp), $B, $B
+
+       vpandn  $ACC0, $B, $ACC0
+       lea     .LONE+128(%rip), %rax
+       vpandn  $ACC1, $B, $ACC1
+       vpandn  $ACC2, $B, $ACC2
+       vpandn  $ACC3, $B, $ACC3
+       vpandn  $ACC4, $B, $ACC4
+       vpandn  $ACC5, $B, $ACC5
+       vpandn  $ACC6, $B, $ACC6
+       vpandn  $ACC7, $B, $ACC7
+
+       vpand   32*0-128(%rax), $B, $T0
+        vpandn $ACC8, $B, $ACC8
+       vpand   32*1-128(%rax), $B, $Y
+       vpxor   $T0, $ACC0, $ACC0
+       vpand   32*2-128(%rax), $B, $T0
+       vpxor   $Y, $ACC1, $ACC1
+       vpand   32*3-128(%rax), $B, $Y
+       vpxor   $T0, $ACC2, $ACC2
+       vpand   32*4-128(%rax), $B, $T0
+       vpxor   $Y, $ACC3, $ACC3
+       vpand   32*5-128(%rax), $B, $Y
+       vpxor   $T0, $ACC4, $ACC4
+       vpand   32*6-128(%rax), $B, $T0
+       vpxor   $Y, $ACC5, $ACC5
+       vpand   32*7-128(%rax), $B, $Y
+       vpxor   $T0, $ACC6, $ACC6
+       vpand   32*8-128(%rax), $B, $T0
+       vpxor   $Y, $ACC7, $ACC7
+       vpxor   $T0, $ACC8, $ACC8
+       `&STORE`
+
+       #       R^2 = R^2
+       lea     `32*9*4`(%rsp), %rsi
+       lea     `32*9*6`(%rsp), %rdi
+       lea     `32*9*8+32*2`(%rsp), %rcx       # temporary vector
+       call    avx2_sqr_x4
+       call    avx2_normalize_n_store
+
+       #       H^2 = H^2
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*5`(%rsp), %rdi
+       call    avx2_sqr_x4
+       call    avx2_normalize_n_store
+
+       #       H^3 = H^2*H
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*5`(%rsp), %rdx
+       lea     `32*9*7`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       U2 = U1*H^2
+       lea     `32*9*0`($a_ptr), %rsi
+       lea     `32*9*5`(%rsp), %rdx
+       lea     `32*9*0`(%rsp), %rdi
+       call    avx2_mul_x4
+       #call   avx2_normalize
+       `&STORE`
+
+       #       Hsqr = U2*2
+       #lea    32*9*0(%rsp), %rsi
+       #lea    32*9*5(%rsp), %rdi
+       #call   avx2_mul_by2_x4
+
+       vpaddq  $ACC0, $ACC0, $ACC0     # inlined avx2_mul_by2_x4
+       lea     `32*9*5`(%rsp), %rdi
+       vpaddq  $ACC1, $ACC1, $ACC1
+       vpaddq  $ACC2, $ACC2, $ACC2
+       vpaddq  $ACC3, $ACC3, $ACC3
+       vpaddq  $ACC4, $ACC4, $ACC4
+       vpaddq  $ACC5, $ACC5, $ACC5
+       vpaddq  $ACC6, $ACC6, $ACC6
+       vpaddq  $ACC7, $ACC7, $ACC7
+       vpaddq  $ACC8, $ACC8, $ACC8
+       call    avx2_normalize_n_store
+
+       #       X3 = R^2 - H^3
+       #lea    32*9*6(%rsp), %rsi
+       #lea    32*9*7(%rsp), %rdx
+       #lea    32*9*5(%rsp), %rcx
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_sub_x4
+       #NORMALIZE
+       #STORE
+
+       #       X3 = X3 - U2*2
+       #lea    32*9*0($r_ptr), %rsi
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_sub_x4
+       #NORMALIZE
+       #STORE
+
+       lea     `32*9*6+128`(%rsp), %rsi
+       lea     .LAVX2_POLY_x2+128(%rip), %rax
+       lea     `32*9*7+128`(%rsp), %rdx
+       lea     `32*9*5+128`(%rsp), %rcx
+       lea     `32*9*0`($r_ptr), %rdi
+
+       vmovdqa 32*0-128(%rsi), $ACC0
+       vmovdqa 32*1-128(%rsi), $ACC1
+       vmovdqa 32*2-128(%rsi), $ACC2
+       vmovdqa 32*3-128(%rsi), $ACC3
+       vmovdqa 32*4-128(%rsi), $ACC4
+       vmovdqa 32*5-128(%rsi), $ACC5
+       vmovdqa 32*6-128(%rsi), $ACC6
+       vmovdqa 32*7-128(%rsi), $ACC7
+       vmovdqa 32*8-128(%rsi), $ACC8
+
+       vpaddq  32*0-128(%rax), $ACC0, $ACC0
+       vpaddq  32*1-128(%rax), $ACC1, $ACC1
+       vpaddq  32*2-128(%rax), $ACC2, $ACC2
+       vpaddq  32*3-128(%rax), $ACC3, $ACC3
+       vpaddq  32*4-128(%rax), $ACC4, $ACC4
+       vpaddq  32*5-128(%rax), $ACC5, $ACC5
+       vpaddq  32*6-128(%rax), $ACC6, $ACC6
+       vpaddq  32*7-128(%rax), $ACC7, $ACC7
+       vpaddq  32*8-128(%rax), $ACC8, $ACC8
+
+       vpsubq  32*0-128(%rdx), $ACC0, $ACC0
+       vpsubq  32*1-128(%rdx), $ACC1, $ACC1
+       vpsubq  32*2-128(%rdx), $ACC2, $ACC2
+       vpsubq  32*3-128(%rdx), $ACC3, $ACC3
+       vpsubq  32*4-128(%rdx), $ACC4, $ACC4
+       vpsubq  32*5-128(%rdx), $ACC5, $ACC5
+       vpsubq  32*6-128(%rdx), $ACC6, $ACC6
+       vpsubq  32*7-128(%rdx), $ACC7, $ACC7
+       vpsubq  32*8-128(%rdx), $ACC8, $ACC8
+
+       vpsubq  32*0-128(%rcx), $ACC0, $ACC0
+       vpsubq  32*1-128(%rcx), $ACC1, $ACC1
+       vpsubq  32*2-128(%rcx), $ACC2, $ACC2
+       vpsubq  32*3-128(%rcx), $ACC3, $ACC3
+       vpsubq  32*4-128(%rcx), $ACC4, $ACC4
+       vpsubq  32*5-128(%rcx), $ACC5, $ACC5
+       vpsubq  32*6-128(%rcx), $ACC6, $ACC6
+       vpsubq  32*7-128(%rcx), $ACC7, $ACC7
+       vpsubq  32*8-128(%rcx), $ACC8, $ACC8
+       call    avx2_normalize
+
+       lea     32*0($b_ptr), %rsi
+       lea     32*0($a_ptr), %rdx
+       call    avx2_select_n_store
+
+       #       H = U2 - X3
+       lea     `32*9*0`(%rsp), %rsi
+       lea     `32*9*0`($r_ptr), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize_n_store
+
+       #       H = H*R
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*4`(%rsp), %rdx
+       lea     `32*9*3`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #       S2 = S1 * H^3
+       lea     `32*9*7`(%rsp), %rsi
+       lea     `32*9*1`($a_ptr), %rdx
+       lea     `32*9*1`(%rsp), %rdi
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       #
+       lea     `32*9*3`(%rsp), %rsi
+       lea     `32*9*1`(%rsp), %rdx
+       lea     `32*9*1`($r_ptr), %rdi
+       call    avx2_sub_x4
+       call    avx2_normalize
+
+       lea     32*9($b_ptr), %rsi
+       lea     32*9($a_ptr), %rdx
+       call    avx2_select_n_store
+
+       #lea    32*9*0($r_ptr), %rsi
+       #lea    32*9*0($r_ptr), %rdi
+       #call   avx2_mul_by1_x4
+       #NORMALIZE
+       #STORE
+
+       lea     `32*9*1`($r_ptr), %rsi
+       lea     `32*9*1`($r_ptr), %rdi
+       call    avx2_mul_by1_x4
+       call    avx2_normalize_n_store
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  %xmm6, -16*10(%rbp)
+       movaps  %xmm7, -16*9(%rbp)
+       movaps  %xmm8, -16*8(%rbp)
+       movaps  %xmm9, -16*7(%rbp)
+       movaps  %xmm10, -16*6(%rbp)
+       movaps  %xmm11, -16*5(%rbp)
+       movaps  %xmm12, -16*4(%rbp)
+       movaps  %xmm13, -16*3(%rbp)
+       movaps  %xmm14, -16*2(%rbp)
+       movaps  %xmm15, -16*1(%rbp)
+___
+$code.=<<___;
+       mov     %rbp, %rsp
+       pop     %rbp
+       ret
+.size  ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
+
+################################################################################
+# void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4);
+.globl ecp_nistz256_avx2_to_mont
+.type  ecp_nistz256_avx2_to_mont,\@function,2
+.align 32
+ecp_nistz256_avx2_to_mont:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -8-16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+       lea     .LTO_MONT_AVX2(%rip), %rdx
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  16*0(%rsp), %xmm6
+       movaps  16*1(%rsp), %xmm7
+       movaps  16*2(%rsp), %xmm8
+       movaps  16*3(%rsp), %xmm9
+       movaps  16*4(%rsp), %xmm10
+       movaps  16*5(%rsp), %xmm11
+       movaps  16*6(%rsp), %xmm12
+       movaps  16*7(%rsp), %xmm13
+       movaps  16*8(%rsp), %xmm14
+       movaps  16*9(%rsp), %xmm15
+       lea     8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont
+
+################################################################################
+# void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4);
+.globl ecp_nistz256_avx2_from_mont
+.type  ecp_nistz256_avx2_from_mont,\@function,2
+.align 32
+ecp_nistz256_avx2_from_mont:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -8-16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK
+       lea     .LFROM_MONT_AVX2(%rip), %rdx
+       call    avx2_mul_x4
+       call    avx2_normalize_n_store
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  16*0(%rsp), %xmm6
+       movaps  16*1(%rsp), %xmm7
+       movaps  16*2(%rsp), %xmm8
+       movaps  16*3(%rsp), %xmm9
+       movaps  16*4(%rsp), %xmm10
+       movaps  16*5(%rsp), %xmm11
+       movaps  16*6(%rsp), %xmm12
+       movaps  16*7(%rsp), %xmm13
+       movaps  16*8(%rsp), %xmm14
+       movaps  16*9(%rsp), %xmm15
+       lea     8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont
+
+################################################################################
+# void ecp_nistz256_avx2_set1(void* RESULTx4);
+.globl ecp_nistz256_avx2_set1
+.type  ecp_nistz256_avx2_set1,\@function,1
+.align 32
+ecp_nistz256_avx2_set1:
+       lea     .LONE+128(%rip), %rax
+       lea     128(%rdi), %rdi
+       vzeroupper
+       vmovdqa 32*0-128(%rax), %ymm0
+       vmovdqa 32*1-128(%rax), %ymm1
+       vmovdqa 32*2-128(%rax), %ymm2
+       vmovdqa 32*3-128(%rax), %ymm3
+       vmovdqa 32*4-128(%rax), %ymm4
+       vmovdqa 32*5-128(%rax), %ymm5
+       vmovdqa %ymm0, 32*0-128(%rdi)
+       vmovdqa 32*6-128(%rax), %ymm0
+       vmovdqa %ymm1, 32*1-128(%rdi)
+       vmovdqa 32*7-128(%rax), %ymm1
+       vmovdqa %ymm2, 32*2-128(%rdi)
+       vmovdqa 32*8-128(%rax), %ymm2
+       vmovdqa %ymm3, 32*3-128(%rdi)
+       vmovdqa %ymm4, 32*4-128(%rdi)
+       vmovdqa %ymm5, 32*5-128(%rdi)
+       vmovdqa %ymm0, 32*6-128(%rdi)
+       vmovdqa %ymm1, 32*7-128(%rdi)
+       vmovdqa %ymm2, 32*8-128(%rdi)
+
+       vzeroupper
+       ret
+.size  ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1
+___
+}
+{
+################################################################################
+# void ecp_nistz256_avx2_multi_select_w7(void* RESULT, void *in,
+#                          int index0, int index1, int index2, int index3);
+################################################################################
+
+my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d");
+my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3));
+my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11));
+my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15));
+
+$code.=<<___;
+.globl ecp_nistz256_avx2_multi_select_w7
+.type  ecp_nistz256_avx2_multi_select_w7,\@function,6
+.align 32
+ecp_nistz256_avx2_multi_select_w7:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -8-16*10(%rsp), %rsp
+       vmovaps %xmm6, -8-16*10(%rax)
+       vmovaps %xmm7, -8-16*9(%rax)
+       vmovaps %xmm8, -8-16*8(%rax)
+       vmovaps %xmm9, -8-16*7(%rax)
+       vmovaps %xmm10, -8-16*6(%rax)
+       vmovaps %xmm11, -8-16*5(%rax)
+       vmovaps %xmm12, -8-16*4(%rax)
+       vmovaps %xmm13, -8-16*3(%rax)
+       vmovaps %xmm14, -8-16*2(%rax)
+       vmovaps %xmm15, -8-16*1(%rax)
+___
+$code.=<<___;
+       lea     .LIntOne(%rip), %rax
+
+       vmovd   $index0, %xmm0
+       vmovd   $index1, %xmm1
+       vmovd   $index2, %xmm2
+       vmovd   $index3, %xmm3
+
+       vpxor   $R0a, $R0a, $R0a
+       vpxor   $R0b, $R0b, $R0b
+       vpxor   $R1a, $R1a, $R1a
+       vpxor   $R1b, $R1b, $R1b
+       vpxor   $R2a, $R2a, $R2a
+       vpxor   $R2b, $R2b, $R2b
+       vpxor   $R3a, $R3a, $R3a
+       vpxor   $R3b, $R3b, $R3b
+       vmovdqa (%rax), $M0
+
+       vpermd  $INDEX0, $R0a, $INDEX0
+       vpermd  $INDEX1, $R0a, $INDEX1
+       vpermd  $INDEX2, $R0a, $INDEX2
+       vpermd  $INDEX3, $R0a, $INDEX3
+
+       mov     \$64, %ecx
+       lea     112($val), $val         # size optimization
+       jmp     .Lmulti_select_loop_avx2
+
+# INDEX=0, corresponds to the point at infty (0,0)
+.align 32
+.Lmulti_select_loop_avx2:
+       vpcmpeqd        $INDEX0, $M0, $TMP0
+
+       vmovdqa         `32*0+32*64*2*0`($in_t), $T0
+       vmovdqa         `32*1+32*64*2*0`($in_t), $T1
+       vpand           $TMP0, $T0, $T0
+       vpand           $TMP0, $T1, $T1
+       vpxor           $T0, $R0a, $R0a
+       vpxor           $T1, $R0b, $R0b
+
+       vpcmpeqd        $INDEX1, $M0, $TMP0
+
+       vmovdqa         `32*0+32*64*2*1`($in_t), $T0
+       vmovdqa         `32*1+32*64*2*1`($in_t), $T1
+       vpand           $TMP0, $T0, $T0
+       vpand           $TMP0, $T1, $T1
+       vpxor           $T0, $R1a, $R1a
+       vpxor           $T1, $R1b, $R1b
+
+       vpcmpeqd        $INDEX2, $M0, $TMP0
+
+       vmovdqa         `32*0+32*64*2*2`($in_t), $T0
+       vmovdqa         `32*1+32*64*2*2`($in_t), $T1
+       vpand           $TMP0, $T0, $T0
+       vpand           $TMP0, $T1, $T1
+       vpxor           $T0, $R2a, $R2a
+       vpxor           $T1, $R2b, $R2b
+
+       vpcmpeqd        $INDEX3, $M0, $TMP0
+
+       vmovdqa         `32*0+32*64*2*3`($in_t), $T0
+       vmovdqa         `32*1+32*64*2*3`($in_t), $T1
+       vpand           $TMP0, $T0, $T0
+       vpand           $TMP0, $T1, $T1
+       vpxor           $T0, $R3a, $R3a
+       vpxor           $T1, $R3b, $R3b
+
+       vpaddd          (%rax), $M0, $M0        # increment
+       lea             32*2($in_t), $in_t
+
+        dec    %ecx
+       jnz     .Lmulti_select_loop_avx2
+
+       vmovdqu $R0a, 32*0-112($val)
+       vmovdqu $R0b, 32*1-112($val)
+       vmovdqu $R1a, 32*2-112($val)
+       vmovdqu $R1b, 32*3-112($val)
+       vmovdqu $R2a, 32*4-112($val)
+       vmovdqu $R2b, 32*5-112($val)
+       vmovdqu $R3a, 32*6-112($val)
+       vmovdqu $R3b, 32*7-112($val)
+
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  16*0(%rsp), %xmm6
+       movaps  16*1(%rsp), %xmm7
+       movaps  16*2(%rsp), %xmm8
+       movaps  16*3(%rsp), %xmm9
+       movaps  16*4(%rsp), %xmm10
+       movaps  16*5(%rsp), %xmm11
+       movaps  16*6(%rsp), %xmm12
+       movaps  16*7(%rsp), %xmm13
+       movaps  16*8(%rsp), %xmm14
+       movaps  16*9(%rsp), %xmm15
+       lea     8+16*10(%rsp), %rsp
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+
+.extern        OPENSSL_ia32cap_P
+.globl ecp_nistz_avx2_eligible
+.type  ecp_nistz_avx2_eligible,\@abi-omnipotent
+.align 32
+ecp_nistz_avx2_eligible:
+       mov     OPENSSL_ia32cap_P+8(%rip),%eax
+       shr     \$5,%eax
+       and     \$1,%eax
+       ret
+.size  ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
+___
+}
+}} else {{     # assembler is too old
+# Fallback when the assembler cannot encode AVX2: every public AVX2 entry
+# point becomes a single ud2 trap (they must never be reached), and
+# ecp_nistz_avx2_eligible() returns 0 so the C dispatcher never selects
+# the AVX2 code path in the first place.
+$code.=<<___;
+.text
+
+.globl ecp_nistz256_avx2_transpose_convert
+.globl ecp_nistz256_avx2_convert_transpose_back
+.globl ecp_nistz256_avx2_point_add_affine_x4
+.globl ecp_nistz256_avx2_point_add_affines_x4
+.globl ecp_nistz256_avx2_to_mont
+.globl ecp_nistz256_avx2_from_mont
+.globl ecp_nistz256_avx2_set1
+.globl ecp_nistz256_avx2_multi_select_w7
+.type  ecp_nistz256_avx2_multi_select_w7,\@abi-omnipotent
+ecp_nistz256_avx2_transpose_convert:
+ecp_nistz256_avx2_convert_transpose_back:
+ecp_nistz256_avx2_point_add_affine_x4:
+ecp_nistz256_avx2_point_add_affines_x4:
+ecp_nistz256_avx2_to_mont:
+ecp_nistz256_avx2_from_mont:
+ecp_nistz256_avx2_set1:
+ecp_nistz256_avx2_multi_select_w7:
+       .byte   0x0f,0x0b       # ud2
+       ret
+.size  ecp_nistz256_avx2_multi_select_w7,.-ecp_nistz256_avx2_multi_select_w7
+
+.globl ecp_nistz_avx2_eligible
+.type  ecp_nistz_avx2_eligible,\@abi-omnipotent
+ecp_nistz_avx2_eligible:
+       xor     %eax,%eax
+       ret
+.size  ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible
+___
+}}
+
+# Final pass: expand the backtick-quoted constant expressions embedded in the
+# generated text, then emit the assembly line by line (STDOUT is a pipe into
+# the x86_64-xlate.pl filter).
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/geo;
+
+       print $_,"\n";
+}
+
+# A failed close (e.g. the xlate filter dying mid-stream) must not be
+# ignored, or a truncated .s file could be handed to the assembler.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
new file mode 100755 (executable)
index 0000000..c4b6d0f
--- /dev/null
@@ -0,0 +1,3092 @@
+#!/usr/bin/env perl
+
+##############################################################################
+#                                                                            #
+# Copyright 2014 Intel Corporation                                           #
+#                                                                            #
+# Licensed under the Apache License, Version 2.0 (the "License");            #
+# you may not use this file except in compliance with the License.           #
+# You may obtain a copy of the License at                                    #
+#                                                                            #
+#    http://www.apache.org/licenses/LICENSE-2.0                              #
+#                                                                            #
+# Unless required by applicable law or agreed to in writing, software        #
+# distributed under the License is distributed on an "AS IS" BASIS,          #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
+# See the License for the specific language governing permissions and        #
+# limitations under the License.                                             #
+#                                                                            #
+##############################################################################
+#                                                                            #
+#  Developers and authors:                                                   #
+#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
+#  (1) Intel Corporation, Israel Development Center                          #
+#  (2) University of Haifa                                                   #
+#  Reference:                                                                #
+#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
+#                           256 Bit Primes"                                  #
+#                                                                            #
+##############################################################################
+
+# Further optimization by <appro@openssl.org>:
+#
+#              this/original
+# Opteron      +8-33%
+# Bulldozer    +10-30%
+# P4           +14-38%
+# Westmere     +8-23%
+# Sandy Bridge +8-24%
+# Ivy Bridge   +7-25%
+# Haswell      +5-25%
+# Atom         +10-32%
+# VIA Nano     +37-130%
+#
+# Ranges denote minimum and maximum improvement coefficients depending
+# on benchmark.
+
+# Command-line handling: first argument is the perlasm flavour (elf, macosx,
+# mingw64, nasm, ...), second the output path; a single dotted argument is
+# treated as the output file with no flavour.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the x86_64-xlate.pl translator next to this script or in perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# NOTE(review): two-arg pipe open with interpolated, unquoted paths and no
+# error check; paths containing spaces or a failed exec go undetected here.
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# Probe the toolchain to gate AVX ($avx: 0/1/2) and ADX/BMI2 ($addx) code
+# generation.  NOTE(review): $ENV{CC} / $ENV{ASM} may be undefined, which
+# would warn under "use warnings" (the file enables neither strict nor
+# warnings).
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.19) + ($1>=2.22);
+       $addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+       $avx = ($1>=2.09) + ($1>=2.10);
+       $addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+       $avx = ($1>=10) + ($1>=11);
+       $addx = ($1>=12);
+}
+
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+       my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
+       $avx = ($ver>=3.0) + ($ver>=3.01);
+       $addx = ($ver>=3.03);
+}
+
+$code.=<<___;
+.text
+.extern        OPENSSL_ia32cap_P
+
+# The polynomial
+# p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs.
+.align 64
+.Lpoly:
+.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+# 2^512 mod P precomputed for NIST P256 polynomial
+# (multiplying by .LRR in Montgomery domain converts a value INTO that domain)
+.LRR:
+.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
+
+# 8-lane 32-bit broadcast constants, presumably for the windowed select
+# routines (not visible in this chunk).
+.LOne:
+.long 1,1,1,1,1,1,1,1
+.LTwo:
+.long 2,2,2,2,2,2,2,2
+.LThree:
+.long 3,3,3,3,3,3,3,3
+# 1 in Montgomery form: 2^256 mod p (equals 2^256 - p; limbs check out).
+.LONE_mont:
+.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+___
+
+{
+################################################################################
+# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
+
+my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
+my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+
+.globl ecp_nistz256_mul_by_2
+.type  ecp_nistz256_mul_by_2,\@function,2
+.align 64
+ecp_nistz256_mul_by_2:
+       push    %r12
+       push    %r13
+
+       mov     8*0($a_ptr), $a0
+       mov     8*1($a_ptr), $a1
+       add     $a0, $a0                # a0:a3+a0:a3
+       mov     8*2($a_ptr), $a2
+       adc     $a1, $a1
+       mov     8*3($a_ptr), $a3
+       lea     .Lpoly(%rip), $a_ptr
+        mov    $a0, $t0
+       adc     $a2, $a2
+       adc     $a3, $a3
+        mov    $a1, $t1
+       sbb     $t4, $t4
+
+       sub     8*0($a_ptr), $a0
+        mov    $a2, $t2
+       sbb     8*1($a_ptr), $a1
+       sbb     8*2($a_ptr), $a2
+        mov    $a3, $t3
+       sbb     8*3($a_ptr), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop     %r13
+       pop     %r12
+       ret
+.size  ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+################################################################################
+# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_div_by_2
+.type  ecp_nistz256_div_by_2,\@function,2
+.align 32
+ecp_nistz256_div_by_2:
+       push    %r12
+       push    %r13
+
+       mov     8*0($a_ptr), $a0
+       mov     8*1($a_ptr), $a1
+       mov     8*2($a_ptr), $a2
+        mov    $a0, $t0
+       mov     8*3($a_ptr), $a3
+       lea     .Lpoly(%rip), $a_ptr
+
+        mov    $a1, $t1
+       xor     $t4, $t4
+       add     8*0($a_ptr), $a0
+        mov    $a2, $t2
+       adc     8*1($a_ptr), $a1
+       adc     8*2($a_ptr), $a2
+        mov    $a3, $t3
+       adc     8*3($a_ptr), $a3
+       adc     \$0, $t4
+       xor     $a_ptr, $a_ptr          # borrow $a_ptr
+       test    \$1, $t0
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       cmovz   $t2, $a2
+       cmovz   $t3, $a3
+       cmovz   $a_ptr, $t4
+
+       mov     $a1, $t0                # a0:a3>>1
+       shr     \$1, $a0
+       shl     \$63, $t0
+       mov     $a2, $t1
+       shr     \$1, $a1
+       or      $t0, $a0
+       shl     \$63, $t1
+       mov     $a3, $t2
+       shr     \$1, $a2
+       or      $t1, $a1
+       shl     \$63, $t2
+       shr     \$1, $a3
+       shl     \$63, $t4
+       or      $t2, $a2
+       or      $t4, $a3
+
+       mov     $a0, 8*0($r_ptr)
+       mov     $a1, 8*1($r_ptr)
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop     %r13
+       pop     %r12
+       ret
+.size  ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+################################################################################
+# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_mul_by_3
+.type  ecp_nistz256_mul_by_3,\@function,2
+.align 32
+ecp_nistz256_mul_by_3:
+       push    %r12
+       push    %r13
+
+       mov     8*0($a_ptr), $a0
+       xor     $t4, $t4
+       mov     8*1($a_ptr), $a1
+       add     $a0, $a0                # a0:a3+a0:a3
+       mov     8*2($a_ptr), $a2
+       adc     $a1, $a1
+       mov     8*3($a_ptr), $a3
+        mov    $a0, $t0
+       adc     $a2, $a2
+       adc     $a3, $a3
+        mov    $a1, $t1
+       adc     \$0, $t4
+
+       sub     \$-1, $a0
+        mov    $a2, $t2
+       sbb     .Lpoly+8*1(%rip), $a1
+       sbb     \$0, $a2
+        mov    $a3, $t3
+       sbb     .Lpoly+8*3(%rip), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       cmovz   $t2, $a2
+       cmovz   $t3, $a3
+
+       xor     $t4, $t4
+       add     8*0($a_ptr), $a0        # a0:a3+=a_ptr[0:3]
+       adc     8*1($a_ptr), $a1
+        mov    $a0, $t0
+       adc     8*2($a_ptr), $a2
+       adc     8*3($a_ptr), $a3
+        mov    $a1, $t1
+       adc     \$0, $t4
+
+       sub     \$-1, $a0
+        mov    $a2, $t2
+       sbb     .Lpoly+8*1(%rip), $a1
+       sbb     \$0, $a2
+        mov    $a3, $t3
+       sbb     .Lpoly+8*3(%rip), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop %r13
+       pop %r12
+       ret
+.size  ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+################################################################################
+# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_add
+.type  ecp_nistz256_add,\@function,3
+.align 32
+ecp_nistz256_add:
+       push    %r12
+       push    %r13
+
+       mov     8*0($a_ptr), $a0
+       xor     $t4, $t4
+       mov     8*1($a_ptr), $a1
+       mov     8*2($a_ptr), $a2
+       mov     8*3($a_ptr), $a3
+       lea     .Lpoly(%rip), $a_ptr
+
+       add     8*0($b_ptr), $a0
+       adc     8*1($b_ptr), $a1
+        mov    $a0, $t0
+       adc     8*2($b_ptr), $a2
+       adc     8*3($b_ptr), $a3
+        mov    $a1, $t1
+       adc     \$0, $t4
+
+       sub     8*0($a_ptr), $a0
+        mov    $a2, $t2
+       sbb     8*1($a_ptr), $a1
+       sbb     8*2($a_ptr), $a2
+        mov    $a3, $t3
+       sbb     8*3($a_ptr), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop %r13
+       pop %r12
+       ret
+.size  ecp_nistz256_add,.-ecp_nistz256_add
+
+################################################################################
+# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
+.globl ecp_nistz256_sub
+.type  ecp_nistz256_sub,\@function,3
+.align 32
+ecp_nistz256_sub:
+       push    %r12
+       push    %r13
+
+       mov     8*0($a_ptr), $a0
+       xor     $t4, $t4
+       mov     8*1($a_ptr), $a1
+       mov     8*2($a_ptr), $a2
+       mov     8*3($a_ptr), $a3
+       lea     .Lpoly(%rip), $a_ptr
+
+       sub     8*0($b_ptr), $a0
+       sbb     8*1($b_ptr), $a1
+        mov    $a0, $t0
+       sbb     8*2($b_ptr), $a2
+       sbb     8*3($b_ptr), $a3
+        mov    $a1, $t1
+       sbb     \$0, $t4
+
+       add     8*0($a_ptr), $a0
+        mov    $a2, $t2
+       adc     8*1($a_ptr), $a1
+       adc     8*2($a_ptr), $a2
+        mov    $a3, $t3
+       adc     8*3($a_ptr), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop %r13
+       pop %r12
+       ret
+.size  ecp_nistz256_sub,.-ecp_nistz256_sub
+
+################################################################################
+# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+.globl ecp_nistz256_neg
+.type  ecp_nistz256_neg,\@function,2
+.align 32
+ecp_nistz256_neg:
+       push    %r12
+       push    %r13
+
+       xor     $a0, $a0
+       xor     $a1, $a1
+       xor     $a2, $a2
+       xor     $a3, $a3
+       xor     $t4, $t4
+
+       sub     8*0($a_ptr), $a0
+       sbb     8*1($a_ptr), $a1
+       sbb     8*2($a_ptr), $a2
+        mov    $a0, $t0
+       sbb     8*3($a_ptr), $a3
+       lea     .Lpoly(%rip), $a_ptr
+        mov    $a1, $t1
+       sbb     \$0, $t4
+
+       add     8*0($a_ptr), $a0
+        mov    $a2, $t2
+       adc     8*1($a_ptr), $a1
+       adc     8*2($a_ptr), $a2
+        mov    $a3, $t3
+       adc     8*3($a_ptr), $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       pop %r13
+       pop %r12
+       ret
+.size  ecp_nistz256_neg,.-ecp_nistz256_neg
+___
+}
+{
+# Register map for the Montgomery multiplication/squaring routines.
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
+my ($poly1,$poly3)=($acc6,$acc7);
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_to_mont(
+#   uint64_t res[4],
+#   uint64_t in[4]);
+#
+# to_mont(res, in) == mul_mont(res, in, RR) with RR = 2^512 mod p, i.e. it
+# yields in*2^256 mod p; implemented as a tail-jump into .Lmul_mont.
+.globl ecp_nistz256_to_mont
+.type  ecp_nistz256_to_mont,\@function,2
+.align 32
+ecp_nistz256_to_mont:
+___
+$code.=<<___   if ($addx);
+       # Capability probe for the mulx/adx path, consumed at .Lmul_mont.
+       # NOTE(review): exact bit meaning of 0x80100 assumed to be BMI2+ADX -
+       # confirm against the ia32cap layout.
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+       lea     .LRR(%rip), $b_org
+       jmp     .Lmul_mont
+.size  ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+################################################################################
+# void ecp_nistz256_mul_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   uint64_t b[4]);
+
+.globl ecp_nistz256_mul_mont
+.type  ecp_nistz256_mul_mont,\@function,3
+.align 32
+ecp_nistz256_mul_mont:
+___
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+.Lmul_mont:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+___
+$code.=<<___   if ($addx);
+       cmp     \$0x80100, %ecx
+       je      .Lmul_montx
+___
+$code.=<<___;
+       mov     $b_org, $b_ptr
+       mov     8*0($b_org), %rax
+       mov     8*0($a_ptr), $acc1
+       mov     8*1($a_ptr), $acc2
+       mov     8*2($a_ptr), $acc3
+       mov     8*3($a_ptr), $acc4
+
+       call    __ecp_nistz256_mul_montq
+___
+$code.=<<___   if ($addx);
+       jmp     .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+       mov     $b_org, $b_ptr
+       mov     8*0($b_org), %rdx
+       mov     8*0($a_ptr), $acc1
+       mov     8*1($a_ptr), $acc2
+       mov     8*2($a_ptr), $acc3
+       mov     8*3($a_ptr), $acc4
+       lea     -128($a_ptr), $a_ptr    # control u-op density
+
+       call    __ecp_nistz256_mul_montx
+___
+$code.=<<___;
+.Lmul_mont_done:
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type  __ecp_nistz256_mul_montq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_montq:
+       ########################################################################
+       # Multiply a by b[0]
+       mov     %rax, $t1
+       mulq    $acc1
+       mov     .Lpoly+8*1(%rip),$poly1
+       mov     %rax, $acc0
+       mov     $t1, %rax
+       mov     %rdx, $acc1
+
+       mulq    $acc2
+       mov     .Lpoly+8*3(%rip),$poly3
+       add     %rax, $acc1
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc2
+
+       mulq    $acc3
+       add     %rax, $acc2
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc3
+
+       mulq    $acc4
+       add     %rax, $acc3
+        mov    $acc0, %rax
+       adc     \$0, %rdx
+       xor     $acc5, $acc5
+       mov     %rdx, $acc4
+
+       ########################################################################
+       # First reduction step
+       # Basically now we want to multiply acc[0] by p256,
+       # and add the result to the acc.
+       # Due to the special form of p256 we do some optimizations
+       #
+       # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
+       # then we add acc[0] and get acc[0] x 2^64
+
+       mulq    $poly1
+       xor     $t0, $t0
+       add     $acc0, $acc1            # +=acc[0]*2^64
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       mov     $acc0, %rax
+
+       # acc[0] x p256[2] = 0
+       adc     %rdx, $acc2
+       adc     \$0, $t0
+
+       mulq    $poly3
+       xor     $acc0, $acc0
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+        mov    8*1($b_ptr), %rax
+       adc     %rdx, $acc4
+       adc     \$0, $acc5
+
+       ########################################################################
+       # Multiply by b[1]
+       mov     %rax, $t1
+       mulq    8*0($a_ptr)
+       add     %rax, $acc1
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*1($a_ptr)
+       add     $t0, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*2($a_ptr)
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*3($a_ptr)
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+        mov    $acc1, %rax
+       adc     %rdx, $acc5
+       adc     \$0, $acc0
+
+       ########################################################################
+       # Second reduction step 
+       mulq    $poly1
+       xor     $t0, $t0
+       add     $acc1, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $acc1, %rax
+       adc     %rdx, $acc3
+       adc     \$0, $t0
+
+       mulq    $poly3
+       xor     $acc1, $acc1
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+        mov    8*2($b_ptr), %rax
+       adc     %rdx, $acc5
+       adc     \$0, $acc0
+
+       ########################################################################
+       # Multiply by b[2]
+       mov     %rax, $t1
+       mulq    8*0($a_ptr)
+       add     %rax, $acc2
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*1($a_ptr)
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*2($a_ptr)
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*3($a_ptr)
+       add     $t0, $acc5
+       adc     \$0, %rdx
+       add     %rax, $acc5
+        mov    $acc2, %rax
+       adc     %rdx, $acc0
+       adc     \$0, $acc1
+
+       ########################################################################
+       # Third reduction step  
+       mulq    $poly1
+       xor     $t0, $t0
+       add     $acc2, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $acc2, %rax
+       adc     %rdx, $acc4
+       adc     \$0, $t0
+
+       mulq    $poly3
+       xor     $acc2, $acc2
+       add     $t0, $acc5
+       adc     \$0, %rdx
+       add     %rax, $acc5
+        mov    8*3($b_ptr), %rax
+       adc     %rdx, $acc0
+       adc     \$0, $acc1
+
+       ########################################################################
+       # Multiply by b[3]
+       mov     %rax, $t1
+       mulq    8*0($a_ptr)
+       add     %rax, $acc3
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*1($a_ptr)
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*2($a_ptr)
+       add     $t0, $acc5
+       adc     \$0, %rdx
+       add     %rax, $acc5
+       mov     $t1, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    8*3($a_ptr)
+       add     $t0, $acc0
+       adc     \$0, %rdx
+       add     %rax, $acc0
+        mov    $acc3, %rax
+       adc     %rdx, $acc1
+       adc     \$0, $acc2
+
+       ########################################################################
+       # Final reduction step  
+       mulq    $poly1
+       #xor    $t0, $t0
+       add     $acc3, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $acc3, %rax
+       adc     %rdx, $acc5
+       #adc    \$0, $t0                # doesn't overflow
+
+       mulq    $poly3
+       #add    $t0, $acc0
+       #adc    \$0, %rdx
+        mov    $acc4, $t0
+       add     %rax, $acc0
+       adc     %rdx, $acc1
+        mov    $acc5, $t1
+       adc     \$0, $acc2
+
+       ########################################################################        
+       # Branch-less conditional subtraction of P
+       sub     \$-1, $acc4             # .Lpoly[0]
+        mov    $acc0, $t2
+       sbb     $poly1, $acc5           # .Lpoly[1]
+       sbb     \$0, $acc0              # .Lpoly[2]
+        mov    $acc1, $t3
+       sbb     $poly3, $acc1           # .Lpoly[3]
+       neg     $acc2
+
+       cmovnc  $t0, $acc4
+       cmovnc  $t1, $acc5
+       mov     $acc4, 8*0($r_ptr)
+       cmovnc  $t2, $acc0
+       mov     $acc5, 8*1($r_ptr)
+       cmovnc  $t3, $acc1
+       mov     $acc0, 8*2($r_ptr)
+       mov     $acc1, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+################################################################################
+# void ecp_nistz256_sqr_mont(
+#   uint64_t res[4],
+#   uint64_t a[4]);
+
+# we optimize the square according to S.Gueron and V.Krasnov,
+# "Speeding up Big-Number Squaring"
+#
+# Run-time dispatcher: when the capability word masked with 0x80100 keeps
+# all those bits set (presumably the BMI2+ADX feature bits -- confirm
+# against the OPENSSL_ia32cap_P bit layout), branch to the mulx/adcx-based
+# __ecp_nistz256_sqr_montx, otherwise use the mulq-based
+# __ecp_nistz256_sqr_montq.  Both helpers expect a[0..3] pre-loaded into
+# the registers set up below.
+.globl ecp_nistz256_sqr_mont
+.type  ecp_nistz256_sqr_mont,\@function,2
+.align 32
+ecp_nistz256_sqr_mont:
+___
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+___
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+___
+$code.=<<___   if ($addx);
+       cmp     \$0x80100, %ecx
+       je      .Lsqr_montx
+___
+$code.=<<___;
+       mov     8*0($a_ptr), %rax
+       mov     8*1($a_ptr), $acc6
+       mov     8*2($a_ptr), $acc7
+       mov     8*3($a_ptr), $acc0
+
+       call    __ecp_nistz256_sqr_montq
+___
+$code.=<<___   if ($addx);
+       jmp     .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+       mov     8*0($a_ptr), %rdx
+       mov     8*1($a_ptr), $acc6
+       mov     8*2($a_ptr), $acc7
+       mov     8*3($a_ptr), $acc0
+       lea     -128($a_ptr), $a_ptr    # control u-op density
+
+       call    __ecp_nistz256_sqr_montx
+___
+$code.=<<___;
+.Lsqr_mont_done:
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+# Montgomery squaring modulo the P-256 prime, mulq-based path.
+# On entry: %rax = a[0], $acc6 = a[1], $acc7 = a[2], $acc0 = a[3],
+# $a_ptr -> a[], $r_ptr -> result.  Computes the 512-bit square with
+# cross products formed once and doubled (Gueron/Krasnov), performs
+# four reduction iterations against .Lpoly, then a branch-less
+# conditional subtraction of the modulus.
+# NOTE(review): clobbers $a_ptr -- it is reused below to hold .Lpoly[1].
+.type  __ecp_nistz256_sqr_montq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sqr_montq:
+       mov     %rax, $acc5
+       mulq    $acc6                   # a[1]*a[0]
+       mov     %rax, $acc1
+       mov     $acc7, %rax
+       mov     %rdx, $acc2
+
+       mulq    $acc5                   # a[0]*a[2]
+       add     %rax, $acc2
+       mov     $acc0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc3
+
+       mulq    $acc5                   # a[0]*a[3]
+       add     %rax, $acc3
+        mov    $acc7, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc4
+
+       #################################
+       mulq    $acc6                   # a[1]*a[2]
+       add     %rax, $acc3
+       mov     $acc0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    $acc6                   # a[1]*a[3]
+       add     %rax, $acc4
+        mov    $acc0, %rax
+       adc     \$0, %rdx
+       add     $t1, $acc4
+       mov     %rdx, $acc5
+       adc     \$0, $acc5
+
+       #################################
+       mulq    $acc7                   # a[2]*a[3]
+       xor     $acc7, $acc7
+       add     %rax, $acc5
+        mov    8*0($a_ptr), %rax
+       mov     %rdx, $acc6
+       adc     \$0, $acc6
+
+       # double all cross products in one carry chain
+       add     $acc1, $acc1            # acc1:6<<1
+       adc     $acc2, $acc2
+       adc     $acc3, $acc3
+       adc     $acc4, $acc4
+       adc     $acc5, $acc5
+       adc     $acc6, $acc6
+       adc     \$0, $acc7
+
+       # add in the diagonal terms a[i]^2
+       mulq    %rax
+       mov     %rax, $acc0
+       mov     8*1($a_ptr), %rax
+       mov     %rdx, $t0
+
+       mulq    %rax
+       add     $t0, $acc1
+       adc     %rax, $acc2
+       mov     8*2($a_ptr), %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    %rax
+       add     $t0, $acc3
+       adc     %rax, $acc4
+       mov     8*3($a_ptr), %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       mulq    %rax
+       add     $t0, $acc5
+       adc     %rax, $acc6
+        mov    $acc0, %rax
+       adc     %rdx, $acc7
+
+       # $a_ptr and $t1 are recycled as modulus limbs from here on
+       mov     .Lpoly+8*1(%rip), $a_ptr
+       mov     .Lpoly+8*3(%rip), $t1
+
+       ##########################################
+       # Now the reduction
+       # First iteration
+       mulq    $a_ptr
+       #xor    $t0, $t0
+       add     $acc0, $acc1
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       mov     $acc0, %rax
+       adc     %rdx, $acc2     # doesn't overflow
+       #adc    \$0, $t0
+
+       mulq    $t1
+       xor     $acc0, $acc0
+       #add    $t0, $acc3
+       #adc    \$0, %rdx
+       add     %rax, $acc3
+        mov    $acc1, %rax
+       adc     %rdx, $acc4
+       adc     \$0, $acc0
+
+       ##########################################
+       # Second iteration
+       mulq    $a_ptr
+       #xor    $t0, $t0
+       add     $acc1, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $acc1, %rax
+       adc     %rdx, $acc3     # doesn't overflow
+       #adc    \$0, $t0
+
+       mulq    $t1
+       xor     $acc1, $acc1
+       #add    $t0, $acc4
+       #adc    \$0, %rdx
+       add     %rax, $acc4
+        mov    $acc2, %rax
+       adc     %rdx, $acc0
+       adc     \$0, $acc1
+
+       ##########################################
+       # Third iteration
+       mulq    $a_ptr
+       #xor    $t0, $t0
+       add     $acc2, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $acc2, %rax
+       adc     %rdx, $acc4     # doesn't overflow
+       #adc    \$0, $t0
+
+       mulq    $t1
+       xor     $acc2, $acc2
+       #add    $t0, $acc0
+       #adc    \$0, %rdx
+       add     %rax, $acc0
+        mov    $acc3, %rax
+       adc     %rdx, $acc1
+       adc     \$0, $acc2
+
+       ###########################################
+       # Last iteration
+       mulq    $a_ptr
+       #xor    $t0, $t0
+       add     $acc3, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $acc3, %rax
+       adc     %rdx, $acc0     # doesn't overflow
+       #adc    \$0, $t0
+
+       mulq    $t1
+       xor     $acc3, $acc3
+       #add    $t0, $acc1
+       #adc    \$0, %rdx
+       add     %rax, $acc1
+       adc     %rdx, $acc2
+       adc     \$0, $acc3
+
+       ############################################
+       # Add the rest of the acc
+       add     $acc0, $acc5
+        mov    $acc4, $acc0
+       adc     $acc1, $acc6
+       adc     $acc2, $acc7
+        mov    $acc5, $acc1
+       adc     \$0, $acc3
+
+       # branch-less conditional subtraction of P
+       sub     \$-1, $acc4             # .Lpoly[0]
+        mov    $acc6, $acc2
+       sbb     $a_ptr, $acc5           # .Lpoly[1]
+       sbb     \$0, $acc6              # .Lpoly[2]
+        mov    $acc7, $t0
+       sbb     $t1, $acc7              # .Lpoly[3]
+       neg     $acc3
+
+       cmovnc  $acc0, $acc4
+       cmovnc  $acc1, $acc5
+       mov     $acc4, 8*0($r_ptr)
+       cmovnc  $acc2, $acc6
+       mov     $acc5, 8*1($r_ptr)
+       cmovnc  $t0, $acc7
+       mov     $acc6, 8*2($r_ptr)
+       mov     $acc7, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+___
+
+if ($addx) {
+$code.=<<___;
+# Montgomery multiplication modulo the P-256 prime, BMI2/ADX path.
+# Interleaves two independent carry chains (adcx uses CF, adox uses OF)
+# with flag-preserving mulx.  $a_ptr is addressed at offset +128, i.e.
+# the caller passes $a_ptr-128 (u-op density trick; see the matching
+# lea in the dispatchers).
+# NOTE(review): on entry %rdx is presumably b[0] and $acc1..$acc4 hold
+# a[0..3] -- the front end that sets this up is outside this hunk;
+# confirm against ecp_nistz256_mul_mont.
+.type  __ecp_nistz256_mul_montx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_montx:
+       ########################################################################
+       # Multiply by b[0]
+       mulx    $acc1, $acc0, $acc1
+       mulx    $acc2, $t0, $acc2
+       mov     \$32, $poly1
+       xor     $acc5, $acc5            # cf=0
+       mulx    $acc3, $t1, $acc3
+       mov     .Lpoly+8*3(%rip), $poly3
+       adc     $t0, $acc1
+       mulx    $acc4, $t0, $acc4
+        mov    $acc0, %rdx
+       adc     $t1, $acc2
+        shlx   $poly1,$acc0,$t1
+       adc     $t0, $acc3
+        shrx   $poly1,$acc0,$t0
+       adc     \$0, $acc4
+
+       ########################################################################
+       # First reduction step
+       # $t1:$t0 = $acc0 shifted by 32 bits, folded into the two middle limbs
+       xor     $acc0, $acc0            # $acc0=0,cf=0,of=0
+       adox    $t1, $acc1
+       adox    $t0, $acc2
+
+       mulx    $poly3, $t0, $t1
+        mov    8*1($b_ptr), %rdx
+       adox    $t0, $acc3
+       adcx    $t1, $acc4
+
+       adox    $acc0, $acc4
+       adcx    $acc0, $acc5            # cf=0
+       adox    $acc0, $acc5            # of=0
+
+       ########################################################################
+       # Multiply by b[1]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc1
+       adox    $t1, $acc2
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc1, %rdx
+       adcx    $t0, $acc4
+        shlx   $poly1, $acc1, $t0
+       adox    $t1, $acc5
+        shrx   $poly1, $acc1, $t1
+
+       adcx    $acc0, $acc5
+       adox    $acc0, $acc0
+       adc     \$0, $acc0
+
+       ########################################################################
+       # Second reduction step
+       xor     $acc1 ,$acc1            # $acc1=0,cf=0,of=0
+       adox    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    $poly3, $t0, $t1
+        mov    8*2($b_ptr), %rdx
+       adox    $t0, $acc4
+       adcx    $t1, $acc5
+
+       adox    $acc1, $acc5
+       adcx    $acc1, $acc0            # cf=0
+       adox    $acc1, $acc0            # of=0
+
+       ########################################################################
+       # Multiply by b[2]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc2, %rdx
+       adcx    $t0, $acc5
+        shlx   $poly1, $acc2, $t0
+       adox    $t1, $acc0
+        shrx   $poly1, $acc2, $t1
+
+       adcx    $acc1, $acc0
+       adox    $acc1, $acc1
+       adc     \$0, $acc1
+
+       ########################################################################
+       # Third reduction step
+       xor     $acc2, $acc2            # $acc2=0,cf=0,of=0
+       adox    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    $poly3, $t0, $t1
+        mov    8*3($b_ptr), %rdx
+       adox    $t0, $acc5
+       adcx    $t1, $acc0
+
+       adox    $acc2, $acc0
+       adcx    $acc2, $acc1            # cf=0
+       adox    $acc2, $acc1            # of=0
+
+       ########################################################################
+       # Multiply by b[3]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc5
+       adox    $t1, $acc0
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc3, %rdx
+       adcx    $t0, $acc0
+        shlx   $poly1, $acc3, $t0
+       adox    $t1, $acc1
+        shrx   $poly1, $acc3, $t1
+
+       adcx    $acc2, $acc1
+       adox    $acc2, $acc2
+       adc     \$0, $acc2
+
+       ########################################################################
+       # Fourth reduction step
+       xor     $acc3, $acc3            # $acc3=0,cf=0,of=0
+       adox    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    $poly3, $t0, $t1
+        mov    $acc4, $t2
+       mov     .Lpoly+8*1(%rip), $poly1
+       adcx    $t0, $acc0
+       adox    $t1, $acc1
+        mov    $acc5, $t3
+
+       adcx    $acc3, $acc1
+       adox    $acc3, $acc2
+       adc     \$0, $acc2
+        mov    $acc0, $t0
+
+       ########################################################################
+       # Branch-less conditional subtraction of P
+       # $t2,$t3,$t0,$t1 carry the unreduced limbs; `bt` loads CF from
+       # the 2^256 carry bit so cmovnc restores them when no wrap occurred
+       xor     %eax, %eax
+       sbb     \$-1, $acc4             # .Lpoly[0]
+       sbb     $poly1, $acc5           # .Lpoly[1]
+       sbb     \$0, $acc0              # .Lpoly[2]
+        mov    $acc1, $t1
+       sbb     $poly3, $acc1           # .Lpoly[3]
+
+       bt      \$0,$acc2
+       cmovnc  $t2, $acc4
+       cmovnc  $t3, $acc5
+       mov     $acc4, 8*0($r_ptr)
+       cmovnc  $t0, $acc0
+       mov     $acc5, 8*1($r_ptr)
+       cmovnc  $t1, $acc1
+       mov     $acc0, 8*2($r_ptr)
+       mov     $acc1, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+# Montgomery squaring modulo the P-256 prime, BMI2/ADX path.
+# On entry: %rdx = a[0], $acc6 = a[1], $acc7 = a[2], $acc0 = a[3],
+# $a_ptr biased by -128 by the caller (see .Lsqr_montx).
+# NOTE(review): clobbers $a_ptr -- reused first as the shift count 32
+# and later to hold .Lpoly[1].
+.type  __ecp_nistz256_sqr_montx,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sqr_montx:
+       mulx    $acc6, $acc1, $acc2     # a[0]*a[1]
+       mulx    $acc7, $t0, $acc3       # a[0]*a[2]
+       xor     %eax, %eax
+       adc     $t0, $acc2
+       mulx    $acc0, $t1, $acc4       # a[0]*a[3]
+        mov    $acc6, %rdx
+       adc     $t1, $acc3
+       adc     \$0, $acc4
+       xor     $acc5, $acc5            # $acc5=0,cf=0,of=0
+
+       #################################
+       mulx    $acc7, $t0, $t1         # a[1]*a[2]
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    $acc0, $t0, $t1         # a[1]*a[3]
+        mov    $acc7, %rdx
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+       adc     \$0, $acc5
+
+       #################################
+       # double the cross products and add the diagonals a[i]^2,
+       # interleaved on the two carry chains
+       mulx    $acc0, $t0, $acc6       # a[2]*a[3]
+        mov    8*0+128($a_ptr), %rdx
+       xor     $acc7, $acc7            # $acc7=0,cf=0,of=0
+        adcx   $acc1, $acc1            # acc1:6<<1
+       adox    $t0, $acc5
+        adcx   $acc2, $acc2
+       adox    $acc7, $acc6            # of=0
+
+       mulx    %rdx, $acc0, $t1
+       mov     8*1+128($a_ptr), %rdx
+        adcx   $acc3, $acc3
+       adox    $t1, $acc1
+        adcx   $acc4, $acc4
+       mulx    %rdx, $t0, $t4
+       mov     8*2+128($a_ptr), %rdx
+        adcx   $acc5, $acc5
+       adox    $t0, $acc2
+        adcx   $acc6, $acc6
+       .byte   0x67
+       mulx    %rdx, $t0, $t1
+       mov     8*3+128($a_ptr), %rdx
+       adox    $t4, $acc3
+        adcx   $acc7, $acc7
+       adox    $t0, $acc4
+        mov    \$32, $a_ptr
+       adox    $t1, $acc5
+       .byte   0x67,0x67
+       mulx    %rdx, $t0, $t4
+        mov    $acc0, %rdx
+       adox    $t0, $acc6
+        shlx   $a_ptr, $acc0, $t0
+       adox    $t4, $acc7
+        shrx   $a_ptr, $acc0, $t4
+        mov    .Lpoly+8*3(%rip), $t1
+
+       # reduction step 1
+       xor     $acc0, $acc0
+       adcx    $t0, $acc1
+       adcx    $t4, $acc2
+
+       mulx    $t1, $t0, $t4
+        mov    $acc1, %rdx
+       adcx    $t0, $acc3
+        shlx   $a_ptr, $acc1, $t0
+       adox    $t4, $acc0
+        shrx   $a_ptr, $acc1, $t4
+       adc     \$0, $acc0
+
+       # reduction step 2
+       xor     $acc1, $acc1
+       adcx    $t0, $acc2
+       adcx    $t4, $acc3
+
+       mulx    $t1, $t0, $t4
+        mov    $acc2, %rdx
+       adcx    $t0, $acc0
+        shlx   $a_ptr, $acc2, $t0
+       adox    $t4, $acc1
+        shrx   $a_ptr, $acc2, $t4
+       adc     \$0, $acc1
+
+       # reduction step 3
+       xor     $acc2, $acc2
+       adcx    $t0, $acc3
+       adcx    $t4, $acc0
+
+       mulx    $t1, $t0, $t4
+        mov    $acc3, %rdx
+       adcx    $t0, $acc1
+        shlx   $a_ptr, $acc3, $t0
+       adox    $t4, $acc2
+        shrx   $a_ptr, $acc3, $t4
+       adc     \$0, $acc2
+
+       # reduction step 4
+       xor     $acc3, $acc3
+       adcx    $t0, $acc0
+       adcx    $t4, $acc1
+
+       mulx    $t1, $t0, $t4
+       adcx    $t0, $acc2
+       adox    $t4, $acc3
+       adc     \$0, $acc3
+
+       xor     $t3, $t3                # cf=0
+       adc     $acc0, $acc4            # accumulate upper half
+        mov    .Lpoly+8*1(%rip), $a_ptr
+       adc     $acc1, $acc5
+        mov    $acc4, $acc0
+       adc     $acc2, $acc6
+       adc     $acc3, $acc7
+        mov    $acc5, $acc1
+       adc     \$0, $t3
+
+       # branch-less conditional subtraction of P ($t3 = 2^256 carry)
+       xor     %eax, %eax              # cf=0
+       sbb     \$-1, $acc4             # .Lpoly[0]
+        mov    $acc6, $acc2
+       sbb     $a_ptr, $acc5           # .Lpoly[1]
+       sbb     \$0, $acc6              # .Lpoly[2]
+        mov    $acc7, $acc3
+       sbb     $t1, $acc7              # .Lpoly[3]
+
+       bt      \$0,$t3
+       cmovnc  $acc0, $acc4
+       cmovnc  $acc1, $acc5
+       mov     $acc4, 8*0($r_ptr)
+       cmovnc  $acc2, $acc6
+       mov     $acc5, 8*1($r_ptr)
+       cmovnc  $acc3, $acc7
+       mov     $acc6, 8*2($r_ptr)
+       mov     $acc7, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+___
+}
+}
+{
+my ($r_ptr,$in_ptr)=("%rdi","%rsi");
+my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
+my ($t0,$t1)=("%rcx","%rsi");
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_from_mont(
+#   uint64_t res[4],
+#   uint64_t in[4]);
+# This one performs Montgomery multiplication by 1, so we only need the reduction
+# (four iterations folding in .Lpoly[1] and .Lpoly[3]), followed by a
+# branch-less conditional subtraction of the modulus via an all-ones mask.
+
+.globl ecp_nistz256_from_mont
+.type  ecp_nistz256_from_mont,\@function,2
+.align 32
+ecp_nistz256_from_mont:
+       push    %r12
+       push    %r13
+
+       mov     8*0($in_ptr), %rax
+       mov     8*1($in_ptr), $acc1
+       mov     8*2($in_ptr), $acc2
+       mov     8*3($in_ptr), $acc3
+       lea     .Lpoly(%rip), $in_ptr
+       xor     $acc4, $acc4
+       mov     %rax, $acc0
+
+       #########################################
+       # First iteration
+       mulq    1*8($in_ptr)
+       xor     $t0, $t0
+       add     $acc0, $acc1
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       mov     $acc0, %rax
+       adc     %rdx, $acc2
+       adc     \$0, $t0
+
+       mulq    3*8($in_ptr)
+       xor     $acc0, $acc0
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+        mov    $acc1, %rax
+       adc     %rdx, $acc4
+       adc     \$0, $acc0
+
+       #########################################
+       # Second iteration
+       mulq    1*8($in_ptr)
+       xor     $t0, $t0
+       add     $acc1, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $acc1, %rax
+       adc     %rdx, $acc3
+       adc     \$0, $t0
+
+       mulq    3*8($in_ptr)
+       xor     $acc1, $acc1
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+        mov    $acc2, %rax
+       adc     %rdx, $acc0
+       adc     \$0, $acc1
+
+       ##########################################
+       # Third iteration
+       mulq    1*8($in_ptr)
+       xor     $t0, $t0
+       add     $acc2, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $acc2, %rax
+       adc     %rdx, $acc4
+       adc     \$0, $t0
+
+       mulq    3*8($in_ptr)
+       xor     $acc2, $acc2
+       add     $t0, $acc0
+       adc     \$0, %rdx
+       add     %rax, $acc0
+        mov    $acc3, %rax
+       adc     %rdx, $acc1
+       adc     \$0, $acc2
+
+       ###########################################
+       # Last iteration
+       mulq    1*8($in_ptr)
+       xor     $t0, $t0
+       add     $acc3, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $acc3, %rax
+       adc     %rdx, $acc0
+       adc     \$0, $t0
+
+       mulq    3*8($in_ptr)
+       add     $t0, $acc1
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       adc     %rdx, $acc2
+       # $acc3 := 0 or all-ones, depending on the final carry --
+       # used as a mask for the conditional subtraction below
+       sbb     $acc3, $acc3
+
+       # load the modulus and mask it with $acc3, so P is subtracted
+       # only when the reduction overflowed 2^256 (branch-less)
+       mov     0*8($in_ptr), %rax
+       mov     1*8($in_ptr), %rdx
+       mov     2*8($in_ptr), $t0
+       mov     3*8($in_ptr), $t1
+
+       and     $acc3, %rax
+       and     $acc3, %rdx
+       and     $acc3, $t0
+       and     $acc3, $t1
+
+       sub     %rax, $acc4
+       sbb     %rdx, $acc0
+       mov     $acc4, 8*0($r_ptr)
+       sbb     $t0, $acc1
+       mov     $acc0, 8*1($r_ptr)
+       sbb     $t1, $acc2
+       mov     $acc1, 8*2($r_ptr)
+       mov     $acc2, 8*3($r_ptr)
+
+       pop     %r13
+       pop     %r12
+       ret
+.size  ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+___
+}
+{
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
+my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
+my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+# Constant-time table lookup: scans all 16 entries (6 x 16 bytes each,
+# one projective point) regardless of index, AND-ing each entry with an
+# equality mask and OR-ing into the accumulators, so memory access
+# patterns do not depend on the secret index.
+.globl ecp_nistz256_select_w5
+.type  ecp_nistz256_select_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_select_w5:
+___
+$code.=<<___   if ($avx>1);
+       mov     OPENSSL_ia32cap_P+8(%rip), %eax
+       test    \$`1<<5`, %eax
+       jnz     .Lavx2_select_w5
+___
+$code.=<<___   if ($win64);
+       lea     -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_select_w5:
+       .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
+       .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6, -0x20(%rax)
+       .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7, -0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8, 0(%rax)
+       .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9, 0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10, 0x20(%rax)
+       .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11, 0x30(%rax)
+       .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12, 0x40(%rax)
+       .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13, 0x50(%rax)
+       .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14, 0x60(%rax)
+       .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+       movdqa  .LOne(%rip), $ONE
+       movd    $index, $INDEX
+
+       pxor    $Ra, $Ra
+       pxor    $Rb, $Rb
+       pxor    $Rc, $Rc
+       pxor    $Rd, $Rd
+       pxor    $Re, $Re
+       pxor    $Rf, $Rf
+
+       movdqa  $ONE, $M0
+       pshufd  \$0, $INDEX, $INDEX     # broadcast index to all lanes
+
+       mov     \$16, %rax
+.Lselect_loop_sse_w5:
+
+       # $TMP0 = all-ones iff the running counter $M0 equals the index
+       movdqa  $M0, $TMP0
+       paddd   $ONE, $M0
+       pcmpeqd $INDEX, $TMP0
+
+       movdqa  16*0($in_t), $T0a
+       movdqa  16*1($in_t), $T0b
+       movdqa  16*2($in_t), $T0c
+       movdqa  16*3($in_t), $T0d
+       movdqa  16*4($in_t), $T0e
+       movdqa  16*5($in_t), $T0f
+       lea 16*6($in_t), $in_t
+
+       pand    $TMP0, $T0a
+       pand    $TMP0, $T0b
+       por     $T0a, $Ra
+       pand    $TMP0, $T0c
+       por     $T0b, $Rb
+       pand    $TMP0, $T0d
+       por     $T0c, $Rc
+       pand    $TMP0, $T0e
+       por     $T0d, $Rd
+       pand    $TMP0, $T0f
+       por     $T0e, $Re
+       por     $T0f, $Rf
+
+       dec     %rax
+       jnz     .Lselect_loop_sse_w5
+
+       movdqu  $Ra, 16*0($val)
+       movdqu  $Rb, 16*1($val)
+       movdqu  $Rc, 16*2($val)
+       movdqu  $Rd, 16*3($val)
+       movdqu  $Re, 16*4($val)
+       movdqu  $Rf, 16*5($val)
+___
+$code.=<<___   if ($win64);
+       movaps  (%rsp), %xmm6
+       movaps  0x10(%rsp), %xmm7
+       movaps  0x20(%rsp), %xmm8
+       movaps  0x30(%rsp), %xmm9
+       movaps  0x40(%rsp), %xmm10
+       movaps  0x50(%rsp), %xmm11
+       movaps  0x60(%rsp), %xmm12
+       movaps  0x70(%rsp), %xmm13
+       movaps  0x80(%rsp), %xmm14
+       movaps  0x90(%rsp), %xmm15
+       lea     0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_select_w5:
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+################################################################################
+# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+# Constant-time table lookup, as select_w5 above but over 64 entries of
+# 4 x 16 bytes each (one affine point); all entries are touched
+# regardless of the secret index.
+.globl ecp_nistz256_select_w7
+.type  ecp_nistz256_select_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_select_w7:
+___
+$code.=<<___   if ($avx>1);
+       mov     OPENSSL_ia32cap_P+8(%rip), %eax
+       test    \$`1<<5`, %eax
+       jnz     .Lavx2_select_w7
+___
+$code.=<<___   if ($win64);
+       lea     -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_select_w7:
+       .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
+       .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6, -0x20(%rax)
+       .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7, -0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8, 0(%rax)
+       .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9, 0x10(%rax)
+       .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10, 0x20(%rax)
+       .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11, 0x30(%rax)
+       .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12, 0x40(%rax)
+       .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13, 0x50(%rax)
+       .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14, 0x60(%rax)
+       .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+       movdqa  .LOne(%rip), $M0
+       movd    $index, $INDEX
+
+       pxor    $Ra, $Ra
+       pxor    $Rb, $Rb
+       pxor    $Rc, $Rc
+       pxor    $Rd, $Rd
+
+       movdqa  $M0, $ONE
+       pshufd  \$0, $INDEX, $INDEX     # broadcast index to all lanes
+       mov     \$64, %rax
+
+.Lselect_loop_sse_w7:
+       # $TMP0 = all-ones iff the running counter $M0 equals the index
+       movdqa  $M0, $TMP0
+       paddd   $ONE, $M0
+       movdqa  16*0($in_t), $T0a
+       movdqa  16*1($in_t), $T0b
+       pcmpeqd $INDEX, $TMP0
+       movdqa  16*2($in_t), $T0c
+       movdqa  16*3($in_t), $T0d
+       lea     16*4($in_t), $in_t
+
+       pand    $TMP0, $T0a
+       pand    $TMP0, $T0b
+       por     $T0a, $Ra
+       pand    $TMP0, $T0c
+       por     $T0b, $Rb
+       pand    $TMP0, $T0d
+       por     $T0c, $Rc
+       prefetcht0      255($in_t)
+       por     $T0d, $Rd
+
+       dec     %rax
+       jnz     .Lselect_loop_sse_w7
+
+       movdqu  $Ra, 16*0($val)
+       movdqu  $Rb, 16*1($val)
+       movdqu  $Rc, 16*2($val)
+       movdqu  $Rd, 16*3($val)
+___
+$code.=<<___   if ($win64);
+       movaps  (%rsp), %xmm6
+       movaps  0x10(%rsp), %xmm7
+       movaps  0x20(%rsp), %xmm8
+       movaps  0x30(%rsp), %xmm9
+       movaps  0x40(%rsp), %xmm10
+       movaps  0x50(%rsp), %xmm11
+       movaps  0x60(%rsp), %xmm12
+       movaps  0x70(%rsp), %xmm13
+       movaps  0x80(%rsp), %xmm14
+       movaps  0x90(%rsp), %xmm15
+       lea     0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_select_w7:
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+___
+___
+}
+if ($avx>1) {
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
+my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
+my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
+
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
+# AVX2 variant of the constant-time w5 lookup: 8 iterations, each
+# comparing two counters ($M0, $M1, advanced by .LTwo) against the
+# broadcast index, cover all 16 entries of 3 x 32 bytes; every entry is
+# read regardless of the secret index.
+.type  ecp_nistz256_avx2_select_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_avx2_select_w5:
+.Lavx2_select_w5:
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       lea     -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_avx2_select_w5:
+       .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
+       .byte   0xc5,0xf8,0x29,0x70,0xe0        #vmovaps %xmm6, -0x20(%rax)
+       .byte   0xc5,0xf8,0x29,0x78,0xf0        #vmovaps %xmm7, -0x10(%rax)
+       .byte   0xc5,0x78,0x29,0x40,0x00        #vmovaps %xmm8, 8(%rax)
+       .byte   0xc5,0x78,0x29,0x48,0x10        #vmovaps %xmm9, 0x10(%rax)
+       .byte   0xc5,0x78,0x29,0x50,0x20        #vmovaps %xmm10, 0x20(%rax)
+       .byte   0xc5,0x78,0x29,0x58,0x30        #vmovaps %xmm11, 0x30(%rax)
+       .byte   0xc5,0x78,0x29,0x60,0x40        #vmovaps %xmm12, 0x40(%rax)
+       .byte   0xc5,0x78,0x29,0x68,0x50        #vmovaps %xmm13, 0x50(%rax)
+       .byte   0xc5,0x78,0x29,0x70,0x60        #vmovaps %xmm14, 0x60(%rax)
+       .byte   0xc5,0x78,0x29,0x78,0x70        #vmovaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+       vmovdqa .LTwo(%rip), $TWO
+
+       vpxor   $Ra, $Ra, $Ra
+       vpxor   $Rb, $Rb, $Rb
+       vpxor   $Rc, $Rc, $Rc
+
+       vmovdqa .LOne(%rip), $M0
+       vmovdqa .LTwo(%rip), $M1
+
+       # broadcast index: $Ra is all-zero here, so vpermd replicates
+       # lane 0 of $INDEX (%ymm1, loaded via its xmm alias) to all lanes
+       vmovd   $index, %xmm1
+       vpermd  $INDEX, $Ra, $INDEX
+
+       mov     \$8, %rax
+.Lselect_loop_avx2_w5:
+
+       vmovdqa 32*0($in_t), $T0a
+       vmovdqa 32*1($in_t), $T0b
+       vmovdqa 32*2($in_t), $T0c
+
+       vmovdqa 32*3($in_t), $T1a
+       vmovdqa 32*4($in_t), $T1b
+       vmovdqa 32*5($in_t), $T1c
+
+       vpcmpeqd        $INDEX, $M0, $TMP0
+       vpcmpeqd        $INDEX, $M1, $TMP1
+
+       vpaddd  $TWO, $M0, $M0
+       vpaddd  $TWO, $M1, $M1
+       lea     32*6($in_t), $in_t
+
+       vpand   $TMP0, $T0a, $T0a
+       vpand   $TMP0, $T0b, $T0b
+       vpand   $TMP0, $T0c, $T0c
+       vpand   $TMP1, $T1a, $T1a
+       vpand   $TMP1, $T1b, $T1b
+       vpand   $TMP1, $T1c, $T1c
+
+       vpxor   $T0a, $Ra, $Ra
+       vpxor   $T0b, $Rb, $Rb
+       vpxor   $T0c, $Rc, $Rc
+       vpxor   $T1a, $Ra, $Ra
+       vpxor   $T1b, $Rb, $Rb
+       vpxor   $T1c, $Rc, $Rc
+
+       dec %rax
+       jnz .Lselect_loop_avx2_w5
+
+       vmovdqu $Ra, 32*0($val)
+       vmovdqu $Rb, 32*1($val)
+       vmovdqu $Rc, 32*2($val)
+       vzeroupper
+___
+$code.=<<___   if ($win64);
+       movaps  (%rsp), %xmm6
+       movaps  0x10(%rsp), %xmm7
+       movaps  0x20(%rsp), %xmm8
+       movaps  0x30(%rsp), %xmm9
+       movaps  0x40(%rsp), %xmm10
+       movaps  0x50(%rsp), %xmm11
+       movaps  0x60(%rsp), %xmm12
+       movaps  0x70(%rsp), %xmm13
+       movaps  0x80(%rsp), %xmm14
+       movaps  0x90(%rsp), %xmm15
+       lea     0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+___
+}
+if ($avx>1) {
+# ABI-dependent argument registers: Win64 passes (rcx, rdx, r8d), SysV (rdi, rsi, edx).
+my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+# %ymm register map: $Ra/$Rb accumulate the selected point, $M0-$M2 are
+# running index comparands (stepped by $THREE each iteration), $T?a/$T?b
+# hold table words and $TMP? the vpcmpeqd masks.
+my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
+my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
+my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
+my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
+
+# Constant-time gather for the window-7 table: the loop touches every one
+# of the 64 entries (3 per iteration x 21 iterations, plus one tail entry
+# after the loop) and masks in the entry whose position equals $index, so
+# the memory access pattern is independent of the secret index.
+$code.=<<___;
+
+################################################################################
+# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_avx2_select_w7
+.type  ecp_nistz256_avx2_select_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_avx2_select_w7:
+.Lavx2_select_w7:
+       vzeroupper
+___
+# Win64: xmm6-xmm15 are callee-saved — spill them; hand-encoded .byte
+# sequences keep the SEH prologue byte-exact for the unwind annotations.
+$code.=<<___   if ($win64);
+       lea     -0x88(%rsp), %rax
+.LSEH_begin_ecp_nistz256_avx2_select_w7:
+       .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
+       .byte   0xc5,0xf8,0x29,0x70,0xe0        #vmovaps %xmm6, -0x20(%rax)
+       .byte   0xc5,0xf8,0x29,0x78,0xf0        #vmovaps %xmm7, -0x10(%rax)
+       .byte   0xc5,0x78,0x29,0x40,0x00        #vmovaps %xmm8, 8(%rax)
+       .byte   0xc5,0x78,0x29,0x48,0x10        #vmovaps %xmm9, 0x10(%rax)
+       .byte   0xc5,0x78,0x29,0x50,0x20        #vmovaps %xmm10, 0x20(%rax)
+       .byte   0xc5,0x78,0x29,0x58,0x30        #vmovaps %xmm11, 0x30(%rax)
+       .byte   0xc5,0x78,0x29,0x60,0x40        #vmovaps %xmm12, 0x40(%rax)
+       .byte   0xc5,0x78,0x29,0x68,0x50        #vmovaps %xmm13, 0x50(%rax)
+       .byte   0xc5,0x78,0x29,0x70,0x60        #vmovaps %xmm14, 0x60(%rax)
+       .byte   0xc5,0x78,0x29,0x78,0x70        #vmovaps %xmm15, 0x70(%rax)
+___
+$code.=<<___;
+       vmovdqa .LThree(%rip), $THREE
+
+       vpxor   $Ra, $Ra, $Ra
+       vpxor   $Rb, $Rb, $Rb
+
+       vmovdqa .LOne(%rip), $M0
+       vmovdqa .LTwo(%rip), $M1
+       vmovdqa .LThree(%rip), $M2
+
+       vmovd   $index, %xmm1
+       vpermd  $INDEX, $Ra, $INDEX
+       # Skip index = 0, because it is implicitly the point at infinity
+
+       mov     \$21, %rax
+.Lselect_loop_avx2_w7:
+
+       vmovdqa 32*0($in_t), $T0a
+       vmovdqa 32*1($in_t), $T0b
+
+       vmovdqa 32*2($in_t), $T1a
+       vmovdqa 32*3($in_t), $T1b
+
+       vmovdqa 32*4($in_t), $T2a
+       vmovdqa 32*5($in_t), $T2b
+
+       vpcmpeqd        $INDEX, $M0, $TMP0
+       vpcmpeqd        $INDEX, $M1, $TMP1
+       vpcmpeqd        $INDEX, $M2, $TMP2
+
+       vpaddd  $THREE, $M0, $M0
+       vpaddd  $THREE, $M1, $M1
+       vpaddd  $THREE, $M2, $M2
+       lea     32*6($in_t), $in_t
+
+       vpand   $TMP0, $T0a, $T0a
+       vpand   $TMP0, $T0b, $T0b
+       vpand   $TMP1, $T1a, $T1a
+       vpand   $TMP1, $T1b, $T1b
+       vpand   $TMP2, $T2a, $T2a
+       vpand   $TMP2, $T2b, $T2b
+
+       vpxor   $T0a, $Ra, $Ra
+       vpxor   $T0b, $Rb, $Rb
+       vpxor   $T1a, $Ra, $Ra
+       vpxor   $T1b, $Rb, $Rb
+       vpxor   $T2a, $Ra, $Ra
+       vpxor   $T2b, $Rb, $Rb
+
+       dec %rax
+       jnz .Lselect_loop_avx2_w7
+
+
+       vmovdqa 32*0($in_t), $T0a
+       vmovdqa 32*1($in_t), $T0b
+
+       vpcmpeqd        $INDEX, $M0, $TMP0
+
+       vpand   $TMP0, $T0a, $T0a
+       vpand   $TMP0, $T0b, $T0b
+
+       vpxor   $T0a, $Ra, $Ra
+       vpxor   $T0b, $Rb, $Rb
+
+       vmovdqu $Ra, 32*0($val)
+       vmovdqu $Rb, 32*1($val)
+       vzeroupper
+___
+# Win64 epilogue: restore the callee-saved xmm registers and the stack.
+$code.=<<___   if ($win64);
+       movaps  (%rsp), %xmm6
+       movaps  0x10(%rsp), %xmm7
+       movaps  0x20(%rsp), %xmm8
+       movaps  0x30(%rsp), %xmm9
+       movaps  0x40(%rsp), %xmm10
+       movaps  0x50(%rsp), %xmm11
+       movaps  0x60(%rsp), %xmm12
+       movaps  0x70(%rsp), %xmm13
+       movaps  0x80(%rsp), %xmm14
+       movaps  0x90(%rsp), %xmm15
+       lea     0xa8(%rsp), %rsp
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+___
+$code.=<<___;
+       ret
+.size  ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+___
+} else {
+# Assembler too old for AVX2: emit a stub that faults (ud2) if ever called.
+$code.=<<___;
+.globl ecp_nistz256_avx2_select_w7
+.type  ecp_nistz256_avx2_select_w7,\@function,3
+.align 32
+ecp_nistz256_avx2_select_w7:
+       .byte   0x0f,0x0b       # ud2
+       ret
+.size  ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+___
+}
+{{{
+########################################################################
+# This block implements higher level point_double, point_add and
+# point_add_affine. The key to performance in this case is to allow
+# out-of-order execution logic to overlap computations from next step
+# with tail processing from current step. By using tailored calling
+# sequence we minimize inter-step overhead to give processor better
+# shot at overlapping operations...
+#
+# You will notice that input data is copied to stack. Trouble is that
+# there are no registers to spare for holding original pointers and
+# reloading them, pointers, would create undesired dependencies on
+# effective addresses calculation paths. In other words it's all done
+# to favour out-of-order execution logic.
+#                                              <appro@openssl.org>
+
+# Register roles for the point arithmetic below: result/operand pointers,
+# eight accumulators in r8-r15, and scratch.  Note that $t3 and $t4 both
+# alias $acc4 — the code never needs them live at the same time.
+my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
+# $poly1/$poly3 cache .Lpoly+8*1 and .Lpoly+8*3 — the two non-trivial
+# 64-bit words of the P-256 prime (words 0 and 2 are -1 and 0).
+my ($poly1,$poly3)=($acc6,$acc7);
+
+# Return an assembly snippet that stages operands for __ecp_nistz256_mul_mont$x:
+# loads multiplicand $a into $acc1..$acc4, the multiplier's first word into
+# $src0 with its address in $b_ptr, and a (possibly biased) pointer to $a in
+# $a_ptr.  $bias is -128 on the ADX path ($src0 ne "%rax") so that subsequent
+# displacements fit in a signed byte, shortening instruction encodings.
+# NOTE: the empty prototype is inert — callers invoke this as
+# &load_for_mul(...), which bypasses prototype checking.
+sub load_for_mul () {
+my ($a,$b,$src0) = @_;
+my $bias = $src0 eq "%rax" ? 0 : -128;
+
+"      mov     $b, $src0
+       lea     $b, $b_ptr
+       mov     8*0+$a, $acc1
+       mov     8*1+$a, $acc2
+       lea     $bias+$a, $a_ptr
+       mov     8*2+$a, $acc3
+       mov     8*3+$a, $acc4"
+}
+
+# Return an assembly snippet that stages an operand for __ecp_nistz256_sqr_mont$x:
+# loads the four words of $a into $src0/$acc6/$acc7/$acc0 and a (possibly
+# biased) pointer to $a into $a_ptr.  As in load_for_mul, $bias = -128 on the
+# ADX path keeps later displacements within a signed byte.
+# NOTE: the empty prototype is inert — callers invoke this as &load_for_sqr(...).
+sub load_for_sqr () {
+my ($a,$src0) = @_;
+my $bias = $src0 eq "%rax" ? 0 : -128;
+
+"      mov     8*0+$a, $src0
+       mov     8*1+$a, $acc6
+       lea     $bias+$a, $a_ptr
+       mov     8*2+$a, $acc7
+       mov     8*3+$a, $acc0"
+}
+
+                                                                       {
+########################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+
+$code.=<<___;
+.type  __ecp_nistz256_add_toq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_add_toq:
+       add     8*0($b_ptr), $a0
+       adc     8*1($b_ptr), $a1
+        mov    $a0, $t0
+       adc     8*2($b_ptr), $a2
+       adc     8*3($b_ptr), $a3
+        mov    $a1, $t1
+       sbb     $t4, $t4
+
+       sub     \$-1, $a0
+        mov    $a2, $t2
+       sbb     $poly1, $a1
+       sbb     \$0, $a2
+        mov    $a3, $t3
+       sbb     $poly3, $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
+.type  __ecp_nistz256_sub_fromq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_sub_fromq:
+       sub     8*0($b_ptr), $a0
+       sbb     8*1($b_ptr), $a1
+        mov    $a0, $t0
+       sbb     8*2($b_ptr), $a2
+       sbb     8*3($b_ptr), $a3
+        mov    $a1, $t1
+       sbb     $t4, $t4
+
+       add     \$-1, $a0
+        mov    $a2, $t2
+       adc     $poly1, $a1
+       adc     \$0, $a2
+        mov    $a3, $t3
+       adc     $poly3, $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
+.type  __ecp_nistz256_subq,\@abi-omnipotent
+.align 32
+__ecp_nistz256_subq:
+       sub     $a0, $t0
+       sbb     $a1, $t1
+        mov    $t0, $a0
+       sbb     $a2, $t2
+       sbb     $a3, $t3
+        mov    $t1, $a1
+       sbb     $t4, $t4
+
+       add     \$-1, $t0
+        mov    $t2, $a2
+       adc     $poly1, $t1
+       adc     \$0, $t2
+        mov    $t3, $a3
+       adc     $poly3, $t3
+       test    $t4, $t4
+
+       cmovnz  $t0, $a0
+       cmovnz  $t1, $a1
+       cmovnz  $t2, $a2
+       cmovnz  $t3, $a3
+
+       ret
+.size  __ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type  __ecp_nistz256_mul_by_2q,\@abi-omnipotent
+.align 32
+__ecp_nistz256_mul_by_2q:
+       add     $a0, $a0                # a0:a3+a0:a3
+       adc     $a1, $a1
+        mov    $a0, $t0
+       adc     $a2, $a2
+       adc     $a3, $a3
+        mov    $a1, $t1
+       sbb     $t4, $t4
+
+       sub     \$-1, $a0
+        mov    $a2, $t2
+       sbb     $poly1, $a1
+       sbb     \$0, $a2
+        mov    $a3, $t3
+       sbb     $poly3, $a3
+       test    $t4, $t4
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       mov     $a0, 8*0($r_ptr)
+       cmovz   $t2, $a2
+       mov     $a1, 8*1($r_ptr)
+       cmovz   $t3, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+
+       ret
+.size  __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+___
+                                                                       }
+# Emit ecp_nistz256_point_double (when $x ne "x") or the ADX flavour
+# ecp_nistz256_point_doublex (when $x eq "x").  The generic entry may
+# runtime-dispatch to the ADX path when $addx is enabled.  Stack frame:
+# five 32-byte temporaries S, M, Zsqr, in_x, tmp0 plus 8 bytes of
+# alignment padding.
+# NOTE: the empty prototype is inert — invoked as &gen_double("q").
+sub gen_double () {
+    my $x = shift;
+    my ($src0,$sfx,$bias);
+    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
+
+    if ($x ne "x") {
+       # Generic x86_64 flavour; no suffix, no pointer bias.
+       $src0 = "%rax";
+       $sfx  = "";
+       $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_double
+.type  ecp_nistz256_point_double,\@function,2
+.align 32
+ecp_nistz256_point_double:
+___
+# If built with ADX support, branch to the mulx/adcx/adox path when the
+# CPU advertises both BMI2 and ADX (bit pattern 0x80100 in word 8 of
+# OPENSSL_ia32cap_P).
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+       cmp     \$0x80100, %ecx
+       je      .Lpoint_doublex
+___
+    } else {
+       # ADX flavour: "x"-suffixed helpers, -128 pointer bias for short
+       # displacement encodings.
+       $src0 = "%rdx";
+       $sfx  = "x";
+       $bias = 128;
+
+$code.=<<___;
+.type  ecp_nistz256_point_doublex,\@function,2
+.align 32
+ecp_nistz256_point_doublex:
+.Lpoint_doublex:
+___
+    }
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$32*5+8, %rsp
+
+       movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$a_ptr.x
+       mov     $a_ptr, $b_ptr                  # backup copy
+       movdqu  0x10($a_ptr), %xmm1
+        mov    0x20+8*0($a_ptr), $acc4         # load in_y in "5-4-0-1" order
+        mov    0x20+8*1($a_ptr), $acc5
+        mov    0x20+8*2($a_ptr), $acc0
+        mov    0x20+8*3($a_ptr), $acc1
+        mov    .Lpoly+8*1(%rip), $poly1
+        mov    .Lpoly+8*3(%rip), $poly3
+       movdqa  %xmm0, $in_x(%rsp)
+       movdqa  %xmm1, $in_x+0x10(%rsp)
+       lea     0x20($r_ptr), $acc2
+       lea     0x40($r_ptr), $acc3
+       movq    $r_ptr, %xmm0
+       movq    $acc2, %xmm1
+       movq    $acc3, %xmm2
+
+       lea     $S(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(S, in_y);
+
+       mov     0x40+8*0($a_ptr), $src0
+       mov     0x40+8*1($a_ptr), $acc6
+       mov     0x40+8*2($a_ptr), $acc7
+       mov     0x40+8*3($a_ptr), $acc0
+       lea     0x40-$bias($a_ptr), $a_ptr
+       lea     $Zsqr(%rsp), $r_ptr
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Zsqr, in_z);
+
+       `&load_for_sqr("$S(%rsp)", "$src0")`
+       lea     $S(%rsp), $r_ptr
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(S, S);
+
+       mov     0x20($b_ptr), $src0             # $b_ptr is still valid
+       mov     0x40+8*0($b_ptr), $acc1
+       mov     0x40+8*1($b_ptr), $acc2
+       mov     0x40+8*2($b_ptr), $acc3
+       mov     0x40+8*3($b_ptr), $acc4
+       lea     0x40-$bias($b_ptr), $a_ptr
+       lea     0x20($b_ptr), $b_ptr
+       movq    %xmm2, $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, in_z, in_y);
+       call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(res_z, res_z);
+
+       mov     $in_x+8*0(%rsp), $acc4          # "5-4-0-1" order
+       mov     $in_x+8*1(%rsp), $acc5
+       lea     $Zsqr(%rsp), $b_ptr
+       mov     $in_x+8*2(%rsp), $acc0
+       mov     $in_x+8*3(%rsp), $acc1
+       lea     $M(%rsp), $r_ptr
+       call    __ecp_nistz256_add_to$x         # p256_add(M, in_x, Zsqr);
+
+       mov     $in_x+8*0(%rsp), $acc4          # "5-4-0-1" order
+       mov     $in_x+8*1(%rsp), $acc5
+       lea     $Zsqr(%rsp), $b_ptr
+       mov     $in_x+8*2(%rsp), $acc0
+       mov     $in_x+8*3(%rsp), $acc1
+       lea     $Zsqr(%rsp), $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(Zsqr, in_x, Zsqr);
+
+       `&load_for_sqr("$S(%rsp)", "$src0")`
+       movq    %xmm1, $r_ptr
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(res_y, S);
+___
+{      
+######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
+# operate in 4-5-6-7 "name space" that matches squaring output
+#
+my ($poly1,$poly3)=($a_ptr,$t1);
+my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
+
+# Inline halving mod p: if the value is odd, add p first (the cmovz pair
+# keeps the original when the low bit is clear), then shift the 256-bit
+# value right by one, funnelling the carry bit $t4 into the top word.
+$code.=<<___;
+       xor     $t4, $t4
+       mov     $a0, $t0
+       add     \$-1, $a0
+       mov     $a1, $t1
+       adc     $poly1, $a1
+       mov     $a2, $t2
+       adc     \$0, $a2
+       mov     $a3, $t3
+       adc     $poly3, $a3
+       adc     \$0, $t4
+       xor     $a_ptr, $a_ptr          # borrow $a_ptr
+       test    \$1, $t0
+
+       cmovz   $t0, $a0
+       cmovz   $t1, $a1
+       cmovz   $t2, $a2
+       cmovz   $t3, $a3
+       cmovz   $a_ptr, $t4
+
+       mov     $a1, $t0                # a0:a3>>1
+       shr     \$1, $a0
+       shl     \$63, $t0
+       mov     $a2, $t1
+       shr     \$1, $a1
+       or      $t0, $a0
+       shl     \$63, $t1
+       mov     $a3, $t2
+       shr     \$1, $a2
+       or      $t1, $a1
+       shl     \$63, $t2
+       mov     $a0, 8*0($r_ptr)
+       shr     \$1, $a3
+       mov     $a1, 8*1($r_ptr)
+       shl     \$63, $t4
+       or      $t2, $a2
+       or      $t4, $a3
+       mov     $a2, 8*2($r_ptr)
+       mov     $a3, 8*3($r_ptr)
+___
+}
+# Remainder of the doubling formula: M = 3*(X+Z^2)*(X-Z^2), S = 4*X*Y^2,
+# res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*Y^4.
+$code.=<<___;
+       `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
+       lea     $M(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(M, M, Zsqr);
+
+       lea     $tmp0(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_by_2$x
+
+       lea     $M(%rsp), $b_ptr
+       lea     $M(%rsp), $r_ptr
+       call    __ecp_nistz256_add_to$x         # p256_mul_by_3(M, M);
+
+       `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
+       lea     $S(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S, S, in_x);
+
+       lea     $tmp0(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(tmp0, S);
+
+       `&load_for_sqr("$M(%rsp)", "$src0")`
+       movq    %xmm0, $r_ptr
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(res_x, M);
+
+       lea     $tmp0(%rsp), $b_ptr
+       mov     $acc6, $acc0                    # harmonize sqr output and sub input
+       mov     $acc7, $acc1
+       mov     $a_ptr, $poly1
+       mov     $t1, $poly3
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, tmp0);
+
+       mov     $S+8*0(%rsp), $t0
+       mov     $S+8*1(%rsp), $t1
+       mov     $S+8*2(%rsp), $t2
+       mov     $S+8*3(%rsp), $acc2             # "4-5-0-1" order
+       lea     $S(%rsp), $r_ptr
+       call    __ecp_nistz256_sub$x            # p256_sub(S, S, res_x);
+
+       mov     $M(%rsp), $src0
+       lea     $M(%rsp), $b_ptr
+       mov     $acc4, $acc6                    # harmonize sub output and mul input
+       xor     %ecx, %ecx
+       mov     $acc4, $S+8*0(%rsp)             # have to save:-(       
+       mov     $acc5, $acc2
+       mov     $acc5, $S+8*1(%rsp)
+       cmovz   $acc0, $acc3
+       mov     $acc0, $S+8*2(%rsp)
+       lea     $S-$bias(%rsp), $a_ptr
+       cmovz   $acc1, $acc4
+       mov     $acc1, $S+8*3(%rsp)
+       mov     $acc6, $acc1
+       lea     $S(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S, S, M);
+
+       movq    %xmm1, $b_ptr
+       movq    %xmm1, $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, S, res_y);
+
+       add     \$32*5+8, %rsp
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
+___
+}
+# Emit the generic (non-ADX) flavour.
+&gen_double("q");
+
+# Emit ecp_nistz256_point_add (when $x ne "x") or the ADX flavour
+# ecp_nistz256_point_addx.  Full Jacobian point addition; stack frame holds
+# eighteen 32-byte temporaries (H, Hsqr, R, Rsqr, Hcub, U1, U2, S1, S2,
+# res_*, in1_*, in2_*).  Z1sqr/Z2sqr reuse the Hsqr/Rsqr slots.
+# NOTE: the empty prototype is inert — invoked as &gen_add("q").
+sub gen_add () {
+    my $x = shift;
+    my ($src0,$sfx,$bias);
+    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
+       $U1,$U2,$S1,$S2,
+       $res_x,$res_y,$res_z,
+       $in1_x,$in1_y,$in1_z,
+       $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
+    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
+
+    if ($x ne "x") {
+       # Generic x86_64 flavour; no suffix, no pointer bias.
+       $src0 = "%rax";
+       $sfx  = "";
+       $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_add
+.type  ecp_nistz256_point_add,\@function,3
+.align 32
+ecp_nistz256_point_add:
+___
+# Runtime dispatch to the ADX path when the CPU supports BMI2+ADX.
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+       cmp     \$0x80100, %ecx
+       je      .Lpoint_addx
+___
+    } else {
+       # ADX flavour: "x"-suffixed helpers, -128 pointer bias.
+       $src0 = "%rdx";
+       $sfx  = "x";
+       $bias = 128;
+
+$code.=<<___;
+.type  ecp_nistz256_point_addx,\@function,3
+.align 32
+ecp_nistz256_point_addx:
+.Lpoint_addx:
+___
+    }
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$32*18+8, %rsp
+
+       movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$a_ptr
+       movdqu  0x10($a_ptr), %xmm1
+       movdqu  0x20($a_ptr), %xmm2
+       movdqu  0x30($a_ptr), %xmm3
+       movdqu  0x40($a_ptr), %xmm4
+       movdqu  0x50($a_ptr), %xmm5
+       mov     $a_ptr, $b_ptr                  # reassign
+       mov     $b_org, $a_ptr                  # reassign
+       movdqa  %xmm0, $in1_x(%rsp)
+       movdqa  %xmm1, $in1_x+0x10(%rsp)
+       por     %xmm0, %xmm1
+       movdqa  %xmm2, $in1_y(%rsp)
+       movdqa  %xmm3, $in1_y+0x10(%rsp)
+       por     %xmm2, %xmm3
+       movdqa  %xmm4, $in1_z(%rsp)
+       movdqa  %xmm5, $in1_z+0x10(%rsp)
+       por     %xmm1, %xmm3
+
+       movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$b_ptr
+        pshufd \$0xb1, %xmm3, %xmm5
+       movdqu  0x10($a_ptr), %xmm1
+       movdqu  0x20($a_ptr), %xmm2
+        por    %xmm3, %xmm5
+       movdqu  0x30($a_ptr), %xmm3
+        mov    0x40+8*0($a_ptr), $src0         # load original in2_z
+        mov    0x40+8*1($a_ptr), $acc6
+        mov    0x40+8*2($a_ptr), $acc7
+        mov    0x40+8*3($a_ptr), $acc0
+       movdqa  %xmm0, $in2_x(%rsp)
+        pshufd \$0x1e, %xmm5, %xmm4
+       movdqa  %xmm1, $in2_x+0x10(%rsp)
+       por     %xmm0, %xmm1
+        movq   $r_ptr, %xmm0                   # save $r_ptr
+       movdqa  %xmm2, $in2_y(%rsp)
+       movdqa  %xmm3, $in2_y+0x10(%rsp)
+       por     %xmm2, %xmm3
+        por    %xmm4, %xmm5
+        pxor   %xmm4, %xmm4
+       por     %xmm1, %xmm3
+
+       lea     0x40-$bias($a_ptr), $a_ptr      # $a_ptr is still valid
+        mov    $src0, $in2_z+8*0(%rsp)         # make in2_z copy
+        mov    $acc6, $in2_z+8*1(%rsp)
+        mov    $acc7, $in2_z+8*2(%rsp)
+        mov    $acc0, $in2_z+8*3(%rsp)
+       lea     $Z2sqr(%rsp), $r_ptr            # Z2^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z2sqr, in2_z);
+
+       pcmpeqd %xmm4, %xmm5
+       pshufd  \$0xb1, %xmm3, %xmm4
+       por     %xmm3, %xmm4
+       pshufd  \$0, %xmm5, %xmm5               # in1infty
+       pshufd  \$0x1e, %xmm4, %xmm3
+       por     %xmm3, %xmm4
+       pxor    %xmm3, %xmm3
+       pcmpeqd %xmm3, %xmm4
+       pshufd  \$0, %xmm4, %xmm4               # in2infty
+        mov    0x40+8*0($b_ptr), $src0         # load original in1_z
+        mov    0x40+8*1($b_ptr), $acc6
+        mov    0x40+8*2($b_ptr), $acc7
+        mov    0x40+8*3($b_ptr), $acc0
+
+       lea     0x40-$bias($b_ptr), $a_ptr
+       lea     $Z1sqr(%rsp), $r_ptr            # Z1^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z1sqr, in1_z);
+
+       `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
+       lea     $S1(%rsp), $r_ptr               # S1 = Z2^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S1, Z2sqr, in2_z);
+
+       `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr               # S2 = Z1^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Z1sqr, in1_z);
+
+       `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
+       lea     $S1(%rsp), $r_ptr               # S1 = Y1*Z2^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S1, S1, in1_y);
+
+       `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr               # S2 = Y2*Z1^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S2, in2_y);
+
+       lea     $S1(%rsp), $b_ptr
+       lea     $R(%rsp), $r_ptr                # R = S2 - S1
+       call    __ecp_nistz256_sub_from$x       # p256_sub(R, S2, S1);
+
+       or      $acc5, $acc4                    # see if result is zero
+       movdqa  %xmm4, %xmm2
+       or      $acc0, $acc4
+       or      $acc1, $acc4
+       por     %xmm5, %xmm2                    # in1infty || in2infty
+       movq    $acc4, %xmm3
+
+       `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
+       lea     $U1(%rsp), $r_ptr               # U1 = X1*Z2^2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U1, in1_x, Z2sqr);
+
+       `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
+       lea     $U2(%rsp), $r_ptr               # U2 = X2*Z1^2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, in2_x, Z1sqr);
+
+       lea     $U1(%rsp), $b_ptr
+       lea     $H(%rsp), $r_ptr                # H = U2 - U1
+       call    __ecp_nistz256_sub_from$x       # p256_sub(H, U2, U1);
+
+       or      $acc5, $acc4                    # see if result is zero
+       or      $acc0, $acc4
+       or      $acc1, $acc4
+
+       .byte   0x3e                            # predict taken
+       jnz     .Ladd_proceed$x                 # is_equal(U1,U2)?
+       movq    %xmm2, $acc0
+       movq    %xmm3, $acc1
+       test    $acc0, $acc0
+       jnz     .Ladd_proceed$x                 # (in1infty || in2infty)?
+       test    $acc1, $acc1
+       jz      .Ladd_proceed$x                 # is_equal(S1,S2)?
+
+       movq    %xmm0, $r_ptr                   # restore $r_ptr
+       pxor    %xmm0, %xmm0
+       movdqu  %xmm0, 0x00($r_ptr)
+       movdqu  %xmm0, 0x10($r_ptr)
+       movdqu  %xmm0, 0x20($r_ptr)
+       movdqu  %xmm0, 0x30($r_ptr)
+       movdqu  %xmm0, 0x40($r_ptr)
+       movdqu  %xmm0, 0x50($r_ptr)
+       jmp     .Ladd_done$x
+
+.align 32
+.Ladd_proceed$x:
+       `&load_for_sqr("$R(%rsp)", "$src0")`
+       lea     $Rsqr(%rsp), $r_ptr             # R^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Rsqr, R);
+
+       `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
+       lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, H, in1_z);
+
+       `&load_for_sqr("$H(%rsp)", "$src0")`
+       lea     $Hsqr(%rsp), $r_ptr             # H^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Hsqr, H);
+
+       `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
+       lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, res_z, in2_z);
+
+       `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
+       lea     $Hcub(%rsp), $r_ptr             # H^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(Hcub, Hsqr, H);
+
+       `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
+       lea     $U2(%rsp), $r_ptr               # U1*H^2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, U1, Hsqr);
+___
+{
+#######################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+my ($poly1, $poly3)=($acc6,$acc7);
+
+# Inlined mul_by_2 (doubling U1*H^2 in registers) fused with the following
+# subtractions: res_x = R^2 - H^3 - 2*U1*H^2, res_y staged as U2 - res_x.
+$code.=<<___;
+       #lea    $U2(%rsp), $a_ptr
+       #lea    $Hsqr(%rsp), $r_ptr     # 2*U1*H^2
+       #call   __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+
+       add     $acc0, $acc0            # a0:a3+a0:a3
+       lea     $Rsqr(%rsp), $a_ptr
+       adc     $acc1, $acc1
+        mov    $acc0, $t0
+       adc     $acc2, $acc2
+       adc     $acc3, $acc3
+        mov    $acc1, $t1
+       sbb     $t4, $t4
+
+       sub     \$-1, $acc0
+        mov    $acc2, $t2
+       sbb     $poly1, $acc1
+       sbb     \$0, $acc2
+        mov    $acc3, $t3
+       sbb     $poly3, $acc3
+       test    $t4, $t4
+
+       cmovz   $t0, $acc0
+       mov     8*0($a_ptr), $t0
+       cmovz   $t1, $acc1
+       mov     8*1($a_ptr), $t1
+       cmovz   $t2, $acc2
+       mov     8*2($a_ptr), $t2
+       cmovz   $t3, $acc3
+       mov     8*3($a_ptr), $t3
+
+       call    __ecp_nistz256_sub$x            # p256_sub(res_x, Rsqr, Hsqr);
+
+       lea     $Hcub(%rsp), $b_ptr
+       lea     $res_x(%rsp), $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, Hcub);
+
+       mov     $U2+8*0(%rsp), $t0
+       mov     $U2+8*1(%rsp), $t1
+       mov     $U2+8*2(%rsp), $t2
+       mov     $U2+8*3(%rsp), $t3
+       lea     $res_y(%rsp), $r_ptr
+
+       call    __ecp_nistz256_sub$x            # p256_sub(res_y, U2, res_x);
+
+       mov     $acc0, 8*0($r_ptr)              # save the result, as
+       mov     $acc1, 8*1($r_ptr)              # __ecp_nistz256_sub doesn't
+       mov     $acc2, 8*2($r_ptr)
+       mov     $acc3, 8*3($r_ptr)
+___
+}
+# Finish res_y = R*(U1*H^2 - res_x) - S1*H^3, then blend each output
+# coordinate with the other input when one operand was the point at
+# infinity (branch-free pand/pandn selection on the in1infty/in2infty
+# masks kept in %xmm5/%xmm4).
+$code.=<<___;
+       `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S1, Hcub);
+
+       `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
+       lea     $res_y(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_y, R, res_y);
+
+       lea     $S2(%rsp), $b_ptr
+       lea     $res_y(%rsp), $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, res_y, S2);
+
+       movq    %xmm0, $r_ptr           # restore $r_ptr
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_z, in2_z, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_z(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_z+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    $in2_z(%rsp), %xmm2
+       pand    $in2_z+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_z, in1_z, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_z(%rsp), %xmm2
+       pand    $in1_z+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x40($r_ptr)
+       movdqu  %xmm3, 0x50($r_ptr)
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_x, in2_x, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_x(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_x+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    $in2_x(%rsp), %xmm2
+       pand    $in2_x+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_x, in1_x, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_x(%rsp), %xmm2
+       pand    $in1_x+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x00($r_ptr)
+       movdqu  %xmm3, 0x10($r_ptr)
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_y, in2_y, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_y(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_y+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    $in2_y(%rsp), %xmm2
+       pand    $in2_y+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_y, in1_y, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_y(%rsp), %xmm2
+       pand    $in1_y+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x20($r_ptr)
+       movdqu  %xmm3, 0x30($r_ptr)
+
+.Ladd_done$x:
+       add     \$32*18+8, %rsp
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
+___
+}
+# Emit the generic (non-ADX) flavour.
+&gen_add("q");
+
+sub gen_add_affine () {
+    my $x = shift;
+    my ($src0,$sfx,$bias);
+    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
+       $res_x,$res_y,$res_z,
+       $in1_x,$in1_y,$in1_z,
+       $in2_x,$in2_y)=map(32*$_,(0..14));
+    my $Z1sqr = $S2;
+
+    if ($x ne "x") {
+       $src0 = "%rax";
+       $sfx  = "";
+       $bias = 0;
+
+$code.=<<___;
+.globl ecp_nistz256_point_add_affine
+.type  ecp_nistz256_point_add_affine,\@function,3
+.align 32
+ecp_nistz256_point_add_affine:
+___
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+       cmp     \$0x80100, %ecx
+       je      .Lpoint_add_affinex
+___
+    } else {
+       $src0 = "%rdx";
+       $sfx  = "x";
+       $bias = 128;
+
+$code.=<<___;
+.type  ecp_nistz256_point_add_affinex,\@function,3
+.align 32
+ecp_nistz256_point_add_affinex:
+.Lpoint_add_affinex:
+___
+    }
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$32*15+8, %rsp
+
+       movdqu  0x00($a_ptr), %xmm0     # copy  *(P256_POINT *)$a_ptr
+       mov     $b_org, $b_ptr          # reassign
+       movdqu  0x10($a_ptr), %xmm1
+       movdqu  0x20($a_ptr), %xmm2
+       movdqu  0x30($a_ptr), %xmm3
+       movdqu  0x40($a_ptr), %xmm4
+       movdqu  0x50($a_ptr), %xmm5
+        mov    0x40+8*0($a_ptr), $src0 # load original in1_z
+        mov    0x40+8*1($a_ptr), $acc6
+        mov    0x40+8*2($a_ptr), $acc7
+        mov    0x40+8*3($a_ptr), $acc0
+       movdqa  %xmm0, $in1_x(%rsp)
+       movdqa  %xmm1, $in1_x+0x10(%rsp)
+       por     %xmm0, %xmm1
+       movdqa  %xmm2, $in1_y(%rsp)
+       movdqa  %xmm3, $in1_y+0x10(%rsp)
+       por     %xmm2, %xmm3
+       movdqa  %xmm4, $in1_z(%rsp)
+       movdqa  %xmm5, $in1_z+0x10(%rsp)
+       por     %xmm1, %xmm3
+
+       movdqu  0x00($b_ptr), %xmm0     # copy  *(P256_POINT_AFFINE *)$b_ptr
+        pshufd \$0xb1, %xmm3, %xmm5
+       movdqu  0x10($b_ptr), %xmm1
+       movdqu  0x20($b_ptr), %xmm2
+        por    %xmm3, %xmm5
+       movdqu  0x30($b_ptr), %xmm3
+       movdqa  %xmm0, $in2_x(%rsp)
+        pshufd \$0x1e, %xmm5, %xmm4
+       movdqa  %xmm1, $in2_x+0x10(%rsp)
+       por     %xmm0, %xmm1
+        movq   $r_ptr, %xmm0           # save $r_ptr
+       movdqa  %xmm2, $in2_y(%rsp)
+       movdqa  %xmm3, $in2_y+0x10(%rsp)
+       por     %xmm2, %xmm3
+        por    %xmm4, %xmm5
+        pxor   %xmm4, %xmm4
+       por     %xmm1, %xmm3
+
+       lea     0x40-$bias($a_ptr), $a_ptr      # $a_ptr is still valid
+       lea     $Z1sqr(%rsp), $r_ptr            # Z1^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z1sqr, in1_z);
+
+       pcmpeqd %xmm4, %xmm5
+       pshufd  \$0xb1, %xmm3, %xmm4
+        mov    0x00($b_ptr), $src0             # $b_ptr is still valid
+        #lea   0x00($b_ptr), $b_ptr
+        mov    $acc4, $acc1                    # harmonize sqr output and mul input
+       por     %xmm3, %xmm4
+       pshufd  \$0, %xmm5, %xmm5               # in1infty
+       pshufd  \$0x1e, %xmm4, %xmm3
+        mov    $acc5, $acc2
+       por     %xmm3, %xmm4
+       pxor    %xmm3, %xmm3
+        mov    $acc6, $acc3
+       pcmpeqd %xmm3, %xmm4
+       pshufd  \$0, %xmm4, %xmm4               # in2infty
+
+       lea     $Z1sqr-$bias(%rsp), $a_ptr
+       mov     $acc7, $acc4
+       lea     $U2(%rsp), $r_ptr               # U2 = X2*Z1^2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, Z1sqr, in2_x);
+
+       lea     $in1_x(%rsp), $b_ptr
+       lea     $H(%rsp), $r_ptr                # H = U2 - U1
+       call    __ecp_nistz256_sub_from$x       # p256_sub(H, U2, in1_x);
+
+       `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr               # S2 = Z1^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Z1sqr, in1_z);
+
+       `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
+       lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, H, in1_z);
+
+       `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr               # S2 = Y2*Z1^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S2, in2_y);
+
+       lea     $in1_y(%rsp), $b_ptr
+       lea     $R(%rsp), $r_ptr                # R = S2 - S1
+       call    __ecp_nistz256_sub_from$x       # p256_sub(R, S2, in1_y);
+
+       `&load_for_sqr("$H(%rsp)", "$src0")`
+       lea     $Hsqr(%rsp), $r_ptr             # H^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Hsqr, H);
+
+       `&load_for_sqr("$R(%rsp)", "$src0")`
+       lea     $Rsqr(%rsp), $r_ptr             # R^2
+       call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Rsqr, R);
+
+       `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
+       lea     $Hcub(%rsp), $r_ptr             # H^3
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(Hcub, Hsqr, H);
+
+       `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
+       lea     $U2(%rsp), $r_ptr               # U1*H^2
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, in1_x, Hsqr);
+___
+{
+#######################################################################
+# operate in 4-5-0-1 "name space" that matches multiplication output
+#
+my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
+my ($poly1, $poly3)=($acc6,$acc7);
+
+$code.=<<___;
+       #lea    $U2(%rsp), $a_ptr
+       #lea    $Hsqr(%rsp), $r_ptr     # 2*U1*H^2
+       #call   __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+
+       add     $acc0, $acc0            # a0:a3+a0:a3
+       lea     $Rsqr(%rsp), $a_ptr
+       adc     $acc1, $acc1
+        mov    $acc0, $t0
+       adc     $acc2, $acc2
+       adc     $acc3, $acc3
+        mov    $acc1, $t1
+       sbb     $t4, $t4
+
+       sub     \$-1, $acc0
+        mov    $acc2, $t2
+       sbb     $poly1, $acc1
+       sbb     \$0, $acc2
+        mov    $acc3, $t3
+       sbb     $poly3, $acc3
+       test    $t4, $t4
+
+       cmovz   $t0, $acc0
+       mov     8*0($a_ptr), $t0
+       cmovz   $t1, $acc1
+       mov     8*1($a_ptr), $t1
+       cmovz   $t2, $acc2
+       mov     8*2($a_ptr), $t2
+       cmovz   $t3, $acc3
+       mov     8*3($a_ptr), $t3
+
+       call    __ecp_nistz256_sub$x            # p256_sub(res_x, Rsqr, Hsqr);
+
+       lea     $Hcub(%rsp), $b_ptr
+       lea     $res_x(%rsp), $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, Hcub);
+
+       mov     $U2+8*0(%rsp), $t0
+       mov     $U2+8*1(%rsp), $t1
+       mov     $U2+8*2(%rsp), $t2
+       mov     $U2+8*3(%rsp), $t3
+       lea     $H(%rsp), $r_ptr
+
+       call    __ecp_nistz256_sub$x            # p256_sub(H, U2, res_x);
+
+       mov     $acc0, 8*0($r_ptr)              # save the result, as
+       mov     $acc1, 8*1($r_ptr)              # __ecp_nistz256_sub doesn't
+       mov     $acc2, 8*2($r_ptr)
+       mov     $acc3, 8*3($r_ptr)
+___
+}
+$code.=<<___;
+       `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
+       lea     $S2(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Hcub, in1_y);
+
+       `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
+       lea     $H(%rsp), $r_ptr
+       call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(H, H, R);
+
+       lea     $S2(%rsp), $b_ptr
+       lea     $res_y(%rsp), $r_ptr
+       call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, H, S2);
+
+       movq    %xmm0, $r_ptr           # restore $r_ptr
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_z, ONE, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_z(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_z+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    .LONE_mont(%rip), %xmm2
+       pand    .LONE_mont+0x10(%rip), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_z, in1_z, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_z(%rsp), %xmm2
+       pand    $in1_z+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x40($r_ptr)
+       movdqu  %xmm3, 0x50($r_ptr)
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_x, in2_x, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_x(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_x+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    $in2_x(%rsp), %xmm2
+       pand    $in2_x+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_x, in1_x, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_x(%rsp), %xmm2
+       pand    $in1_x+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x00($r_ptr)
+       movdqu  %xmm3, 0x10($r_ptr)
+
+       movdqa  %xmm5, %xmm0            # copy_conditional(res_y, in2_y, in1infty);
+       movdqa  %xmm5, %xmm1
+       pandn   $res_y(%rsp), %xmm0
+       movdqa  %xmm5, %xmm2
+       pandn   $res_y+0x10(%rsp), %xmm1
+       movdqa  %xmm5, %xmm3
+       pand    $in2_y(%rsp), %xmm2
+       pand    $in2_y+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+
+       movdqa  %xmm4, %xmm0            # copy_conditional(res_y, in1_y, in2infty);
+       movdqa  %xmm4, %xmm1
+       pandn   %xmm2, %xmm0
+       movdqa  %xmm4, %xmm2
+       pandn   %xmm3, %xmm1
+       movdqa  %xmm4, %xmm3
+       pand    $in1_y(%rsp), %xmm2
+       pand    $in1_y+0x10(%rsp), %xmm3
+       por     %xmm0, %xmm2
+       por     %xmm1, %xmm3
+       movdqu  %xmm2, 0x20($r_ptr)
+       movdqu  %xmm3, 0x30($r_ptr)
+
+       add     \$32*15+8, %rsp
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
+___
+}
+&gen_add_affine("q");
+
########################################################################
# AD*X magic
#
# When the assembler is new enough ($addx), a second, "x"-suffixed
# flavour of the field helpers and point routines is emitted alongside
# the plain "q" flavour generated above.
#
if ($addx) {                                                           {
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

# The four helpers below are the modular add/sub/double primitives in
# the 4-5-0-1 register assignment used by the ADX code path.  Each one
# accumulates with adc/sbb (capturing the final carry or borrow in a
# fifth limb), then speculatively adds or subtracts the modulus, and
# finally picks the reduced or unreduced value with cmov so the flow
# stays branch-free (constant-time).
$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	 mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	 mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	 mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	 mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	 mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	 mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	 mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3 ,$a3
	adc	\$-1, $t0
	 mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	 mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	 mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
									}
# Re-run the point-routine generators to emit the "x" (ADX) variants.
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
+}}}
+
# Expand the `...` (backtick) expressions embedded in the generated text,
# then emit the finished assembly on stdout.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Check close: buffered-write failures on STDOUT (e.g. full disk, broken
# pipe) only surface here; a silent failure would truncate the output.
close STDOUT or die "error closing STDOUT: $!";
index 99e161e..72df1fa 100644 (file)
@@ -1182,6 +1182,11 @@ void ERR_load_EC_strings(void);
 #define EC_F_NISTP224_PRE_COMP_NEW                      227
 #define EC_F_NISTP256_PRE_COMP_NEW                      236
 #define EC_F_NISTP521_PRE_COMP_NEW                      237
+#define EC_F_NISTZ256_GET_AFFINE_COORDINATES            240
+#define EC_F_NISTZ256_POINTS_MUL                        241
+#define EC_F_NISTZ256_POINTS_MUL_W                      242
+#define EC_F_NISTZ256_PRECOMPUTE_MULT                   243
+#define EC_F_NISTZ256_PRE_COMP_NEW                      244
 #define EC_F_O2I_ECPUBLICKEY                            152
 #define EC_F_OLD_EC_PRIV_DECODE                                 222
 #define EC_F_PKEY_EC_CTRL                               197
index 869a384..83785b9 100644 (file)
@@ -2353,11 +2353,15 @@ static const ec_list_element curve_list[] = {
        { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
        { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
        { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
-#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
-       { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+       { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h,
+#if defined(ECP_NISTZ256_ASM)
+               EC_GFp_nistz256_method,
+#elif !defined(OPENSSL_NO_EC_NISTP_64_GCC_128)
+               EC_GFp_nistp256_method,
 #else
-       { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+               0,
 #endif
+               "X9.62/SECG curve over a 256 bit prime field" },
 #ifndef OPENSSL_NO_EC2M
        /* characteristic two field curves */
        /* NIST/SECG curves */
index b7e1874..d152056 100644 (file)
@@ -1,6 +1,6 @@
 /* crypto/ec/ec_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2013 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2014 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -200,6 +200,11 @@ static ERR_STRING_DATA EC_str_functs[]=
 {ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"},
 {ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"},
 {ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTZ256_GET_AFFINE_COORDINATES),       "NISTZ256_GET_AFFINE_COORDINATES"},
+{ERR_FUNC(EC_F_NISTZ256_POINTS_MUL),   "NISTZ256_POINTS_MUL"},
+{ERR_FUNC(EC_F_NISTZ256_POINTS_MUL_W), "NISTZ256_POINTS_MUL_W"},
+{ERR_FUNC(EC_F_NISTZ256_PRECOMPUTE_MULT),      "NISTZ256_PRECOMPUTE_MULT"},
+{ERR_FUNC(EC_F_NISTZ256_PRE_COMP_NEW), "NISTZ256_PRE_COMP_NEW"},
 {ERR_FUNC(EC_F_O2I_ECPUBLICKEY),       "o2i_ECPublicKey"},
 {ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE),    "OLD_EC_PRIV_DECODE"},
 {ERR_FUNC(EC_F_PKEY_EC_CTRL),  "PKEY_EC_CTRL"},
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
new file mode 100644 (file)
index 0000000..521e139
--- /dev/null
@@ -0,0 +1,1458 @@
+/******************************************************************************
+ *                                                                            *
+ * Copyright 2014 Intel Corporation                                           *
+ *                                                                            *
+ * Licensed under the Apache License, Version 2.0 (the "License");            *
+ * you may not use this file except in compliance with the License.           *
+ * You may obtain a copy of the License at                                    *
+ *                                                                            *
+ *    http://www.apache.org/licenses/LICENSE-2.0                              *
+ *                                                                            *
+ * Unless required by applicable law or agreed to in writing, software        *
+ * distributed under the License is distributed on an "AS IS" BASIS,          *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   *
+ * See the License for the specific language governing permissions and        *
+ * limitations under the License.                                             *
+ *                                                                            *
+ ******************************************************************************
+ *                                                                            *
+ * Developers and authors:                                                    *
+ * Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+ * (1) Intel Corporation, Israel Development Center                           *
+ * (2) University of Haifa                                                    *
+ * Reference:                                                                 *
+ * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with *
+ *                          256 Bit Primes"                                   *
+ *                                                                            *
+ ******************************************************************************/
+
+#include <string.h>
+
+#include <openssl/bn.h>
+#include <openssl/err.h>
+#include <openssl/ec.h>
+#include "cryptlib.h"
+
+#include "ec_lcl.h"
+
+#if BN_BITS2 != 64
+# define TOBN(hi,lo)   lo,hi
+#else
+# define TOBN(hi,lo)   ((BN_ULONG)hi<<32|lo)
+#endif
+
+#if defined(__GNUC__)
+# define ALIGN32       __attribute((aligned(32)))
+#elif defined(_MSC_VER)
+# define ALIGN32       __declspec(align(32))
+#else
+# define ALIGN32
+#endif
+
+#define ALIGNPTR(p,N)  ((unsigned char *)p+N-(size_t)p%N)
+#define P256_LIMBS     (256/BN_BITS2)
+
+typedef unsigned short u16;
+
+typedef struct {
+    BN_ULONG X[P256_LIMBS];
+    BN_ULONG Y[P256_LIMBS];
+    BN_ULONG Z[P256_LIMBS];
+} P256_POINT;
+
+typedef struct {
+    BN_ULONG X[P256_LIMBS];
+    BN_ULONG Y[P256_LIMBS];
+} P256_POINT_AFFINE;
+
+typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
+
/* structure for precomputed multiples of the generator,
 * managed by ec_pre_comp_new/dup/free/clear_free declared above */
typedef struct ec_pre_comp_st {
    const EC_GROUP *group;      /* Parent EC_GROUP object */
    size_t w;                   /* Window size */
    /* Constant time access to the X and Y coordinates of the pre-computed,
     * generator multiplies, in the Montgomery domain. Pre-calculated
     * multiplies are stored in affine form. */
    PRECOMP256_ROW *precomp;    /* table of 64-entry rows */
    void *precomp_storage;      /* raw allocation backing |precomp|; presumably
                                 * kept so the (aligned) table can be freed —
                                 * TODO confirm against ec_pre_comp_free */
    int references;             /* reference count (see ec_pre_comp_dup) */
} EC_PRE_COMP;
+
+/* Functions implemented in assembly */
+/* Modular mul by 2: res = 2*a mod P */
+void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS]);
+/* Modular div by 2: res = a/2 mod P */
+void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS]);
+/* Modular mul by 3: res = 3*a mod P */
+void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS]);
+/* Modular add: res = a+b mod P          */
+void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
+                      const BN_ULONG a[P256_LIMBS],
+                      const BN_ULONG b[P256_LIMBS]);
+/* Modular sub: res = a-b mod P          */
+void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
+                      const BN_ULONG a[P256_LIMBS],
+                      const BN_ULONG b[P256_LIMBS]);
+/* Modular neg: res = -a mod P   */
+void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
+/* Montgomery mul: res = a*b*2^-256 mod P */
+void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS],
+                           const BN_ULONG b[P256_LIMBS]);
+/* Montgomery sqr: res = a*a*2^-256 mod P */
+void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS]);
+/* Convert a number from Montgomery domain, by multiplying with 1 */
+void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
+                            const BN_ULONG in[P256_LIMBS]);
+/* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/
+void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
+                          const BN_ULONG in[P256_LIMBS]);
+/* Functions that perform constant time access to the precomputed tables */
+void ecp_nistz256_select_w5(P256_POINT * val,
+                            const P256_POINT * in_t, int index);
+void ecp_nistz256_select_w7(P256_POINT_AFFINE * val,
+                            const P256_POINT_AFFINE * in_t, int index);
+
+/* One converted into the Montgomery domain */
+static const BN_ULONG ONE[P256_LIMBS] = {
+    TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000),
+    TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe)
+};
+
+static void *ec_pre_comp_dup(void *);
+static void ec_pre_comp_free(void *);
+static void ec_pre_comp_clear_free(void *);
+static EC_PRE_COMP *ec_pre_comp_new(const EC_GROUP * group);
+
+/* Precomputed tables for the default generator */
+#include "ecp_nistz256_table.c"
+
+/* Recode window to a signed digit, see ecp_nistputil.c for details */
/*
 * Recode a 6-bit window value into signed-digit (Booth) form: the digit
 * magnitude goes in the upper bits, the sign in bit 0, which is the
 * layout the w5 table-select/negate code consumes.  Branch-free.
 */
static unsigned int _booth_recode_w5(unsigned int in)
{
    unsigned int neg_mask = ~((in >> 5) - 1);   /* all-ones iff bit 5 set */
    unsigned int flipped = (1 << 6) - in - 1;   /* 2^6 - 1 - in */
    unsigned int mag = (flipped & neg_mask) | (in & ~neg_mask);

    mag = (mag >> 1) + (mag & 1);               /* digit magnitude */
    return (mag << 1) + (neg_mask & 1);         /* magnitude | sign bit */
}
+
/*
 * Same signed-digit recoding as _booth_recode_w5, but for an 8-bit
 * window (used with the w7 precomputed-generator tables).
 */
static unsigned int _booth_recode_w7(unsigned int in)
{
    unsigned int neg_mask = ~((in >> 7) - 1);   /* all-ones iff bit 7 set */
    unsigned int flipped = (1 << 8) - in - 1;   /* 2^8 - 1 - in */
    unsigned int mag = (flipped & neg_mask) | (in & ~neg_mask);

    mag = (mag >> 1) + (mag & 1);               /* digit magnitude */
    return (mag << 1) + (neg_mask & 1);         /* magnitude | sign bit */
}
+
+static void copy_conditional(BN_ULONG dst[P256_LIMBS],
+                             const BN_ULONG src[P256_LIMBS], BN_ULONG move)
+{
+    BN_ULONG mask1 = -move;
+    BN_ULONG mask2 = ~mask1;
+
+    dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
+    dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
+    dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
+    dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
+    if (P256_LIMBS == 8) {
+        dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
+        dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
+        dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
+        dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
+    }
+}
+
+static BN_ULONG is_zero(BN_ULONG in)
+{
+    in |= (0 - in);
+    in = ~in;
+    in &= BN_MASK2;
+    in >>= BN_BITS2 - 1;
+    return in;
+}
+
+static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS],
+                         const BN_ULONG b[P256_LIMBS])
+{
+    BN_ULONG res;
+
+    res = a[0] ^ b[0];
+    res |= a[1] ^ b[1];
+    res |= a[2] ^ b[2];
+    res |= a[3] ^ b[3];
+    if (P256_LIMBS == 8) {
+        res |= a[4] ^ b[4];
+        res |= a[5] ^ b[5];
+        res |= a[6] ^ b[6];
+        res |= a[7] ^ b[7];
+    }
+
+    return is_zero(res);
+}
+
+static BN_ULONG is_one(const BN_ULONG a[P256_LIMBS])
+{
+    BN_ULONG res;
+
+    res = a[0] ^ ONE[0];
+    res |= a[1] ^ ONE[1];
+    res |= a[2] ^ ONE[2];
+    res |= a[3] ^ ONE[3];
+    if (P256_LIMBS == 8) {
+        res |= a[4] ^ ONE[4];
+        res |= a[5] ^ ONE[5];
+        res |= a[6] ^ ONE[6];
+    }
+
+    return is_zero(res);
+}
+
+#ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION
+void ecp_nistz256_point_double(P256_POINT * r, const P256_POINT * a);
+void ecp_nistz256_point_add(P256_POINT * r,
+                            const P256_POINT * a, const P256_POINT * b);
+void ecp_nistz256_point_add_affine(P256_POINT * r,
+                                   const P256_POINT * a,
+                                   const P256_POINT_AFFINE * b);
+#else
/* Point double: r = 2*a
 *
 * Reference (C) implementation, compiled only when
 * ECP_NISTZ256_REFERENCE_IMPLEMENTATION is defined; otherwise the
 * assembly version declared above is used.  Inputs/outputs are Jacobian
 * coordinates in Montgomery form.  The statement order is deliberate:
 * each r field is written only after the corresponding a field has been
 * read for the last time, so r may alias a. */
static void ecp_nistz256_point_double(P256_POINT * r, const P256_POINT * a)
{
    BN_ULONG S[P256_LIMBS];
    BN_ULONG M[P256_LIMBS];
    BN_ULONG Zsqr[P256_LIMBS];
    BN_ULONG tmp0[P256_LIMBS];

    const BN_ULONG *in_x = a->X;
    const BN_ULONG *in_y = a->Y;
    const BN_ULONG *in_z = a->Z;

    BN_ULONG *res_x = r->X;
    BN_ULONG *res_y = r->Y;
    BN_ULONG *res_z = r->Z;

    ecp_nistz256_mul_by_2(S, in_y);             /* S = 2*Y */

    ecp_nistz256_sqr_mont(Zsqr, in_z);          /* Zsqr = Z^2 */

    ecp_nistz256_sqr_mont(S, S);                /* S = 4*Y^2 */

    ecp_nistz256_mul_mont(res_z, in_z, in_y);   /* Z3 = 2*Y*Z */
    ecp_nistz256_mul_by_2(res_z, res_z);        /* (last read of in_z/in_y) */

    ecp_nistz256_add(M, in_x, Zsqr);            /* M = X + Z^2 */
    ecp_nistz256_sub(Zsqr, in_x, Zsqr);         /* Zsqr = X - Z^2 */

    ecp_nistz256_sqr_mont(res_y, S);            /* res_y = 16*Y^4 */
    ecp_nistz256_div_by_2(res_y, res_y);        /* res_y = 8*Y^4 */

    ecp_nistz256_mul_mont(M, M, Zsqr);          /* M = (X+Z^2)*(X-Z^2) */
    ecp_nistz256_mul_by_3(M, M);                /* M = 3*(X+Z^2)*(X-Z^2) */

    ecp_nistz256_mul_mont(S, S, in_x);          /* S = 4*X*Y^2 */
    ecp_nistz256_mul_by_2(tmp0, S);             /* tmp0 = 8*X*Y^2 */

    ecp_nistz256_sqr_mont(res_x, M);            /* X3 = M^2 */

    ecp_nistz256_sub(res_x, res_x, tmp0);       /* X3 = M^2 - 8*X*Y^2 */
    ecp_nistz256_sub(S, S, res_x);              /* S = 4*X*Y^2 - X3 */

    ecp_nistz256_mul_mont(S, S, M);             /* S = M*(4*X*Y^2 - X3) */
    ecp_nistz256_sub(res_y, S, res_y);          /* Y3 = S - 8*Y^4 */
}
+
/* Point addition: r = a+b
 *
 * Reference (C) implementation of full Jacobian point addition; inputs
 * and result are in Montgomery form.  The result is staged in the local
 * res_x/res_y/res_z buffers and copied out at the end, so r may alias
 * either input. */
static void ecp_nistz256_point_add(P256_POINT * r,
                                   const P256_POINT * a, const P256_POINT * b)
{
    BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
    BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
    BN_ULONG Z1sqr[P256_LIMBS];
    BN_ULONG Z2sqr[P256_LIMBS];
    BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
    BN_ULONG Hsqr[P256_LIMBS];
    BN_ULONG Rsqr[P256_LIMBS];
    BN_ULONG Hcub[P256_LIMBS];

    BN_ULONG res_x[P256_LIMBS];
    BN_ULONG res_y[P256_LIMBS];
    BN_ULONG res_z[P256_LIMBS];

    /* all-ones when the corresponding input is the point at infinity */
    BN_ULONG in1infty, in2infty;

    const BN_ULONG *in1_x = a->X;
    const BN_ULONG *in1_y = a->Y;
    const BN_ULONG *in1_z = a->Z;

    const BN_ULONG *in2_x = b->X;
    const BN_ULONG *in2_y = b->Y;
    const BN_ULONG *in2_z = b->Z;

    /* We encode infinity as (0,0), which is not on the curve,
     * so it is OK. */
    in1infty = in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
               in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3];
    if (P256_LIMBS == 8)
        in1infty |= in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
                    in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7];

    in2infty = in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
               in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3];
    if (P256_LIMBS == 8)
        in2infty |= in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
                    in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7];

    in1infty = is_zero(in1infty);
    in2infty = is_zero(in2infty);

    ecp_nistz256_sqr_mont(Z2sqr, in2_z);        /* Z2^2 */
    ecp_nistz256_sqr_mont(Z1sqr, in1_z);        /* Z1^2 */

    ecp_nistz256_mul_mont(S1, Z2sqr, in2_z);    /* S1 = Z2^3 */
    ecp_nistz256_mul_mont(S2, Z1sqr, in1_z);    /* S2 = Z1^3 */

    ecp_nistz256_mul_mont(S1, S1, in1_y);       /* S1 = Y1*Z2^3 */
    ecp_nistz256_mul_mont(S2, S2, in2_y);       /* S2 = Y2*Z1^3 */
    ecp_nistz256_sub(R, S2, S1);                /* R = S2 - S1 */

    ecp_nistz256_mul_mont(U1, in1_x, Z2sqr);    /* U1 = X1*Z2^2 */
    ecp_nistz256_mul_mont(U2, in2_x, Z1sqr);    /* U2 = X2*Z1^2 */
    ecp_nistz256_sub(H, U2, U1);                /* H = U2 - U1 */

    /* U1 == U2 means the affine x-coordinates coincide: either a == b
     * (fall back to doubling, S1 == S2) or a == -b (result is infinity,
     * encoded as all-zero).
     *
     * This should not happen during sign/ecdh,
     * so no constant time violation
     * NOTE(review): this is a data-dependent branch; the claim above is
     * that such inputs cannot occur in those flows — confirm for any new
     * caller. */
    if (is_equal(U1, U2) && !in1infty && !in2infty) {
        if (is_equal(S1, S2)) {
            ecp_nistz256_point_double(r, a);
            return;
        } else {
            memset(r, 0, sizeof(*r));
            return;
        }
    }

    ecp_nistz256_sqr_mont(Rsqr, R);             /* R^2 */
    ecp_nistz256_mul_mont(res_z, H, in1_z);     /* Z3 = H*Z1*Z2 */
    ecp_nistz256_sqr_mont(Hsqr, H);             /* H^2 */
    ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
    ecp_nistz256_mul_mont(Hcub, Hsqr, H);       /* H^3 */

    ecp_nistz256_mul_mont(U2, U1, Hsqr);        /* U1*H^2 */
    ecp_nistz256_mul_by_2(Hsqr, U2);            /* 2*U1*H^2 */

    ecp_nistz256_sub(res_x, Rsqr, Hsqr);        /* X3 = R^2 - 2*U1*H^2 - H^3 */
    ecp_nistz256_sub(res_x, res_x, Hcub);

    ecp_nistz256_sub(res_y, U2, res_x);         /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */

    ecp_nistz256_mul_mont(S2, S1, Hcub);
    ecp_nistz256_mul_mont(res_y, R, res_y);
    ecp_nistz256_sub(res_y, res_y, S2);

    /* if a is infinity, the result is b ... */
    copy_conditional(res_x, in2_x, in1infty);
    copy_conditional(res_y, in2_y, in1infty);
    copy_conditional(res_z, in2_z, in1infty);

    /* ... and if b is infinity, the result is a */
    copy_conditional(res_x, in1_x, in2infty);
    copy_conditional(res_y, in1_y, in2infty);
    copy_conditional(res_z, in1_z, in2infty);

    memcpy(r->X, res_x, sizeof(res_x));
    memcpy(r->Y, res_y, sizeof(res_y));
    memcpy(r->Z, res_z, sizeof(res_z));
}
+
/* Point addition when b is known to be affine: r = a+b
 *
 * Reference (C) implementation of the mixed-coordinate addition used
 * with precomputed (affine, Z2 == 1 implicitly) points; saves the Z2
 * arithmetic of the general routine.  The result is staged in locals
 * and copied out, so r may alias a.
 * NOTE(review): unlike ecp_nistz256_point_add there is no special case
 * for a == +/-b here — presumably callers guarantee that cannot happen
 * for non-infinity inputs; confirm before reusing elsewhere. */
static void ecp_nistz256_point_add_affine(P256_POINT * r,
                                          const P256_POINT * a,
                                          const P256_POINT_AFFINE * b)
{
    BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
    BN_ULONG Z1sqr[P256_LIMBS];
    BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
    BN_ULONG Hsqr[P256_LIMBS];
    BN_ULONG Rsqr[P256_LIMBS];
    BN_ULONG Hcub[P256_LIMBS];

    BN_ULONG res_x[P256_LIMBS];
    BN_ULONG res_y[P256_LIMBS];
    BN_ULONG res_z[P256_LIMBS];

    /* all-ones when the corresponding input is the point at infinity */
    BN_ULONG in1infty, in2infty;

    const BN_ULONG *in1_x = a->X;
    const BN_ULONG *in1_y = a->Y;
    const BN_ULONG *in1_z = a->Z;

    const BN_ULONG *in2_x = b->X;
    const BN_ULONG *in2_y = b->Y;

    /* In affine representation we encode infty as (0,0),
     * which is not on the curve, so it is OK */
    in1infty = in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
               in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3];
    if (P256_LIMBS == 8)
        in1infty |= in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
                    in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7];

    in2infty = in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
               in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3];
    if (P256_LIMBS == 8)
        in2infty |= in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
                    in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7];

    in1infty = is_zero(in1infty);
    in2infty = is_zero(in2infty);

    ecp_nistz256_sqr_mont(Z1sqr, in1_z);        /* Z1^2 */

    /* With Z2 == 1, U1 = X1 and S1 = Y1 */
    ecp_nistz256_mul_mont(U2, in2_x, Z1sqr);    /* U2 = X2*Z1^2 */
    ecp_nistz256_sub(H, U2, in1_x);             /* H = U2 - U1 */

    ecp_nistz256_mul_mont(S2, Z1sqr, in1_z);    /* S2 = Z1^3 */

    ecp_nistz256_mul_mont(res_z, H, in1_z);     /* Z3 = H*Z1*Z2 */

    ecp_nistz256_mul_mont(S2, S2, in2_y);       /* S2 = Y2*Z1^3 */
    ecp_nistz256_sub(R, S2, in1_y);             /* R = S2 - S1 */

    ecp_nistz256_sqr_mont(Hsqr, H);             /* H^2 */
    ecp_nistz256_sqr_mont(Rsqr, R);             /* R^2 */
    ecp_nistz256_mul_mont(Hcub, Hsqr, H);       /* H^3 */

    ecp_nistz256_mul_mont(U2, in1_x, Hsqr);     /* U1*H^2 */
    ecp_nistz256_mul_by_2(Hsqr, U2);            /* 2*U1*H^2 */

    ecp_nistz256_sub(res_x, Rsqr, Hsqr);        /* X3 = R^2 - 2*U1*H^2 - H^3 */
    ecp_nistz256_sub(res_x, res_x, Hcub);
    ecp_nistz256_sub(H, U2, res_x);             /* H = U1*H^2 - X3 */

    ecp_nistz256_mul_mont(S2, in1_y, Hcub);     /* S1*H^3 */
    ecp_nistz256_mul_mont(H, H, R);
    ecp_nistz256_sub(res_y, H, S2);             /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */

    /* if a is infinity the result is b; if b is infinity it is a */
    copy_conditional(res_x, in2_x, in1infty);
    copy_conditional(res_x, in1_x, in2infty);

    copy_conditional(res_y, in2_y, in1infty);
    copy_conditional(res_y, in1_y, in2infty);

    /* b is affine, so its Z is the Montgomery representation of 1 */
    copy_conditional(res_z, ONE, in1infty);
    copy_conditional(res_z, in1_z, in2infty);

    memcpy(r->X, res_x, sizeof(res_x));
    memcpy(r->Y, res_y, sizeof(res_y));
    memcpy(r->Z, res_z, sizeof(res_z));
}
+#endif
+
/* r = in^-1 mod p (both in Montgomery form) */
static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS],
                                     const BN_ULONG in[P256_LIMBS])
{
    /*
     * The field prime is
     *   p = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff.
     * By Fermat's little theorem, in^-1 = in^(p-2) (mod p), so we raise
     * |in| to the fixed exponent p-2 with an addition chain.  The
     * temporaries hold in^(2^k - 1): p2 for k=2, p4 for k=4, p8 for
     * k=8, p16 for k=16, p32 for k=32.  Fixed exponent => the sequence
     * of operations is data-independent.
     */
    BN_ULONG p2[P256_LIMBS];
    BN_ULONG p4[P256_LIMBS];
    BN_ULONG p8[P256_LIMBS];
    BN_ULONG p16[P256_LIMBS];
    BN_ULONG p32[P256_LIMBS];
    BN_ULONG res[P256_LIMBS];
    int i;

    ecp_nistz256_sqr_mont(res, in);
    ecp_nistz256_mul_mont(p2, res, in);         /* p2 = in^3, exponent 11b */

    ecp_nistz256_sqr_mont(res, p2);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p4, res, p2);         /* p4 = in^0xf */

    ecp_nistz256_sqr_mont(res, p4);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p8, res, p4);         /* p8 = in^0xff */

    ecp_nistz256_sqr_mont(res, p8);
    for (i = 0; i < 7; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p16, res, p8);        /* p16 = in^0xffff */

    ecp_nistz256_sqr_mont(res, p16);
    for (i = 0; i < 15; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p32, res, p16);       /* p32 = in^0xffffffff */

    /* From here on each "shift by n squarings, then multiply" step
     * appends bits to the exponent, building up p-2 word by word. */
    ecp_nistz256_sqr_mont(res, p32);
    for (i = 0; i < 31; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, in);        /* exp = ffffffff 00000001 */

    for (i = 0; i < 32 * 4; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p32);       /* append 00000000 00000000 00000000 ffffffff */

    for (i = 0; i < 32; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p32);       /* append ffffffff */

    for (i = 0; i < 16; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p16);       /* append ffff */

    for (i = 0; i < 8; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p8);        /* append ff */

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p4);        /* append f */

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p2);        /* append 11b */

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, in);        /* append 01b: low word fffffffd = (p-2) mod 2^32 */

    memcpy(r, res, sizeof(res));
}
+
+/* ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and
+ * returns one if it fits. Otherwise it returns zero. */
+static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
+                                             const BIGNUM * in)
+{
+    size_t used;
+
+    /* Reject anything wider than a 256-bit field element. */
+    if (in->top > P256_LIMBS)
+        return 0;
+
+    /* Copy the significant limbs, then zero-fill the remainder. */
+    used = (size_t)in->top;
+    memcpy(out, in->d, used * sizeof(BN_ULONG));
+    memset(out + used, 0, (P256_LIMBS - used) * sizeof(BN_ULONG));
+    return 1;
+}
+
+/*
+ * r = sum(scalar[i]*point[i])
+ *
+ * Generic multi-point multiplication using a signed Booth-encoded
+ * window of width 5: for each point a 16-entry odd-multiples table is
+ * built, and windows of all scalars are processed from the most
+ * significant bits downwards, interleaved with 5 doublings per step.
+ * Table lookups go through ecp_nistz256_select_w5, which is assumed to
+ * be a constant-time gather.
+ *
+ * NOTE(review): the code reads p_str[0]/table[0] unconditionally at the
+ * first window, i.e. it assumes num >= 1 — callers must guarantee that.
+ */
+static void ecp_nistz256_windowed_mul(const EC_GROUP * group,
+                                      P256_POINT * r,
+                                      const BIGNUM ** scalar,
+                                      const EC_POINT ** point,
+                                      int num, BN_CTX * ctx)
+{
+    int i, j;
+    unsigned int index;
+    unsigned char (*p_str)[33] = NULL;
+    const unsigned int window_size = 5;
+    /* 6-bit mask: 5 window bits plus the Booth sign bit. */
+    const unsigned int mask = (1 << (window_size + 1)) - 1;
+    unsigned int wvalue;
+    BN_ULONG tmp[P256_LIMBS];
+    ALIGN32 P256_POINT h;
+    const BIGNUM **scalars = NULL;
+    P256_POINT(*table)[16] = NULL;
+    void *table_storage = NULL;
+
+    /* +64 slack so the table can be 64-byte aligned below. */
+    if ((table_storage =
+         OPENSSL_malloc(num * 16 * sizeof(P256_POINT) + 64)) == NULL
+        || (p_str =
+            OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL
+        || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) {
+        ECerr(EC_F_NISTZ256_POINTS_MUL_W, ERR_R_MALLOC_FAILURE);
+        goto err;
+    } else {
+        table = (void *)ALIGNPTR(table_storage, 64);
+    }
+
+    for (i = 0; i < num; i++) {
+        P256_POINT *row = table[i];
+
+        /* Out-of-range scalars are reduced mod the group order first. */
+        if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) {
+            BIGNUM *mod;
+
+            /* NOTE(review): BN_CTX_get is used without a visible
+             * BN_CTX_start in this function — presumably the caller's
+             * frame is borrowed; confirm against callers. */
+            if ((mod = BN_CTX_get(ctx)) == NULL)
+                goto err;
+            if (!BN_nnmod(mod, scalar[i], &group->order, ctx)) {
+                ECerr(EC_F_NISTZ256_POINTS_MUL_W, ERR_R_BN_LIB);
+                goto err;
+            }
+            scalars[i] = mod;
+        } else
+            scalars[i] = scalar[i];
+
+        /* Serialize each scalar into 33 little-endian bytes (one spare
+         * byte so two-byte window reads below never run off the end). */
+        for (j = 0; j < scalars[i]->top * BN_BYTES; j += BN_BYTES) {
+            BN_ULONG d = scalars[i]->d[j / BN_BYTES];
+
+            p_str[i][j + 0] = d & 0xff;
+            p_str[i][j + 1] = (d >> 8) & 0xff;
+            p_str[i][j + 2] = (d >> 16) & 0xff;
+            p_str[i][j + 3] = (d >>= 24) & 0xff;
+            /* Compile-time constant: this branch is dead on 32-bit. */
+            if (BN_BYTES == 8) {
+                d >>= 8;
+                p_str[i][j + 4] = d & 0xff;
+                p_str[i][j + 5] = (d >> 8) & 0xff;
+                p_str[i][j + 6] = (d >> 16) & 0xff;
+                p_str[i][j + 7] = (d >> 24) & 0xff;
+            }
+        }
+        for (; j < 33; j++)
+            p_str[i][j] = 0;
+
+        /* table[0] is implicitly (0,0,0) (the point at infinity),
+         * therefore it is not stored. All other values are actually
+         * stored with an offset of -1 in table.
+         */
+
+        if (!ecp_nistz256_bignum_to_field_elem(row[1 - 1].X, &point[i]->X)
+            || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Y, &point[i]->Y)
+            || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Z, &point[i]->Z)) {
+            ECerr(EC_F_NISTZ256_POINTS_MUL_W, EC_R_COORDINATES_OUT_OF_RANGE);
+            goto err;
+        }
+
+        /* Fill row with 1P..16P using a minimal double/add schedule. */
+        ecp_nistz256_point_double(&row[ 2 - 1], &row[ 1 - 1]);
+        ecp_nistz256_point_add   (&row[ 3 - 1], &row[ 2 - 1], &row[1 - 1]);
+        ecp_nistz256_point_double(&row[ 4 - 1], &row[ 2 - 1]);
+        ecp_nistz256_point_double(&row[ 6 - 1], &row[ 3 - 1]);
+        ecp_nistz256_point_double(&row[ 8 - 1], &row[ 4 - 1]);
+        ecp_nistz256_point_double(&row[12 - 1], &row[ 6 - 1]);
+        ecp_nistz256_point_add   (&row[ 5 - 1], &row[ 4 - 1], &row[1 - 1]);
+        ecp_nistz256_point_add   (&row[ 7 - 1], &row[ 6 - 1], &row[1 - 1]);
+        ecp_nistz256_point_add   (&row[ 9 - 1], &row[ 8 - 1], &row[1 - 1]);
+        ecp_nistz256_point_add   (&row[13 - 1], &row[12 - 1], &row[1 - 1]);
+        ecp_nistz256_point_double(&row[14 - 1], &row[ 7 - 1]);
+        ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]);
+        ecp_nistz256_point_add   (&row[15 - 1], &row[14 - 1], &row[1 - 1]);
+        ecp_nistz256_point_add   (&row[11 - 1], &row[10 - 1], &row[1 - 1]);
+        ecp_nistz256_point_add   (&row[16 - 1], &row[15 - 1], &row[1 - 1]);
+    }
+
+    /* Start from the topmost window (bit 255 downwards). */
+    index = 255;
+
+    wvalue = p_str[0][(index - 1) / 8];
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+
+    /* Seed the accumulator from the first scalar's top window. */
+    ecp_nistz256_select_w5(r, table[0], _booth_recode_w5(wvalue) >> 1);
+
+    while (index >= 5) {
+        /* Skip scalar 0 on the very first pass — it seeded |r| above. */
+        for (i = (index == 255 ? 1 : 0); i < num; i++) {
+            unsigned int off = (index - 1) / 8;
+
+            wvalue = p_str[i][off] | p_str[i][off + 1] << 8;
+            wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+
+            wvalue = _booth_recode_w5(wvalue);
+
+            ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
+
+            /* Conditionally negate in constant time when the Booth
+             * sign bit is set. */
+            ecp_nistz256_neg(tmp, h.Y);
+            copy_conditional(h.Y, tmp, (wvalue & 1));
+
+            ecp_nistz256_point_add(r, r, &h);
+        }
+
+        index -= window_size;
+
+        /* Shift the accumulator left by one 5-bit window. */
+        ecp_nistz256_point_double(r, r);
+        ecp_nistz256_point_double(r, r);
+        ecp_nistz256_point_double(r, r);
+        ecp_nistz256_point_double(r, r);
+        ecp_nistz256_point_double(r, r);
+    }
+
+    /* Final window */
+    for (i = 0; i < num; i++) {
+        wvalue = p_str[i][0];
+        wvalue = (wvalue << 1) & mask;
+
+        wvalue = _booth_recode_w5(wvalue);
+
+        ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
+
+        ecp_nistz256_neg(tmp, h.Y);
+        copy_conditional(h.Y, tmp, wvalue & 1);
+
+        ecp_nistz256_point_add(r, r, &h);
+    }
+
+err:
+    if (table_storage)
+        OPENSSL_free(table_storage);
+    if (p_str)
+        OPENSSL_free(p_str);
+    if (scalars)
+        OPENSSL_free(scalars);
+}
+
+/* Coordinates of G, for which we have precomputed tables.
+ * Values are in the Montgomery domain, matching the first entry of
+ * ecp_nistz256_precomputed.
+ * Note: the specifier order is "static const" — a storage-class
+ * specifier anywhere but first is obsolescent (C99 6.11.5) and draws
+ * -Wold-style-declaration from gcc. */
+static const BN_ULONG def_xG[P256_LIMBS] = {
+    TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601),
+    TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)
+};
+
+static const BN_ULONG def_yG[P256_LIMBS] = {
+    TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c),
+    TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)
+};
+
+/* ecp_nistz256_is_affine_G returns one if |generator| is the standard,
+ * P-256 generator. */
+static int ecp_nistz256_is_affine_G(const EC_POINT * generator)
+{
+    /* Cheap limb-count checks first, then limb-wise comparison.
+     * The expected Z top is P256_LIMBS - P256_LIMBS / 8 — presumably
+     * because on 32-bit builds (P256_LIMBS == 8) the topmost limb of
+     * the Montgomery form of one is zero; confirm against is_one(). */
+    if (generator->X.top != P256_LIMBS)
+        return 0;
+    if (generator->Y.top != P256_LIMBS)
+        return 0;
+    if (generator->Z.top != P256_LIMBS - P256_LIMBS / 8)
+        return 0;
+
+    return is_equal(generator->X.d, def_xG)
+        && is_equal(generator->Y.d, def_yG)
+        && is_one(generator->Z.d);
+}
+
+/*
+ * Build and attach a 37x64 precomputed table for |group|'s generator.
+ * Returns 1 on success (or when the generator is the standard G, for
+ * which a static table exists), 0 on error.
+ *
+ * Fixes vs. the original: a BN_CTX allocated locally when |ctx| == NULL
+ * was never freed (leak), and the EC_POINT_new() results were used
+ * without a NULL check.
+ */
+static int ecp_nistz256_mult_precompute(EC_GROUP * group, BN_CTX * ctx)
+{
+    /* We precompute a table for a Booth encoded exponent (wNAF) based
+     * computation. Each table holds 64 values for safe access, with an
+     * implicit value of infinity at index zero. We use window of size 7,
+     * and therefore require ceil(256/7) = 37 tables. */
+    BIGNUM *order;
+    EC_POINT *P = NULL, *T = NULL;
+    const EC_POINT *generator;
+    EC_PRE_COMP *pre_comp;
+    BN_CTX *new_ctx = NULL;     /* non-NULL iff we allocated |ctx| here */
+    int i, j, k, ret = 0;
+    size_t w;
+
+    PRECOMP256_ROW *preComputedTable = NULL;
+    unsigned char *precomp_storage = NULL;
+
+    /* if there is an old EC_PRE_COMP object, throw it away */
+    EC_EX_DATA_free_data(&group->extra_data, ec_pre_comp_dup,
+                         ec_pre_comp_free, ec_pre_comp_clear_free);
+
+    generator = EC_GROUP_get0_generator(group);
+    if (generator == NULL) {
+        ECerr(EC_F_NISTZ256_PRECOMPUTE_MULT, EC_R_UNDEFINED_GENERATOR);
+        return 0;
+    }
+
+    if (ecp_nistz256_is_affine_G(generator)) {
+        /* No need to calculate tables for the standard generator
+         * because we have them statically. */
+        return 1;
+    }
+
+    if ((pre_comp = ec_pre_comp_new(group)) == NULL)
+        return 0;
+
+    if (ctx == NULL) {
+        /* Track the locally created context so it is freed on exit. */
+        ctx = new_ctx = BN_CTX_new();
+        if (ctx == NULL)
+            goto err;
+    }
+
+    BN_CTX_start(ctx);
+    order = BN_CTX_get(ctx);
+
+    if (order == NULL)
+        goto err;
+
+    if (!EC_GROUP_get_order(group, order, ctx))
+        goto err;
+
+    if (BN_is_zero(order)) {
+        ECerr(EC_F_NISTZ256_PRECOMPUTE_MULT, EC_R_UNKNOWN_ORDER);
+        goto err;
+    }
+
+    w = 7;
+
+    /* +64 slack so the table can be 64-byte aligned below. */
+    if ((precomp_storage =
+         OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) {
+        ECerr(EC_F_NISTZ256_PRECOMPUTE_MULT, ERR_R_MALLOC_FAILURE);
+        goto err;
+    } else {
+        preComputedTable = (void *)ALIGNPTR(precomp_storage, 64);
+    }
+
+    P = EC_POINT_new(group);
+    T = EC_POINT_new(group);
+    if (P == NULL || T == NULL) {
+        /* Previously dereferenced unchecked — crash on OOM. */
+        ECerr(EC_F_NISTZ256_PRECOMPUTE_MULT, ERR_R_MALLOC_FAILURE);
+        goto err;
+    }
+
+    /* The zero entry is implicitly infinity, and we skip it,
+     * storing other values with -1 offset. */
+    EC_POINT_copy(T, generator);
+
+    for (k = 0; k < 64; k++) {
+        EC_POINT_copy(P, T);
+        for (j = 0; j < 37; j++) {
+            /* It would be faster to use
+             * ec_GFp_simple_points_make_affine and make multiple
+             * points affine at the same time. */
+            ec_GFp_simple_make_affine(group, P, ctx);
+            ecp_nistz256_bignum_to_field_elem(preComputedTable[j]
+                                              [k].X, &P->X);
+            ecp_nistz256_bignum_to_field_elem(preComputedTable[j]
+                                              [k].Y, &P->Y);
+            /* Advance by 2^7 to the next table's window. */
+            for (i = 0; i < 7; i++)
+                ec_GFp_simple_dbl(group, P, P, ctx);
+        }
+        ec_GFp_simple_add(group, T, T, generator, ctx);
+    }
+
+    pre_comp->group = group;
+    pre_comp->w = w;
+    pre_comp->precomp = preComputedTable;
+    pre_comp->precomp_storage = precomp_storage;
+
+    /* Ownership of the storage has moved into |pre_comp|. */
+    precomp_storage = NULL;
+
+    if (!EC_EX_DATA_set_data(&group->extra_data, pre_comp,
+                             ec_pre_comp_dup, ec_pre_comp_free,
+                             ec_pre_comp_clear_free)) {
+        goto err;
+    }
+
+    pre_comp = NULL;
+
+    ret = 1;
+
+err:
+    if (ctx != NULL)
+        BN_CTX_end(ctx);
+    if (new_ctx != NULL)
+        BN_CTX_free(new_ctx);
+    if (pre_comp)
+        ec_pre_comp_free(pre_comp);
+    if (precomp_storage)
+        OPENSSL_free(precomp_storage);
+    if (P)
+        EC_POINT_free(P);
+    if (T)
+        EC_POINT_free(T);
+    return ret;
+}
+
+/*
+ * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great
+ * code processing 4 points in parallel, the corresponding serial operation
+ * is several times slower, because it uses 29x29=58-bit multiplication
+ * as opposed to 64x64=128-bit in the integer-only scalar case. As a result
+ * it doesn't provide a *significant* performance improvement. Note that
+ * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work;
+ * you'd also need to compile the asm/ecp_nistz256-avx2.pl module.
+ */
+#if defined(ECP_NISTZ256_AVX2)
+/*
+ * Original condition had unbalanced parentheses (a stray ')'), misplaced
+ * '||' operators that put the MSVC x64 macros outside the negated group,
+ * and the typo _MX64 for _M_X64 — a guaranteed preprocessor error as
+ * soon as ECP_NISTZ256_AVX2 was defined.  Intended meaning: disable
+ * unless (x86-64 target) and (GCC or MSVC, for ALIGN32).
+ */
+# if !(defined(__x86_64) || defined(__x86_64__) || \
+       defined(_M_AMD64) || defined(_M_X64)) || \
+     !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
+#  undef ECP_NISTZ256_AVX2
+# else
+/* Constant time access, loading four values, from four consecutive tables */
+void ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE * val,
+                                 const P256_POINT_AFFINE * in_t, int index);
+void ecp_nistz256_avx2_multi_select_w7(void *result, const void *in, int index0,
+                                       int index1, int index2, int index3);
+void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in);
+void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4);
+void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4,
+                                           const void *Bx4);
+void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4,
+                                            const void *Bx4);
+void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4);
+void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4);
+void ecp_nistz256_avx2_set1(void *RESULTx4);
+int ecp_nistz_avx2_eligible(void);
+
+/* Booth-recode a 8-bit window value |in| into a sign bit and a 7-bit
+ * digit magnitude, branch-free. */
+static void booth_recode_w7(unsigned char *sign,
+                            unsigned char *digit, unsigned char in)
+{
+    /* All-ones mask exactly when the top (sign) bit of |in| is set. */
+    unsigned char neg_mask = ~((in >> 7) - 1);
+    /* Candidate negated encoding: 2^8 - in - 1. */
+    unsigned char mag = (1 << 8) - in - 1;
+
+    /* Constant-time select between the negated and original value,
+     * then fold the low bit into the digit magnitude. */
+    mag = (mag & neg_mask) | (in & ~neg_mask);
+    mag = (mag >> 1) + (mag & 1);
+
+    *sign = neg_mask & 1;
+    *digit = mag;
+}
+
+/* ecp_nistz256_avx2_mul_g performs multiplication by G, using only the
+ * precomputed table. It does 4 affine point additions in parallel,
+ * significantly speeding up point multiplication for a fixed value.
+ *
+ * The 37 width-7 windows of |p_str| are processed as 9 groups of 4
+ * (the AVX2 lane count); the 37th window is handled serially at the
+ * end, and the four lane accumulators are summed into |r|. */
+static void ecp_nistz256_avx2_mul_g(P256_POINT * r,
+                                    unsigned char p_str[33],
+                                    const
+                                    P256_POINT_AFFINE(*preComputedTable)[64])
+{
+    const unsigned int window_size = 7;
+    /* 8-bit mask: 7 window bits plus the Booth sign bit. */
+    const unsigned int mask = (1 << (window_size + 1)) - 1;
+    unsigned int wvalue;
+    /* Using 4 windows at a time */
+    unsigned char sign0, digit0;
+    unsigned char sign1, digit1;
+    unsigned char sign2, digit2;
+    unsigned char sign3, digit3;
+    unsigned int index = 0;     /* bit position of the next window */
+    BN_ULONG tmp[P256_LIMBS];
+    int i;
+
+    /* Lane-transposed operands: 3 (resp. 2) coordinates of 4 points,
+     * 9 29-bit limbs each, in the AVX2 redundant representation. */
+    ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 };
+    ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 };
+    /* NOTE(review): these hold the 4 parallel points, so [4] seems to be
+     * the intended size; P256_LIMBS happens to equal 4 on the only
+     * (x86-64) builds that can enable this code — confirm. */
+    ALIGN32 P256_POINT_AFFINE point_arr[P256_LIMBS];
+    ALIGN32 P256_POINT res_point_arr[P256_LIMBS];
+
+    /* Initial four windows.  The u16 loads below are unaligned and
+     * little-endian — assumes x86, which the AVX2 guard enforces. */
+    wvalue = *((u16 *) & p_str[0]);
+    wvalue = (wvalue << 1) & mask;
+    index += window_size;
+    booth_recode_w7(&sign0, &digit0, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign1, &digit1, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign2, &digit2, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign3, &digit3, wvalue);
+
+    /* Gather the four table entries in constant time. */
+    ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[0],
+                                      digit0, digit1, digit2, digit3);
+
+    /* Constant-time conditional negation per Booth sign bit. */
+    ecp_nistz256_neg(tmp, point_arr[0].Y);
+    copy_conditional(point_arr[0].Y, tmp, sign0);
+    ecp_nistz256_neg(tmp, point_arr[1].Y);
+    copy_conditional(point_arr[1].Y, tmp, sign1);
+    ecp_nistz256_neg(tmp, point_arr[2].Y);
+    copy_conditional(point_arr[2].Y, tmp, sign2);
+    ecp_nistz256_neg(tmp, point_arr[3].Y);
+    copy_conditional(point_arr[3].Y, tmp, sign3);
+
+    /* Transpose into lane form, convert to the AVX2 Montgomery domain,
+     * and set Z = 1 for all four accumulators. */
+    ecp_nistz256_avx2_transpose_convert(aX4, point_arr);
+    ecp_nistz256_avx2_to_mont(aX4, aX4);
+    ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]);
+    ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]);
+
+    /* Second group of four windows. */
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign0, &digit0, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign1, &digit1, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign2, &digit2, wvalue);
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    index += window_size;
+    booth_recode_w7(&sign3, &digit3, wvalue);
+
+    ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[4 * 1],
+                                      digit0, digit1, digit2, digit3);
+
+    ecp_nistz256_neg(tmp, point_arr[0].Y);
+    copy_conditional(point_arr[0].Y, tmp, sign0);
+    ecp_nistz256_neg(tmp, point_arr[1].Y);
+    copy_conditional(point_arr[1].Y, tmp, sign1);
+    ecp_nistz256_neg(tmp, point_arr[2].Y);
+    copy_conditional(point_arr[2].Y, tmp, sign2);
+    ecp_nistz256_neg(tmp, point_arr[3].Y);
+    copy_conditional(point_arr[3].Y, tmp, sign3);
+
+    ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
+    ecp_nistz256_avx2_to_mont(bX4, bX4);
+    ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
+    /* Optimized when both inputs are affine */
+    ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4);
+
+    /* Remaining 7 groups of four windows: affine + projective adds. */
+    for (i = 2; i < 9; i++) {
+        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+        index += window_size;
+        booth_recode_w7(&sign0, &digit0, wvalue);
+        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+        index += window_size;
+        booth_recode_w7(&sign1, &digit1, wvalue);
+        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+        index += window_size;
+        booth_recode_w7(&sign2, &digit2, wvalue);
+        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+        index += window_size;
+        booth_recode_w7(&sign3, &digit3, wvalue);
+
+        ecp_nistz256_avx2_multi_select_w7(point_arr,
+                                          preComputedTable[4 * i],
+                                          digit0, digit1, digit2, digit3);
+
+        ecp_nistz256_neg(tmp, point_arr[0].Y);
+        copy_conditional(point_arr[0].Y, tmp, sign0);
+        ecp_nistz256_neg(tmp, point_arr[1].Y);
+        copy_conditional(point_arr[1].Y, tmp, sign1);
+        ecp_nistz256_neg(tmp, point_arr[2].Y);
+        copy_conditional(point_arr[2].Y, tmp, sign2);
+        ecp_nistz256_neg(tmp, point_arr[3].Y);
+        copy_conditional(point_arr[3].Y, tmp, sign3);
+
+        ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
+        ecp_nistz256_avx2_to_mont(bX4, bX4);
+        ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
+
+        ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4);
+    }
+
+    /* Leave the AVX2 domain and un-transpose the four accumulators. */
+    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]);
+    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]);
+    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]);
+
+    ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4);
+    /* Last window is performed serially */
+    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
+    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+    booth_recode_w7(&sign0, &digit0, wvalue);
+    ecp_nistz256_avx2_select_w7((P256_POINT_AFFINE *) r,
+                                preComputedTable[36], digit0);
+    ecp_nistz256_neg(tmp, r->Y);
+    copy_conditional(r->Y, tmp, sign0);
+    memcpy(r->Z, ONE, sizeof(ONE));
+    /* Sum the four windows */
+    ecp_nistz256_point_add(r, r, &res_point_arr[0]);
+    ecp_nistz256_point_add(r, r, &res_point_arr[1]);
+    ecp_nistz256_point_add(r, r, &res_point_arr[2]);
+    ecp_nistz256_point_add(r, r, &res_point_arr[3]);
+}
+# endif
+#endif
+
+/* Load the raw affine limbs of |in| into |out| via the generic
+ * EC_POINT_set_affine_coordinates_GFp path. */
+static int ecp_nistz256_set_from_affine(EC_POINT * out, const EC_GROUP * group,
+                                        const P256_POINT_AFFINE * in,
+                                        BN_CTX * ctx)
+{
+    BIGNUM x, y;
+    BN_ULONG d_x[P256_LIMBS], d_y[P256_LIMBS];
+    int ret = 0;
+
+    /* Wrap the limb arrays in stack BIGNUMs without going through the
+     * allocator; BN_FLG_STATIC_DATA keeps the BN code from trying to
+     * resize or free |d_x|/|d_y|. */
+    memcpy(d_x, in->X, sizeof(d_x));
+    memcpy(d_y, in->Y, sizeof(d_y));
+
+    x.d = d_x;
+    y.d = d_y;
+    x.dmax = x.top = P256_LIMBS;
+    y.dmax = y.top = P256_LIMBS;
+    x.neg = y.neg = 0;
+    x.flags = y.flags = BN_FLG_STATIC_DATA;
+
+    ret = EC_POINT_set_affine_coordinates_GFp(group, out, &x, &y, ctx);
+
+    return ret;
+}
+
+/* r = scalar*G + sum(scalars[i]*points[i]) */
+static int ecp_nistz256_points_mul(const EC_GROUP * group,
+                                   EC_POINT * r,
+                                   const BIGNUM * scalar,
+                                   size_t num,
+                                   const EC_POINT * points[],
+                                   const BIGNUM * scalars[], BN_CTX * ctx)
+{
+    int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0;
+    unsigned char p_str[33] = { 0 };
+    const PRECOMP256_ROW *preComputedTable = NULL;
+    const EC_PRE_COMP *pre_comp = NULL;
+    const EC_POINT *generator = NULL;
+    unsigned int index = 0;
+    const unsigned int window_size = 7;
+    const unsigned int mask = (1 << (window_size + 1)) - 1;
+    unsigned int wvalue;
+    ALIGN32 union {
+        P256_POINT p;
+        P256_POINT_AFFINE a;
+    } t, p;
+    BIGNUM *tmp_scalar;
+
+    if (group->meth != r->meth) {
+        ECerr(EC_F_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
+        return 0;
+    }
+    if ((scalar == NULL) && (num == 0))
+        return EC_POINT_set_to_infinity(group, r);
+
+    for (i = 0; i < num; i++) {
+        if (group->meth != points[i]->meth) {
+            ECerr(EC_F_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
+            return 0;
+        }
+    }
+
+    /* Need 256 bits for space for all coordinates. */
+    bn_wexpand(&r->X, P256_LIMBS);
+    bn_wexpand(&r->Y, P256_LIMBS);
+    bn_wexpand(&r->Z, P256_LIMBS);
+    r->X.top = P256_LIMBS;
+    r->Y.top = P256_LIMBS;
+    r->Z.top = P256_LIMBS;
+
+    if (scalar) {
+        generator = EC_GROUP_get0_generator(group);
+        if (generator == NULL) {
+            ECerr(EC_F_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR);
+            goto err;
+        }
+
+        /* look if we can use precomputed multiples of generator */
+        pre_comp =
+            EC_EX_DATA_get_data(group->extra_data, ec_pre_comp_dup,
+                                ec_pre_comp_free, ec_pre_comp_clear_free);
+
+        if (pre_comp) {
+            /* If there is a precomputed table for the generator,
+             * check that it was generated with the same
+             * generator. */
+            EC_POINT *pre_comp_generator = EC_POINT_new(group);
+            if (pre_comp_generator == NULL)
+                goto err;
+
+            if (!ecp_nistz256_set_from_affine
+                (pre_comp_generator, group, pre_comp->precomp[0], ctx))
+                goto err;
+
+            if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx))
+                preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp;
+
+            EC_POINT_free(pre_comp_generator);
+        }
+
+        if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) {
+            /* If there is no precomputed data, but the generator
+             * is the default, a hardcoded table of precomputed
+             * data is used. This is because applications, such as
+             * Apache, do not use EC_KEY_precompute_mult. */
+            preComputedTable = (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
+        }
+
+        if (preComputedTable) {
+            if ((BN_num_bits(scalar) > 256)
+                || BN_is_negative(scalar)) {
+                if ((tmp_scalar = BN_CTX_get(ctx)) == NULL)
+                    goto err;
+
+                if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
+                    ECerr(EC_F_NISTZ256_POINTS_MUL, ERR_R_BN_LIB);
+                    goto err;
+                }
+                scalar = tmp_scalar;
+            }
+
+            for (i = 0; i < scalar->top * BN_BYTES; i += BN_BYTES) {
+                BN_ULONG d = scalar->d[i / BN_BYTES];
+
+                p_str[i + 0] = d & 0xff;
+                p_str[i + 1] = (d >> 8) & 0xff;
+                p_str[i + 2] = (d >> 16) & 0xff;
+                p_str[i + 3] = (d >>= 24) & 0xff;
+                if (BN_BYTES == 8) {
+                    d >>= 8;
+                    p_str[i + 4] = d & 0xff;
+                    p_str[i + 5] = (d >> 8) & 0xff;
+                    p_str[i + 6] = (d >> 16) & 0xff;
+                    p_str[i + 7] = (d >> 24) & 0xff;
+                }
+            }
+
+            for (; i < 33; i++)
+                p_str[i] = 0;
+
+#if defined(ECP_NISTZ256_AVX2)
+            if (ecp_nistz_avx2_eligible()) {
+                ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable);
+            } else
+#endif
+            {
+                /* First window */
+                wvalue = (p_str[0] << 1) & mask;
+                index += window_size;
+
+                wvalue = _booth_recode_w7(wvalue);
+
+                ecp_nistz256_select_w7(&p.a, preComputedTable[0], wvalue >> 1);
+
+                ecp_nistz256_neg(p.p.Z, p.p.Y);
+                copy_conditional(p.p.Y, p.p.Z, wvalue & 1);
+
+                memcpy(p.p.Z, ONE, sizeof(ONE));
+
+                for (i = 1; i < 37; i++) {
+                    unsigned int off = (index - 1) / 8;
+                    wvalue = p_str[off] | p_str[off + 1] << 8;
+                    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
+                    index += window_size;
+
+                    wvalue = _booth_recode_w7(wvalue);
+
+                    ecp_nistz256_select_w7(&t.a,
+                                           preComputedTable[i], wvalue >> 1);
+
+                    ecp_nistz256_neg(t.p.Z, t.a.Y);
+                    copy_conditional(t.a.Y, t.p.Z, wvalue & 1);
+
+                    ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
+                }
+            }
+        } else {
+            p_is_infinity = 1;
+            no_precomp_for_generator = 1;
+        }
+    } else
+        p_is_infinity = 1;
+
+    if (no_precomp_for_generator) {
+        /* Without a precomputed table for the generator, it has to be
+         * handled like a normal point. */
+        const BIGNUM **new_scalars;
+        const EC_POINT **new_points;
+
+        new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *));
+        if (!new_scalars) {
+            ECerr(EC_F_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+            return 0;
+        }
+
+        new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *));
+        if (!new_points) {
+            OPENSSL_free(new_scalars);
+            ECerr(EC_F_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+            return 0;
+        }
+
+        memcpy(new_scalars, scalars, num * sizeof(BIGNUM *));
+        new_scalars[num] = scalar;
+        memcpy(new_points, points, num * sizeof(EC_POINT *));
+        new_points[num] = generator;
+
+        scalars = new_scalars;
+        points = new_points;
+        num++;
+    }
+
+    if (num) {
+        P256_POINT *out = &t.p;
+        if (p_is_infinity)
+            out = &p.p;
+
+        ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx);
+
+        if (!p_is_infinity)
+            ecp_nistz256_point_add(&p.p, &p.p, out);
+    }
+
+    if (no_precomp_for_generator) {
+        OPENSSL_free(points);
+        OPENSSL_free(scalars);
+    }
+
+    memcpy(r->X.d, p.p.X, sizeof(p.p.X));
+    memcpy(r->Y.d, p.p.Y, sizeof(p.p.Y));
+    memcpy(r->Z.d, p.p.Z, sizeof(p.p.Z));
+    bn_correct_top(&r->X);
+    bn_correct_top(&r->Y);
+    bn_correct_top(&r->Z);
+
+    ret = 1;
+
+err:
+    return ret;
+}
+
+/*
+ * Recover the affine coordinates (x, y) of Jacobian |point|:
+ * x = X / Z^2, y = Y / Z^3, leaving the Montgomery domain on output.
+ * Either of |x|/|y| may be NULL.  Returns 1 on success, 0 on error.
+ *
+ * Fix vs. the original: bn_wexpand() results are now checked — on
+ * allocation failure x->d/y->d stay short and the unconditional
+ * writes below would overflow them.
+ */
+static int ecp_nistz256_get_affine(const EC_GROUP * group,
+                                   const EC_POINT * point,
+                                   BIGNUM * x, BIGNUM * y, BN_CTX * ctx)
+{
+    BN_ULONG z_inv2[P256_LIMBS];
+    BN_ULONG z_inv3[P256_LIMBS];
+    BN_ULONG x_aff[P256_LIMBS];
+    BN_ULONG y_aff[P256_LIMBS];
+    BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS];
+
+    if (EC_POINT_is_at_infinity(group, point)) {
+        ECerr(EC_F_NISTZ256_GET_AFFINE_COORDINATES, EC_R_POINT_AT_INFINITY);
+        return 0;
+    }
+
+    if (!ecp_nistz256_bignum_to_field_elem(point_x, &point->X) ||
+        !ecp_nistz256_bignum_to_field_elem(point_y, &point->Y) ||
+        !ecp_nistz256_bignum_to_field_elem(point_z, &point->Z)) {
+        ECerr(EC_F_NISTZ256_GET_AFFINE_COORDINATES,
+              EC_R_COORDINATES_OUT_OF_RANGE);
+        return 0;
+    }
+
+    ecp_nistz256_mod_inverse(z_inv3, point_z);  /* z_inv3 = Z^-1 */
+    ecp_nistz256_sqr_mont(z_inv2, z_inv3);      /* z_inv2 = Z^-2 */
+    ecp_nistz256_mul_mont(x_aff, z_inv2, point_x);
+
+    if (x != NULL) {
+        if (bn_wexpand(x, P256_LIMBS) == NULL) {
+            ECerr(EC_F_NISTZ256_GET_AFFINE_COORDINATES, ERR_R_MALLOC_FAILURE);
+            return 0;
+        }
+        x->top = P256_LIMBS;
+        ecp_nistz256_from_mont(x->d, x_aff);
+        bn_correct_top(x);
+    }
+
+    if (y != NULL) {
+        ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);  /* Z^-3 */
+        ecp_nistz256_mul_mont(y_aff, z_inv3, point_y);
+        if (bn_wexpand(y, P256_LIMBS) == NULL) {
+            ECerr(EC_F_NISTZ256_GET_AFFINE_COORDINATES, ERR_R_MALLOC_FAILURE);
+            return 0;
+        }
+        y->top = P256_LIMBS;
+        ecp_nistz256_from_mont(y->d, y_aff);
+        bn_correct_top(y);
+    }
+
+    return 1;
+}
+
+/* Allocate an empty, single-reference EC_PRE_COMP bound to |group|.
+ * Returns NULL on allocation failure or when |group| is NULL. */
+static EC_PRE_COMP *ec_pre_comp_new(const EC_GROUP * group)
+{
+    EC_PRE_COMP *ret;
+
+    if (group == NULL)
+        return NULL;
+
+    ret = (EC_PRE_COMP *) OPENSSL_malloc(sizeof(EC_PRE_COMP));
+    if (ret == NULL) {
+        ECerr(EC_F_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
+        return NULL;
+    }
+
+    /* No table yet; default window width 6; one reference held. */
+    ret->group = group;
+    ret->w = 6;                 /* default */
+    ret->precomp = NULL;
+    ret->precomp_storage = NULL;
+    ret->references = 1;
+
+    return ret;
+}
+
+static void *ec_pre_comp_dup(void *src_)
+{
+    /* These objects are immutable once published, so "duplicating" is
+     * just taking another reference on the shared instance. */
+    EC_PRE_COMP *pre = src_;
+
+    CRYPTO_add(&pre->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
+    return pre;
+}
+
+/* Drop one reference to |pre_|; the last holder frees the storage. */
+static void ec_pre_comp_free(void *pre_)
+{
+    EC_PRE_COMP *pre = pre_;
+
+    if (pre == NULL)
+        return;
+
+    if (CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
+        return;
+
+    if (pre->precomp_storage != NULL)
+        OPENSSL_free(pre->precomp_storage);
+
+    OPENSSL_free(pre);
+}
+
+/* Like ec_pre_comp_free, but scrubs the table before releasing it.
+ *
+ * Fix vs. the original: the cleanse length was computed as
+ * 32 * (1 << pre->w) * 2 * 37, which with w == 7 (as set by
+ * ecp_nistz256_mult_precompute) is 303104 bytes — twice the
+ * 37 * 64 * sizeof(P256_POINT_AFFINE) actually allocated, i.e. an
+ * out-of-bounds write past the table.  The table layout is fixed at
+ * 37 rows of 64 affine points, so derive the length from that. */
+static void ec_pre_comp_clear_free(void *pre_)
+{
+    int i;
+    EC_PRE_COMP *pre = pre_;
+
+    if (!pre)
+        return;
+
+    i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+    if (i > 0)
+        return;
+
+    if (pre->precomp_storage) {
+        OPENSSL_cleanse(pre->precomp,
+                        37 * 64 * sizeof(P256_POINT_AFFINE));
+        OPENSSL_free(pre->precomp_storage);
+    }
+    OPENSSL_cleanse(pre, sizeof *pre);
+    OPENSSL_free(pre);
+}
+
+/* Report whether generator multiplication can use a precomputed table.
+ * (The original carried the same comment twice; deduplicated.) */
+static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP * group)
+{
+    const EC_POINT *generator = EC_GROUP_get0_generator(group);
+
+    if (generator != NULL && ecp_nistz256_is_affine_G(generator)) {
+        /* There is a hard-coded table for the default generator. */
+        return 1;
+    }
+
+    /* Otherwise a table exists only if precompute_mult stored one. */
+    return EC_EX_DATA_get_data(group->extra_data, ec_pre_comp_dup,
+                               ec_pre_comp_free,
+                               ec_pre_comp_clear_free) != NULL;
+}
+
+/*
+ * EC_GFp_nistz256_method returns the EC_METHOD for this optimized P-256
+ * implementation.  Group setup and field arithmetic are delegated to the
+ * generic Montgomery/simple code; only point multiplication,
+ * precomputation and affine-coordinate extraction are overridden here.
+ */
+const EC_METHOD *EC_GFp_nistz256_method(void)
+{
+    static const EC_METHOD ret = {
+        EC_FLAGS_DEFAULT_OCT,
+        NID_X9_62_prime_field,
+        ec_GFp_mont_group_init,
+        ec_GFp_mont_group_finish,
+        ec_GFp_mont_group_clear_finish,
+        ec_GFp_mont_group_copy,
+        ec_GFp_mont_group_set_curve,
+        ec_GFp_simple_group_get_curve,
+        ec_GFp_simple_group_get_degree,
+        ec_GFp_simple_group_check_discriminant,
+        ec_GFp_simple_point_init,
+        ec_GFp_simple_point_finish,
+        ec_GFp_simple_point_clear_finish,
+        ec_GFp_simple_point_copy,
+        ec_GFp_simple_point_set_to_infinity,
+        ec_GFp_simple_set_Jprojective_coordinates_GFp,
+        ec_GFp_simple_get_Jprojective_coordinates_GFp,
+        ec_GFp_simple_point_set_affine_coordinates,
+        ecp_nistz256_get_affine,
+        0, 0, 0,                                    /* unused slots */
+        ec_GFp_simple_add,
+        ec_GFp_simple_dbl,
+        ec_GFp_simple_invert,
+        ec_GFp_simple_is_at_infinity,
+        ec_GFp_simple_is_on_curve,
+        ec_GFp_simple_cmp,
+        ec_GFp_simple_make_affine,
+        ec_GFp_simple_points_make_affine,
+        ecp_nistz256_points_mul,                    /* mul */
+        ecp_nistz256_mult_precompute,               /* precompute_mult */
+        ecp_nistz256_window_have_precompute_mult,   /* have_precompute_mult */
+        ec_GFp_mont_field_mul,
+        ec_GFp_mont_field_sqr,
+        0,                                          /* field_div */
+        ec_GFp_mont_field_encode,
+        ec_GFp_mont_field_decode,
+        ec_GFp_mont_field_set_to_one
+    };
+
+    return &ret;
+}
diff --git a/crypto/ec/ecp_nistz256_table.c b/crypto/ec/ecp_nistz256_table.c
new file mode 100644 (file)
index 0000000..c5bd839
--- /dev/null
@@ -0,0 +1,9534 @@
+/* This is the precomputed constant time access table for the code in
+ * ecp_nistz256.c, for the default generator.
+ *
+ * The table consists of 37 subtables, each subtable contains 64 affine points.
+ * The affine points are encoded as eight uint64's, four for the x coordinate
+ * and four for the y. Both values are in little-endian order.
+ *
+ * There are 37 tables because a signed, 6-bit wNAF form of the scalar is used
+ * and ceil(256/(6 + 1)) = 37. Within each table there are 64 values because
+ * the 6-bit wNAF value can take 64 values, ignoring the sign bit, which is
+ * implemented by performing a negation of the affine point when required.
+ *
+ * We would like to align it to 2MB in order to increase the chances of using a
+ * large page but that appears to lead to invalid ELF files being produced. */
+
+#if defined(__GNUC__)
+__attribute((aligned(4096)))
+#elif defined(_MSC_VER)
+__declspec(align(4096))
+#elif defined(__SUNPRO_C)
+# pragma align 4096(ecp_nistz256_precomputed)
+#endif
+static const BN_ULONG ecp_nistz256_precomputed[37][64 *
+                                                   sizeof(P256_POINT_AFFINE) /
+                                                   sizeof(BN_ULONG)] = {
+    {TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601),
+     TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6),
+     TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c),
+     TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85),
+     TOBN(0x850046d4, 0x10ddd64d), TOBN(0xaa6ae3c1, 0xa433827d),
+     TOBN(0x73220503, 0x8d1490d9), TOBN(0xf6bb32e4, 0x3dcf3a3b),
+     TOBN(0x2f3648d3, 0x61bee1a5), TOBN(0x152cd7cb, 0xeb236ff8),
+     TOBN(0x19a8fb0e, 0x92042dbe), TOBN(0x78c57751, 0x0a5b8a3b),
+     TOBN(0xffac3f90, 0x4eebc127), TOBN(0xb027f84a, 0x087d81fb),
+     TOBN(0x66ad77dd, 0x87cbbc98), TOBN(0x26936a3f, 0xb6ff747e),
+     TOBN(0xb04c5c1f, 0xc983a7eb), TOBN(0x583e47ad, 0x0861fe1a),
+     TOBN(0x78820831, 0x1a2ee98e), TOBN(0xd5f06a29, 0xe587cc07),
+     TOBN(0x74b0b50d, 0x46918dcc), TOBN(0x4650a6ed, 0xc623c173),
+     TOBN(0x0cdaacac, 0xe8100af2), TOBN(0x577362f5, 0x41b0176b),
+     TOBN(0x2d96f24c, 0xe4cbaba6), TOBN(0x17628471, 0xfad6f447),
+     TOBN(0x6b6c36de, 0xe5ddd22e), TOBN(0x84b14c39, 0x4c5ab863),
+     TOBN(0xbe1b8aae, 0xc45c61f5), TOBN(0x90ec649a, 0x94b9537d),
+     TOBN(0x941cb5aa, 0xd076c20c), TOBN(0xc9079605, 0x890523c8),
+     TOBN(0xeb309b4a, 0xe7ba4f10), TOBN(0x73c568ef, 0xe5eb882b),
+     TOBN(0x3540a987, 0x7e7a1f68), TOBN(0x73a076bb, 0x2dd1e916),
+     TOBN(0x40394737, 0x3e77664a), TOBN(0x55ae744f, 0x346cee3e),
+     TOBN(0xd50a961a, 0x5b17a3ad), TOBN(0x13074b59, 0x54213673),
+     TOBN(0x93d36220, 0xd377e44b), TOBN(0x299c2b53, 0xadff14b5),
+     TOBN(0xf424d44c, 0xef639f11), TOBN(0xa4c9916d, 0x4a07f75f),
+     TOBN(0x0746354e, 0xa0173b4f), TOBN(0x2bd20213, 0xd23c00f7),
+     TOBN(0xf43eaab5, 0x0c23bb08), TOBN(0x13ba5119, 0xc3123e03),
+     TOBN(0x2847d030, 0x3f5b9d4d), TOBN(0x6742f2f2, 0x5da67bdd),
+     TOBN(0xef933bdc, 0x77c94195), TOBN(0xeaedd915, 0x6e240867),
+     TOBN(0x27f14cd1, 0x9499a78f), TOBN(0x462ab5c5, 0x6f9b3455),
+     TOBN(0x8f90f02a, 0xf02cfc6b), TOBN(0xb763891e, 0xb265230d),
+     TOBN(0xf59da3a9, 0x532d4977), TOBN(0x21e3327d, 0xcf9eba15),
+     TOBN(0x123c7b84, 0xbe60bbf0), TOBN(0x56ec12f2, 0x7706df76),
+     TOBN(0x75c96e8f, 0x264e20e8), TOBN(0xabe6bfed, 0x59a7a841),
+     TOBN(0x2cc09c04, 0x44c8eb00), TOBN(0xe05b3080, 0xf0c4e16b),
+     TOBN(0x1eb7777a, 0xa45f3314), TOBN(0x56af7bed, 0xce5d45e3),
+     TOBN(0x2b6e019a, 0x88b12f1a), TOBN(0x086659cd, 0xfd835f9b),
+     TOBN(0x2c18dbd1, 0x9dc21ec8), TOBN(0x98f9868a, 0x0fcf8139),
+     TOBN(0x737d2cd6, 0x48250b49), TOBN(0xcc61c947, 0x24b3428f),
+     TOBN(0x0c2b4078, 0x80dd9e76), TOBN(0xc43a8991, 0x383fbe08),
+     TOBN(0x5f7d2d65, 0x779be5d2), TOBN(0x78719a54, 0xeb3b4ab5),
+     TOBN(0xea7d260a, 0x6245e404), TOBN(0x9de40795, 0x6e7fdfe0),
+     TOBN(0x1ff3a415, 0x8dac1ab5), TOBN(0x3e7090f1, 0x649c9073),
+     TOBN(0x1a768561, 0x2b944e88), TOBN(0x250f939e, 0xe57f61c8),
+     TOBN(0x0c0daa89, 0x1ead643d), TOBN(0x68930023, 0xe125b88e),
+     TOBN(0x04b71aa7, 0xd2697768), TOBN(0xabdedef5, 0xca345a33),
+     TOBN(0x2409d29d, 0xee37385e), TOBN(0x4ee1df77, 0xcb83e156),
+     TOBN(0x0cac12d9, 0x1cbb5b43), TOBN(0x170ed2f6, 0xca895637),
+     TOBN(0x28228cfa, 0x8ade6d66), TOBN(0x7ff57c95, 0x53238aca),
+     TOBN(0xccc42563, 0x4b2ed709), TOBN(0x0e356769, 0x856fd30d),
+     TOBN(0xbcbcd43f, 0x559e9811), TOBN(0x738477ac, 0x5395b759),
+     TOBN(0x35752b90, 0xc00ee17f), TOBN(0x68748390, 0x742ed2e3),
+     TOBN(0x7cd06422, 0xbd1f5bc1), TOBN(0xfbc08769, 0xc9e7b797),
+     TOBN(0xa242a35b, 0xb0cf664a), TOBN(0x126e48f7, 0x7f9707e3),
+     TOBN(0x1717bf54, 0xc6832660), TOBN(0xfaae7332, 0xfd12c72e),
+     TOBN(0x27b52db7, 0x995d586b), TOBN(0xbe29569e, 0x832237c2),
+     TOBN(0xe8e4193e, 0x2a65e7db), TOBN(0x152706dc, 0x2eaa1bbb),
+     TOBN(0x72bcd8b7, 0xbc60055b), TOBN(0x03cc23ee, 0x56e27e4b),
+     TOBN(0xee337424, 0xe4819370), TOBN(0xe2aa0e43, 0x0ad3da09),
+     TOBN(0x40b8524f, 0x6383c45d), TOBN(0xd7663554, 0x42a41b25),
+     TOBN(0x64efa6de, 0x778a4797), TOBN(0x2042170a, 0x7079adf4),
+     TOBN(0x808b0b65, 0x0bc6fb80), TOBN(0x5882e075, 0x3ffe2e6b),
+     TOBN(0xd5ef2f7c, 0x2c83f549), TOBN(0x54d63c80, 0x9103b723),
+     TOBN(0xf2f11bd6, 0x52a23f9b), TOBN(0x3670c319, 0x4b0b6587),
+     TOBN(0x55c4623b, 0xb1580e9e), TOBN(0x64edf7b2, 0x01efe220),
+     TOBN(0x97091dcb, 0xd53c5c9d), TOBN(0xf17624b6, 0xac0a177b),
+     TOBN(0xb0f13975, 0x2cfe2dff), TOBN(0xc1a35c0a, 0x6c7a574e),
+     TOBN(0x227d3146, 0x93e79987), TOBN(0x0575bf30, 0xe89cb80e),
+     TOBN(0x2f4e247f, 0x0d1883bb), TOBN(0xebd51226, 0x3274c3d0),
+     TOBN(0x5f3e51c8, 0x56ada97a), TOBN(0x4afc964d, 0x8f8b403e),
+     TOBN(0xa6f247ab, 0x412e2979), TOBN(0x675abd1b, 0x6f80ebda),
+     TOBN(0x66a2bd72, 0x5e485a1d), TOBN(0x4b2a5caf, 0x8f4f0b3c),
+     TOBN(0x2626927f, 0x1b847bba), TOBN(0x6c6fc7d9, 0x0502394d),
+     TOBN(0xfea912ba, 0xa5659ae8), TOBN(0x68363aba, 0x25e1a16e),
+     TOBN(0xb8842277, 0x752c41ac), TOBN(0xfe545c28, 0x2897c3fc),
+     TOBN(0x2d36e9e7, 0xdc4c696b), TOBN(0x5806244a, 0xfba977c5),
+     TOBN(0x85665e9b, 0xe39508c1), TOBN(0xf720ee25, 0x6d12597b),
+     TOBN(0x8a979129, 0xd2337a31), TOBN(0x5916868f, 0x0f862bdc),
+     TOBN(0x048099d9, 0x5dd283ba), TOBN(0xe2d1eeb6, 0xfe5bfb4e),
+     TOBN(0x82ef1c41, 0x7884005d), TOBN(0xa2d4ec17, 0xffffcbae),
+     TOBN(0x9161c53f, 0x8aa95e66), TOBN(0x5ee104e1, 0xc5fee0d0),
+     TOBN(0x562e4cec, 0xc135b208), TOBN(0x74e1b265, 0x4783f47d),
+     TOBN(0x6d2a506c, 0x5a3f3b30), TOBN(0xecead9f4, 0xc16762fc),
+     TOBN(0xf29dd4b2, 0xe286e5b9), TOBN(0x1b0fadc0, 0x83bb3c61),
+     TOBN(0x7a75023e, 0x7fac29a4), TOBN(0xc086d5f1, 0xc9477fa3),
+     TOBN(0x0fc61135, 0x2f6f3076), TOBN(0xc99ffa23, 0xe3912a9a),
+     TOBN(0x6a0b0685, 0xd2f8ba3d), TOBN(0xfdc777e8, 0xe93358a4),
+     TOBN(0x94a787bb, 0x35415f04), TOBN(0x640c2d6a, 0x4d23fea4),
+     TOBN(0x9de917da, 0x153a35b5), TOBN(0x793e8d07, 0x5d5cd074),
+     TOBN(0xf4f87653, 0x2de45068), TOBN(0x37c7a7e8, 0x9e2e1f6e),
+     TOBN(0xd0825fa2, 0xa3584069), TOBN(0xaf2cea7c, 0x1727bf42),
+     TOBN(0x0360a4fb, 0x9e4785a9), TOBN(0xe5fda49c, 0x27299f4a),
+     TOBN(0x48068e13, 0x71ac2f71), TOBN(0x83d0687b, 0x9077666f),
+     TOBN(0x6d3883b2, 0x15d02819), TOBN(0x6d0d7550, 0x40dd9a35),
+     TOBN(0x61d7cbf9, 0x1d2b469f), TOBN(0xf97b232f, 0x2efc3115),
+     TOBN(0xa551d750, 0xb24bcbc7), TOBN(0x11ea4949, 0x88a1e356),
+     TOBN(0x7669f031, 0x93cb7501), TOBN(0x595dc55e, 0xca737b8a),
+     TOBN(0xa4a319ac, 0xd837879f), TOBN(0x6fc1b49e, 0xed6b67b0),
+     TOBN(0xe3959933, 0x32f1f3af), TOBN(0x966742eb, 0x65432a2e),
+     TOBN(0x4b8dc9fe, 0xb4966228), TOBN(0x96cc6312, 0x43f43950),
+     TOBN(0x12068859, 0xc9b731ee), TOBN(0x7b948dc3, 0x56f79968),
+     TOBN(0x61e4ad32, 0xed1f8008), TOBN(0xe6c9267a, 0xd8b17538),
+     TOBN(0x1ac7c5eb, 0x857ff6fb), TOBN(0x994baaa8, 0x55f2fb10),
+     TOBN(0x84cf14e1, 0x1d248018), TOBN(0x5a39898b, 0x628ac508),
+     TOBN(0x14fde97b, 0x5fa944f5), TOBN(0xed178030, 0xd12e5ac7),
+     TOBN(0x042c2af4, 0x97e2feb4), TOBN(0xd36a42d7, 0xaebf7313),
+     TOBN(0x49d2c9eb, 0x084ffdd7), TOBN(0x9f8aa54b, 0x2ef7c76a),
+     TOBN(0x9200b7ba, 0x09895e70), TOBN(0x3bd0c66f, 0xddb7fb58),
+     TOBN(0x2d97d108, 0x78eb4cbb), TOBN(0x2d431068, 0xd84bde31),
+     TOBN(0x4b523eb7, 0x172ccd1f), TOBN(0x7323cb28, 0x30a6a892),
+     TOBN(0x97082ec0, 0xcfe153eb), TOBN(0xe97f6b6a, 0xf2aadb97),
+     TOBN(0x1d3d393e, 0xd1a83da1), TOBN(0xa6a7f9c7, 0x804b2a68),
+     TOBN(0x4a688b48, 0x2d0cb71e), TOBN(0xa9b4cc5f, 0x40585278),
+     TOBN(0x5e5db46a, 0xcb66e132), TOBN(0xf1be963a, 0x0d925880),
+     TOBN(0x944a7027, 0x0317b9e2), TOBN(0xe266f959, 0x48603d48),
+     TOBN(0x98db6673, 0x5c208899), TOBN(0x90472447, 0xa2fb18a3),
+     TOBN(0x8a966939, 0x777c619f), TOBN(0x3798142a, 0x2a3be21b),
+     TOBN(0xb4241cb1, 0x3298b343), TOBN(0xa3a14e49, 0xb44f65a1),
+     TOBN(0xc5f4d6cd, 0x3ac77acd), TOBN(0xd0288cb5, 0x52b6fc3c),
+     TOBN(0xd5cc8c2f, 0x1c040abc), TOBN(0xb675511e, 0x06bf9b4a),
+     TOBN(0xd667da37, 0x9b3aa441), TOBN(0x460d45ce, 0x51601f72),
+     TOBN(0xe2f73c69, 0x6755ff89), TOBN(0xdd3cf7e7, 0x473017e6),
+     TOBN(0x8ef5689d, 0x3cf7600d), TOBN(0x948dc4f8, 0xb1fc87b4),
+     TOBN(0xd9e9fe81, 0x4ea53299), TOBN(0x2d921ca2, 0x98eb6028),
+     TOBN(0xfaecedfd, 0x0c9803fc), TOBN(0xf38ae891, 0x4d7b4745),
+     TOBN(0xd8c5fccf, 0xc5e3a3d8), TOBN(0xbefd904c, 0x4079dfbf),
+     TOBN(0xbc6d6a58, 0xfead0197), TOBN(0x39227077, 0x695532a4),
+     TOBN(0x09e23e6d, 0xdbef42f5), TOBN(0x7e449b64, 0x480a9908),
+     TOBN(0x7b969c1a, 0xad9a2e40), TOBN(0x6231d792, 0x9591c2a4),
+     TOBN(0x87151456, 0x0f664534), TOBN(0x85ceae7c, 0x4b68f103),
+     TOBN(0xac09c4ae, 0x65578ab9), TOBN(0x33ec6868, 0xf044b10c),
+     TOBN(0x6ac4832b, 0x3a8ec1f1), TOBN(0x5509d128, 0x5847d5ef),
+     TOBN(0xf909604f, 0x763f1574), TOBN(0xb16c4303, 0xc32f63c4),
+     TOBN(0xb6ab2014, 0x7ca23cd3), TOBN(0xcaa7a5c6, 0xa391849d),
+     TOBN(0x5b0673a3, 0x75678d94), TOBN(0xc982ddd4, 0xdd303e64),
+     TOBN(0xfd7b000b, 0x5db6f971), TOBN(0xbba2cb1f, 0x6f876f92),
+     TOBN(0xc77332a3, 0x3c569426), TOBN(0xa159100c, 0x570d74f8),
+     TOBN(0xfd16847f, 0xdec67ef5), TOBN(0x742ee464, 0x233e76b7),
+     TOBN(0x0b8e4134, 0xefc2b4c8), TOBN(0xca640b86, 0x42a3e521),
+     TOBN(0x653a0190, 0x8ceb6aa9), TOBN(0x313c300c, 0x547852d5),
+     TOBN(0x24e4ab12, 0x6b237af7), TOBN(0x2ba90162, 0x8bb47af8),
+     TOBN(0x3d5e58d6, 0xa8219bb7), TOBN(0xc691d0bd, 0x1b06c57f),
+     TOBN(0x0ae4cb10, 0xd257576e), TOBN(0x3569656c, 0xd54a3dc3),
+     TOBN(0xe5ebaebd, 0x94cda03a), TOBN(0x934e82d3, 0x162bfe13),
+     TOBN(0x450ac0ba, 0xe251a0c6), TOBN(0x480b9e11, 0xdd6da526),
+     TOBN(0x00467bc5, 0x8cce08b5), TOBN(0xb636458c, 0x7f178d55),
+     TOBN(0xc5748bae, 0xa677d806), TOBN(0x2763a387, 0xdfa394eb),
+     TOBN(0xa12b448a, 0x7d3cebb6), TOBN(0xe7adda3e, 0x6f20d850),
+     TOBN(0xf63ebce5, 0x1558462c), TOBN(0x58b36143, 0x620088a8),
+     TOBN(0x8a2cc3ca, 0x4d63c0ee), TOBN(0x51233117, 0x0fe948ce),
+     TOBN(0x7463fd85, 0x222ef33b), TOBN(0xadf0c7dc, 0x7c603d6c),
+     TOBN(0x0ec32d3b, 0xfe7765e5), TOBN(0xccaab359, 0xbf380409),
+     TOBN(0xbdaa84d6, 0x8e59319c), TOBN(0xd9a4c280, 0x9c80c34d),
+     TOBN(0xa9d89488, 0xa059c142), TOBN(0x6f5ae714, 0xff0b9346),
+     TOBN(0x068f237d, 0x16fb3664), TOBN(0x5853e4c4, 0x363186ac),
+     TOBN(0xe2d87d23, 0x63c52f98), TOBN(0x2ec4a766, 0x81828876),
+     TOBN(0x47b864fa, 0xe14e7b1c), TOBN(0x0c0bc0e5, 0x69192408),
+     TOBN(0xe4d7681d, 0xb82e9f3e), TOBN(0x83200f0b, 0xdf25e13c),
+     TOBN(0x8909984c, 0x66f27280), TOBN(0x462d7b00, 0x75f73227),
+     TOBN(0xd90ba188, 0xf2651798), TOBN(0x74c6e18c, 0x36ab1c34),
+     TOBN(0xab256ea3, 0x5ef54359), TOBN(0x03466612, 0xd1aa702f),
+     TOBN(0x624d6049, 0x2ed22e91), TOBN(0x6fdfe0b5, 0x6f072822),
+     TOBN(0xeeca1115, 0x39ce2271), TOBN(0x98100a4f, 0xdb01614f),
+     TOBN(0xb6b0daa2, 0xa35c628f), TOBN(0xb6f94d2e, 0xc87e9a47),
+     TOBN(0xc6773259, 0x1d57d9ce), TOBN(0xf70bfeec, 0x03884a7b),
+     TOBN(0x5fb35ccf, 0xed2bad01), TOBN(0xa155cbe3, 0x1da6a5c7),
+     TOBN(0xc2e2594c, 0x30a92f8f), TOBN(0x649c89ce, 0x5bfafe43),
+     TOBN(0xd158667d, 0xe9ff257a), TOBN(0x9b359611, 0xf32c50ae),
+     TOBN(0x4b00b20b, 0x906014cf), TOBN(0xf3a8cfe3, 0x89bc7d3d),
+     TOBN(0x4ff23ffd, 0x248a7d06), TOBN(0x80c5bfb4, 0x878873fa),
+     TOBN(0xb7d9ad90, 0x05745981), TOBN(0x179c85db, 0x3db01994),
+     TOBN(0xba41b062, 0x61a6966c), TOBN(0x4d82d052, 0xeadce5a8),
+     TOBN(0x9e91cd3b, 0xa5e6a318), TOBN(0x47795f4f, 0x95b2dda0),
+     TOBN(0xecfd7c1f, 0xd55a897c), TOBN(0x009194ab, 0xb29110fb),
+     TOBN(0x5f0e2046, 0xe381d3b0), TOBN(0x5f3425f6, 0xa98dd291),
+     TOBN(0xbfa06687, 0x730d50da), TOBN(0x0423446c, 0x4b083b7f),
+     TOBN(0x397a247d, 0xd69d3417), TOBN(0xeb629f90, 0x387ba42a),
+     TOBN(0x1ee426cc, 0xd5cd79bf), TOBN(0x0032940b, 0x946c6e18),
+     TOBN(0x1b1e8ae0, 0x57477f58), TOBN(0xe94f7d34, 0x6d823278),
+     TOBN(0xc747cb96, 0x782ba21a), TOBN(0xc5254469, 0xf72b33a5),
+     TOBN(0x772ef6de, 0xc7f80c81), TOBN(0xd73acbfe, 0x2cd9e6b5),
+     TOBN(0x4075b5b1, 0x49ee90d9), TOBN(0x785c339a, 0xa06e9eba),
+     TOBN(0xa1030d5b, 0xabf825e0), TOBN(0xcec684c3, 0xa42931dc),
+     TOBN(0x42ab62c9, 0xc1586e63), TOBN(0x45431d66, 0x5ab43f2b),
+     TOBN(0x57c8b2c0, 0x55f7835d), TOBN(0x033da338, 0xc1b7f865),
+     TOBN(0x283c7513, 0xcaa76097), TOBN(0x0a624fa9, 0x36c83906),
+     TOBN(0x6b20afec, 0x715af2c7), TOBN(0x4b969974, 0xeba78bfd),
+     TOBN(0x220755cc, 0xd921d60e), TOBN(0x9b944e10, 0x7baeca13),
+     TOBN(0x04819d51, 0x5ded93d4), TOBN(0x9bbff86e, 0x6dddfd27),
+     TOBN(0x6b344130, 0x77adc612), TOBN(0xa7496529, 0xbbd803a0),
+     TOBN(0x1a1baaa7, 0x6d8805bd), TOBN(0xc8403902, 0x470343ad),
+     TOBN(0x39f59f66, 0x175adff1), TOBN(0x0b26d7fb, 0xb7d8c5b7),
+     TOBN(0xa875f5ce, 0x529d75e3), TOBN(0x85efc7e9, 0x41325cc2),
+     TOBN(0x21950b42, 0x1ff6acd3), TOBN(0xffe70484, 0x53dc6909),
+     TOBN(0xff4cd0b2, 0x28766127), TOBN(0xabdbe608, 0x4fb7db2b),
+     TOBN(0x837c9228, 0x5e1109e8), TOBN(0x26147d27, 0xf4645b5a),
+     TOBN(0x4d78f592, 0xf7818ed8), TOBN(0xd394077e, 0xf247fa36),
+     TOBN(0x0fb9c2d0, 0x488c171a), TOBN(0xa78bfbaa, 0x13685278),
+     TOBN(0xedfbe268, 0xd5b1fa6a), TOBN(0x0dceb8db, 0x2b7eaba7),
+     TOBN(0xbf9e8089, 0x9ae2b710), TOBN(0xefde7ae6, 0xa4449c96),
+     TOBN(0x43b7716b, 0xcc143a46), TOBN(0xd7d34194, 0xc3628c13),
+     TOBN(0x508cec1c, 0x3b3f64c9), TOBN(0xe20bc0ba, 0x1e5edf3f),
+     TOBN(0xda1deb85, 0x2f4318d4), TOBN(0xd20ebe0d, 0x5c3fa443),
+     TOBN(0x370b4ea7, 0x73241ea3), TOBN(0x61f1511c, 0x5e1a5f65),
+     TOBN(0x99a5e23d, 0x82681c62), TOBN(0xd731e383, 0xa2f54c2d),
+     TOBN(0x2692f36e, 0x83445904), TOBN(0x2e0ec469, 0xaf45f9c0),
+     TOBN(0x905a3201, 0xc67528b7), TOBN(0x88f77f34, 0xd0e5e542),
+     TOBN(0xf67a8d29, 0x5864687c), TOBN(0x23b92eae, 0x22df3562),
+     TOBN(0x5c27014b, 0x9bbec39e), TOBN(0x7ef2f226, 0x9c0f0f8d),
+     TOBN(0x97359638, 0x546c4d8d), TOBN(0x5f9c3fc4, 0x92f24679),
+     TOBN(0x912e8bed, 0xa8c8acd9), TOBN(0xec3a318d, 0x306634b0),
+     TOBN(0x80167f41, 0xc31cb264), TOBN(0x3db82f6f, 0x522113f2),
+     TOBN(0xb155bcd2, 0xdcafe197), TOBN(0xfba1da59, 0x43465283),
+     TOBN(0xa0425b8e, 0xb212cf53), TOBN(0x4f2e512e, 0xf8557c5f),
+     TOBN(0xc1286ff9, 0x25c4d56c), TOBN(0xbb8a0fea, 0xee26c851),
+     TOBN(0xc28f70d2, 0xe7d6107e), TOBN(0x7ee0c444, 0xe76265aa),
+     TOBN(0x3df277a4, 0x1d1936b1), TOBN(0x1a556e3f, 0xea9595eb),
+     TOBN(0x258bbbf9, 0xe7305683), TOBN(0x31eea5bf, 0x07ef5be6),
+     TOBN(0x0deb0e4a, 0x46c814c1), TOBN(0x5cee8449, 0xa7b730dd),
+     TOBN(0xeab495c5, 0xa0182bde), TOBN(0xee759f87, 0x9e27a6b4),
+     TOBN(0xc2cf6a68, 0x80e518ca), TOBN(0x25e8013f, 0xf14cf3f4),
+     TOBN(0x8fc44140, 0x7e8d7a14), TOBN(0xbb1ff3ca, 0x9556f36a),
+     TOBN(0x6a844385, 0x14600044), TOBN(0xba3f0c4a, 0x7451ae63),
+     TOBN(0xdfcac25b, 0x1f9af32a), TOBN(0x01e0db86, 0xb1f2214b),
+     TOBN(0x4e9a5bc2, 0xa4b596ac), TOBN(0x83927681, 0x026c2c08),
+     TOBN(0x3ec832e7, 0x7acaca28), TOBN(0x1bfeea57, 0xc7385b29),
+     TOBN(0x068212e3, 0xfd1eaf38), TOBN(0xc1329830, 0x6acf8ccc),
+     TOBN(0xb909f2db, 0x2aac9e59), TOBN(0x5748060d, 0xb661782a),
+     TOBN(0xc5ab2632, 0xc79b7a01), TOBN(0xda44c6c6, 0x00017626),
+     TOBN(0xf26c00e8, 0xa7ea82f0), TOBN(0x99cac80d, 0xe4299aaf),
+     TOBN(0xd66fe3b6, 0x7ed78be1), TOBN(0x305f725f, 0x648d02cd),
+     TOBN(0x33ed1bc4, 0x623fb21b), TOBN(0xfa70533e, 0x7a6319ad),
+     TOBN(0x17ab562d, 0xbe5ffb3e), TOBN(0x06374994, 0x56674741),
+     TOBN(0x69d44ed6, 0x5c46aa8e), TOBN(0x2100d5d3, 0xa8d063d1),
+     TOBN(0xcb9727ea, 0xa2d17c36), TOBN(0x4c2bab1b, 0x8add53b7),
+     TOBN(0xa084e90c, 0x15426704), TOBN(0x778afcd3, 0xa837ebea),
+     TOBN(0x6651f701, 0x7ce477f8), TOBN(0xa0624998, 0x46fb7a8b),
+     TOBN(0xdc1e6828, 0xed8a6e19), TOBN(0x33fc2336, 0x4189d9c7),
+     TOBN(0x026f8fe2, 0x671c39bc), TOBN(0xd40c4ccd, 0xbc6f9915),
+     TOBN(0xafa135bb, 0xf80e75ca), TOBN(0x12c651a0, 0x22adff2c),
+     TOBN(0xc40a04bd, 0x4f51ad96), TOBN(0x04820109, 0xbbe4e832),
+     TOBN(0x3667eb1a, 0x7f4c04cc), TOBN(0x59556621, 0xa9404f84),
+     TOBN(0x71cdf653, 0x7eceb50a), TOBN(0x994a44a6, 0x9b8335fa),
+     TOBN(0xd7faf819, 0xdbeb9b69), TOBN(0x473c5680, 0xeed4350d),
+     TOBN(0xb6658466, 0xda44bba2), TOBN(0x0d1bc780, 0x872bdbf3),
+     TOBN(0xe535f175, 0xa1962f91), TOBN(0x6ed7e061, 0xed58f5a7),
+     TOBN(0x177aa4c0, 0x2089a233), TOBN(0x0dbcb03a, 0xe539b413),
+     TOBN(0xe3dc424e, 0xbb32e38e), TOBN(0x6472e5ef, 0x6806701e),
+     TOBN(0xdd47ff98, 0x814be9ee), TOBN(0x6b60cfff, 0x35ace009),
+     TOBN(0xb8d3d931, 0x9ff91fe5), TOBN(0x039c4800, 0xf0518eed),
+     TOBN(0x95c37632, 0x9182cb26), TOBN(0x0763a434, 0x82fc568d),
+     TOBN(0x707c04d5, 0x383e76ba), TOBN(0xac98b930, 0x824e8197),
+     TOBN(0x92bf7c8f, 0x91230de0), TOBN(0x90876a01, 0x40959b70),
+     TOBN(0xdb6d96f3, 0x05968b80), TOBN(0x380a0913, 0x089f73b9),
+     TOBN(0x7da70b83, 0xc2c61e01), TOBN(0x95fb8394, 0x569b38c7),
+     TOBN(0x9a3c6512, 0x80edfe2f), TOBN(0x8f726bb9, 0x8faeaf82),
+     TOBN(0x8010a4a0, 0x78424bf8), TOBN(0x29672044, 0x0e844970)}
+    ,
+    {TOBN(0x63c5cb81, 0x7a2ad62a), TOBN(0x7ef2b6b9, 0xac62ff54),
+     TOBN(0x3749bba4, 0xb3ad9db5), TOBN(0xad311f2c, 0x46d5a617),
+     TOBN(0xb77a8087, 0xc2ff3b6d), TOBN(0xb46feaf3, 0x367834ff),
+     TOBN(0xf8aa266d, 0x75d6b138), TOBN(0xfa38d320, 0xec008188),
+     TOBN(0x486d8ffa, 0x696946fc), TOBN(0x50fbc6d8, 0xb9cba56d),
+     TOBN(0x7e3d423e, 0x90f35a15), TOBN(0x7c3da195, 0xc0dd962c),
+     TOBN(0xe673fdb0, 0x3cfd5d8b), TOBN(0x0704b7c2, 0x889dfca5),
+     TOBN(0xf6ce581f, 0xf52305aa), TOBN(0x399d49eb, 0x914d5e53),
+     TOBN(0x380a496d, 0x6ec293cd), TOBN(0x733dbda7, 0x8e7051f5),
+     TOBN(0x037e388d, 0xb849140a), TOBN(0xee4b32b0, 0x5946dbf6),
+     TOBN(0xb1c4fda9, 0xcae368d1), TOBN(0x5001a7b0, 0xfdb0b2f3),
+     TOBN(0x6df59374, 0x2e3ac46e), TOBN(0x4af675f2, 0x39b3e656),
+     TOBN(0x44e38110, 0x39949296), TOBN(0x5b63827b, 0x361db1b5),
+     TOBN(0x3e5323ed, 0x206eaff5), TOBN(0x942370d2, 0xc21f4290),
+     TOBN(0xf2caaf2e, 0xe0d985a1), TOBN(0x192cc64b, 0x7239846d),
+     TOBN(0x7c0b8f47, 0xae6312f8), TOBN(0x7dc61f91, 0x96620108),
+     TOBN(0xb830fb5b, 0xc2da7de9), TOBN(0xd0e643df, 0x0ff8d3be),
+     TOBN(0x31ee77ba, 0x188a9641), TOBN(0x4e8aa3aa, 0xbcf6d502),
+     TOBN(0xf9fb6532, 0x9a49110f), TOBN(0xd18317f6, 0x2dd6b220),
+     TOBN(0x7e3ced41, 0x52c3ea5a), TOBN(0x0d296a14, 0x7d579c4a),
+     TOBN(0x35d6a53e, 0xed4c3717), TOBN(0x9f8240cf, 0x3d0ed2a3),
+     TOBN(0x8c0d4d05, 0xe5543aa5), TOBN(0x45d5bbfb, 0xdd33b4b4),
+     TOBN(0xfa04cc73, 0x137fd28e), TOBN(0x862ac6ef, 0xc73b3ffd),
+     TOBN(0x403ff9f5, 0x31f51ef2), TOBN(0x34d5e0fc, 0xbc73f5a2),
+     TOBN(0xf2526820, 0x08913f4f), TOBN(0xea20ed61, 0xeac93d95),
+     TOBN(0x51ed38b4, 0x6ca6b26c), TOBN(0x8662dcbc, 0xea4327b0),
+     TOBN(0x6daf295c, 0x725d2aaa), TOBN(0xbad2752f, 0x8e52dcda),
+     TOBN(0x2210e721, 0x0b17dacc), TOBN(0xa37f7912, 0xd51e8232),
+     TOBN(0x4f7081e1, 0x44cc3add), TOBN(0xd5ffa1d6, 0x87be82cf),
+     TOBN(0x89890b6c, 0x0edd6472), TOBN(0xada26e1a, 0x3ed17863),
+     TOBN(0x276f2715, 0x63483caa), TOBN(0xe6924cd9, 0x2f6077fd),
+     TOBN(0x05a7fe98, 0x0a466e3c), TOBN(0xf1c794b0, 0xb1902d1f),
+     TOBN(0xe5213688, 0x82a8042c), TOBN(0xd931cfaf, 0xcd278298),
+     TOBN(0x069a0ae0, 0xf597a740), TOBN(0x0adbb3f3, 0xeb59107c),
+     TOBN(0x983e951e, 0x5eaa8eb8), TOBN(0xe663a8b5, 0x11b48e78),
+     TOBN(0x1631cc0d, 0x8a03f2c5), TOBN(0x7577c11e, 0x11e271e2),
+     TOBN(0x33b2385c, 0x08369a90), TOBN(0x2990c59b, 0x190eb4f8),
+     TOBN(0x819a6145, 0xc68eac80), TOBN(0x7a786d62, 0x2ec4a014),
+     TOBN(0x33faadbe, 0x20ac3a8d), TOBN(0x31a21781, 0x5aba2d30),
+     TOBN(0x209d2742, 0xdba4f565), TOBN(0xdb2ce9e3, 0x55aa0fbb),
+     TOBN(0x8cef334b, 0x168984df), TOBN(0xe81dce17, 0x33879638),
+     TOBN(0xf6e6949c, 0x263720f0), TOBN(0x5c56feaf, 0xf593cbec),
+     TOBN(0x8bff5601, 0xfde58c84), TOBN(0x74e24117, 0x2eccb314),
+     TOBN(0xbcf01b61, 0x4c9a8a78), TOBN(0xa233e35e, 0x544c9868),
+     TOBN(0xb3156bf3, 0x8bd7aff1), TOBN(0x1b5ee4cb, 0x1d81b146),
+     TOBN(0x7ba1ac41, 0xd628a915), TOBN(0x8f3a8f9c, 0xfd89699e),
+     TOBN(0x7329b9c9, 0xa0748be7), TOBN(0x1d391c95, 0xa92e621f),
+     TOBN(0xe51e6b21, 0x4d10a837), TOBN(0xd255f53a, 0x4947b435),
+     TOBN(0x07669e04, 0xf1788ee3), TOBN(0xc14f27af, 0xa86938a2),
+     TOBN(0x8b47a334, 0xe93a01c0), TOBN(0xff627438, 0xd9366808),
+     TOBN(0x7a0985d8, 0xca2a5965), TOBN(0x3d9a5542, 0xd6e9b9b3),
+     TOBN(0xc23eb80b, 0x4cf972e8), TOBN(0x5c1c33bb, 0x4fdf72fd),
+     TOBN(0x0c4a58d4, 0x74a86108), TOBN(0xf8048a8f, 0xee4c5d90),
+     TOBN(0xe3c7c924, 0xe86d4c80), TOBN(0x28c889de, 0x056a1e60),
+     TOBN(0x57e2662e, 0xb214a040), TOBN(0xe8c48e98, 0x37e10347),
+     TOBN(0x87742862, 0x80ac748a), TOBN(0xf1c24022, 0x186b06f2),
+     TOBN(0xac2dd4c3, 0x5f74040a), TOBN(0x409aeb71, 0xfceac957),
+     TOBN(0x4fbad782, 0x55c4ec23), TOBN(0xb359ed61, 0x8a7b76ec),
+     TOBN(0x12744926, 0xed6f4a60), TOBN(0xe21e8d7f, 0x4b912de3),
+     TOBN(0xe2575a59, 0xfc705a59), TOBN(0x72f1d4de, 0xed2dbc0e),
+     TOBN(0x3d2b24b9, 0xeb7926b8), TOBN(0xbff88cb3, 0xcdbe5509),
+     TOBN(0xd0f399af, 0xe4dd640b), TOBN(0x3c5fe130, 0x2f76ed45),
+     TOBN(0x6f3562f4, 0x3764fb3d), TOBN(0x7b5af318, 0x3151b62d),
+     TOBN(0xd5bd0bc7, 0xd79ce5f3), TOBN(0xfdaf6b20, 0xec66890f),
+     TOBN(0x735c67ec, 0x6063540c), TOBN(0x50b259c2, 0xe5f9cb8f),
+     TOBN(0xb8734f9a, 0x3f99c6ab), TOBN(0xf8cc13d5, 0xa3a7bc85),
+     TOBN(0x80c1b305, 0xc5217659), TOBN(0xfe5364d4, 0x4ec12a54),
+     TOBN(0xbd87045e, 0x681345fe), TOBN(0x7f8efeb1, 0x582f897f),
+     TOBN(0xe8cbf1e5, 0xd5923359), TOBN(0xdb0cea9d, 0x539b9fb0),
+     TOBN(0x0c5b34cf, 0x49859b98), TOBN(0x5e583c56, 0xa4403cc6),
+     TOBN(0x11fc1a2d, 0xd48185b7), TOBN(0xc93fbc7e, 0x6e521787),
+     TOBN(0x47e7a058, 0x05105b8b), TOBN(0x7b4d4d58, 0xdb8260c8),
+     TOBN(0xe33930b0, 0x46eb842a), TOBN(0x8e844a9a, 0x7bdae56d),
+     TOBN(0x34ef3a9e, 0x13f7fdfc), TOBN(0xb3768f82, 0x636ca176),
+     TOBN(0x2821f4e0, 0x4e09e61c), TOBN(0x414dc3a1, 0xa0c7cddc),
+     TOBN(0xd5379437, 0x54945fcd), TOBN(0x151b6eef, 0xb3555ff1),
+     TOBN(0xb31bd613, 0x6339c083), TOBN(0x39ff8155, 0xdfb64701),
+     TOBN(0x7c3388d2, 0xe29604ab), TOBN(0x1e19084b, 0xa6b10442),
+     TOBN(0x17cf54c0, 0xeccd47ef), TOBN(0x89693385, 0x4a5dfb30),
+     TOBN(0x69d023fb, 0x47daf9f6), TOBN(0x9222840b, 0x7d91d959),
+     TOBN(0x439108f5, 0x803bac62), TOBN(0x0b7dd91d, 0x379bd45f),
+     TOBN(0xd651e827, 0xca63c581), TOBN(0x5c5d75f6, 0x509c104f),
+     TOBN(0x7d5fc738, 0x1f2dc308), TOBN(0x20faa7bf, 0xd98454be),
+     TOBN(0x95374bee, 0xa517b031), TOBN(0xf036b9b1, 0x642692ac),
+     TOBN(0xc5106109, 0x39842194), TOBN(0xb7e2353e, 0x49d05295),
+     TOBN(0xfc8c1d5c, 0xefb42ee0), TOBN(0xe04884eb, 0x08ce811c),
+     TOBN(0xf1f75d81, 0x7419f40e), TOBN(0x5b0ac162, 0xa995c241),
+     TOBN(0x120921bb, 0xc4c55646), TOBN(0x713520c2, 0x8d33cf97),
+     TOBN(0xb4a65a5c, 0xe98c5100), TOBN(0x6cec871d, 0x2ddd0f5a),
+     TOBN(0x251f0b7f, 0x9ba2e78b), TOBN(0x224a8434, 0xce3a2a5f),
+     TOBN(0x26827f61, 0x25f5c46f), TOBN(0x6a22bedc, 0x48545ec0),
+     TOBN(0x25ae5fa0, 0xb1bb5cdc), TOBN(0xd693682f, 0xfcb9b98f),
+     TOBN(0x32027fe8, 0x91e5d7d3), TOBN(0xf14b7d17, 0x73a07678),
+     TOBN(0xf88497b3, 0xc0dfdd61), TOBN(0xf7c2eec0, 0x2a8c4f48),
+     TOBN(0xaa5573f4, 0x3756e621), TOBN(0xc013a240, 0x1825b948),
+     TOBN(0x1c03b345, 0x63878572), TOBN(0xa0472bea, 0x653a4184),
+     TOBN(0xf4222e27, 0x0ac69a80), TOBN(0x34096d25, 0xf51e54f6),
+     TOBN(0x00a648cb, 0x8fffa591), TOBN(0x4e87acdc, 0x69b6527f),
+     TOBN(0x0575e037, 0xe285ccb4), TOBN(0x188089e4, 0x50ddcf52),
+     TOBN(0xaa96c9a8, 0x870ff719), TOBN(0x74a56cd8, 0x1fc7e369),
+     TOBN(0x41d04ee2, 0x1726931a), TOBN(0x0bbbb2c8, 0x3660ecfd),
+     TOBN(0xa6ef6de5, 0x24818e18), TOBN(0xe421cc51, 0xe7d57887),
+     TOBN(0xf127d208, 0xbea87be6), TOBN(0x16a475d3, 0xb1cdd682),
+     TOBN(0x9db1b684, 0x439b63f7), TOBN(0x5359b3db, 0xf0f113b6),
+     TOBN(0xdfccf1de, 0x8bf06e31), TOBN(0x1fdf8f44, 0xdd383901),
+     TOBN(0x10775cad, 0x5017e7d2), TOBN(0xdfc3a597, 0x58d11eef),
+     TOBN(0x6ec9c8a0, 0xb1ecff10), TOBN(0xee6ed6cc, 0x28400549),
+     TOBN(0xb5ad7bae, 0x1b4f8d73), TOBN(0x61b4f11d, 0xe00aaab9),
+     TOBN(0x7b32d69b, 0xd4eff2d7), TOBN(0x88ae6771, 0x4288b60f),
+     TOBN(0x159461b4, 0x37a1e723), TOBN(0x1f3d4789, 0x570aae8c),
+     TOBN(0x869118c0, 0x7f9871da), TOBN(0x35fbda78, 0xf635e278),
+     TOBN(0x738f3641, 0xe1541dac), TOBN(0x6794b13a, 0xc0dae45f),
+     TOBN(0x065064ac, 0x09cc0917), TOBN(0x27c53729, 0xc68540fd),
+     TOBN(0x0d2d4c8e, 0xef227671), TOBN(0xd23a9f80, 0xa1785a04),
+     TOBN(0x98c59528, 0x52650359), TOBN(0xfa09ad01, 0x74a1acad),
+     TOBN(0x082d5a29, 0x0b55bf5c), TOBN(0xa40f1c67, 0x419b8084),
+     TOBN(0x3a5c752e, 0xdcc18770), TOBN(0x4baf1f2f, 0x8825c3a5),
+     TOBN(0xebd63f74, 0x21b153ed), TOBN(0xa2383e47, 0xb2f64723),
+     TOBN(0xe7bf620a, 0x2646d19a), TOBN(0x56cb44ec, 0x03c83ffd),
+     TOBN(0xaf7267c9, 0x4f6be9f1), TOBN(0x8b2dfd7b, 0xc06bb5e9),
+     TOBN(0xb87072f2, 0xa672c5c7), TOBN(0xeacb11c8, 0x0d53c5e2),
+     TOBN(0x22dac29d, 0xff435932), TOBN(0x37bdb99d, 0x4408693c),
+     TOBN(0xf6e62fb6, 0x2899c20f), TOBN(0x3535d512, 0x447ece24),
+     TOBN(0xfbdc6b88, 0xff577ce3), TOBN(0x726693bd, 0x190575f2),
+     TOBN(0x6772b0e5, 0xab4b35a2), TOBN(0x1d8b6001, 0xf5eeaacf),
+     TOBN(0x728f7ce4, 0x795b9580), TOBN(0x4a20ed2a, 0x41fb81da),
+     TOBN(0x9f685cd4, 0x4fec01e6), TOBN(0x3ed7ddcc, 0xa7ff50ad),
+     TOBN(0x460fd264, 0x0c2d97fd), TOBN(0x3a241426, 0xeb82f4f9),
+     TOBN(0x17d1df2c, 0x6a8ea820), TOBN(0xb2b50d3b, 0xf22cc254),
+     TOBN(0x03856cba, 0xb7291426), TOBN(0x87fd26ae, 0x04f5ee39),
+     TOBN(0x9cb696cc, 0x02bee4ba), TOBN(0x53121804, 0x06820fd6),
+     TOBN(0xa5dfc269, 0x0212e985), TOBN(0x666f7ffa, 0x160f9a09),
+     TOBN(0xc503cd33, 0xbccd9617), TOBN(0x365dede4, 0xba7730a3),
+     TOBN(0x798c6355, 0x5ddb0786), TOBN(0xa6c3200e, 0xfc9cd3bc),
+     TOBN(0x060ffb2c, 0xe5e35efd), TOBN(0x99a4e25b, 0x5555a1c1),
+     TOBN(0x11d95375, 0xf70b3751), TOBN(0x0a57354a, 0x160e1bf6),
+     TOBN(0xecb3ae4b, 0xf8e4b065), TOBN(0x07a834c4, 0x2e53022b),
+     TOBN(0x1cd300b3, 0x8692ed96), TOBN(0x16a6f792, 0x61ee14ec),
+     TOBN(0x8f1063c6, 0x6a8649ed), TOBN(0xfbcdfcfe, 0x869f3e14),
+     TOBN(0x2cfb97c1, 0x00a7b3ec), TOBN(0xcea49b3c, 0x7130c2f1),
+     TOBN(0x462d044f, 0xe9d96488), TOBN(0x4b53d52e, 0x8182a0c1),
+     TOBN(0x84b6ddd3, 0x0391e9e9), TOBN(0x80ab7b48, 0xb1741a09),
+     TOBN(0xec0e15d4, 0x27d3317f), TOBN(0x8dfc1ddb, 0x1a64671e),
+     TOBN(0x93cc5d5f, 0xd49c5b92), TOBN(0xc995d53d, 0x3674a331),
+     TOBN(0x302e41ec, 0x090090ae), TOBN(0x2278a0cc, 0xedb06830),
+     TOBN(0x1d025932, 0xfbc99690), TOBN(0x0c32fbd2, 0xb80d68da),
+     TOBN(0xd79146da, 0xf341a6c1), TOBN(0xae0ba139, 0x1bef68a0),
+     TOBN(0xc6b8a563, 0x8d774b3a), TOBN(0x1cf307bd, 0x880ba4d7),
+     TOBN(0xc033bdc7, 0x19803511), TOBN(0xa9f97b3b, 0x8888c3be),
+     TOBN(0x3d68aebc, 0x85c6d05e), TOBN(0xc3b88a9d, 0x193919eb),
+     TOBN(0x2d300748, 0xc48b0ee3), TOBN(0x7506bc7c, 0x07a746c1),
+     TOBN(0xfc48437c, 0x6e6d57f3), TOBN(0x5bd71587, 0xcfeaa91a),
+     TOBN(0xa4ed0408, 0xc1bc5225), TOBN(0xd0b946db, 0x2719226d),
+     TOBN(0x109ecd62, 0x758d2d43), TOBN(0x75c8485a, 0x2751759b),
+     TOBN(0xb0b75f49, 0x9ce4177a), TOBN(0x4fa61a1e, 0x79c10c3d),
+     TOBN(0xc062d300, 0xa167fcd7), TOBN(0x4df3874c, 0x750f0fa8),
+     TOBN(0x29ae2cf9, 0x83dfedc9), TOBN(0xf8437134, 0x8d87631a),
+     TOBN(0xaf571711, 0x7429c8d2), TOBN(0x18d15867, 0x146d9272),
+     TOBN(0x83053ecf, 0x69769bb7), TOBN(0xc55eb856, 0xc479ab82),
+     TOBN(0x5ef7791c, 0x21b0f4b2), TOBN(0xaa5956ba, 0x3d491525),
+     TOBN(0x407a96c2, 0x9fe20eba), TOBN(0xf27168bb, 0xe52a5ad3),
+     TOBN(0x43b60ab3, 0xbf1d9d89), TOBN(0xe45c51ef, 0x710e727a),
+     TOBN(0xdfca5276, 0x099b4221), TOBN(0x8dc6407c, 0x2557a159),
+     TOBN(0x0ead8335, 0x91035895), TOBN(0x0a9db957, 0x9c55dc32),
+     TOBN(0xe40736d3, 0xdf61bc76), TOBN(0x13a619c0, 0x3f778cdb),
+     TOBN(0x6dd921a4, 0xc56ea28f), TOBN(0x76a52433, 0x2fa647b4),
+     TOBN(0x23591891, 0xac5bdc5d), TOBN(0xff4a1a72, 0xbac7dc01),
+     TOBN(0x9905e261, 0x62df8453), TOBN(0x3ac045df, 0xe63b265f),
+     TOBN(0x8a3f341b, 0xad53dba7), TOBN(0x8ec269cc, 0x837b625a),
+     TOBN(0xd71a2782, 0x3ae31189), TOBN(0x8fb4f9a3, 0x55e96120),
+     TOBN(0x804af823, 0xff9875cf), TOBN(0x23224f57, 0x5d442a9b),
+     TOBN(0x1c4d3b9e, 0xecc62679), TOBN(0x91da22fb, 0xa0e7ddb1),
+     TOBN(0xa370324d, 0x6c04a661), TOBN(0x9710d3b6, 0x5e376d17),
+     TOBN(0xed8c98f0, 0x3044e357), TOBN(0xc364ebbe, 0x6422701c),
+     TOBN(0x347f5d51, 0x7733d61c), TOBN(0xd55644b9, 0xcea826c3),
+     TOBN(0x80c6e0ad, 0x55a25548), TOBN(0x0aa7641d, 0x844220a7),
+     TOBN(0x1438ec81, 0x31810660), TOBN(0x9dfa6507, 0xde4b4043),
+     TOBN(0x10b515d8, 0xcc3e0273), TOBN(0x1b6066dd, 0x28d8cfb2),
+     TOBN(0xd3b04591, 0x9c9efebd), TOBN(0x425d4bdf, 0xa21c1ff4),
+     TOBN(0x5fe5af19, 0xd57607d3), TOBN(0xbbf773f7, 0x54481084),
+     TOBN(0x8435bd69, 0x94b03ed1), TOBN(0xd9ad1de3, 0x634cc546),
+     TOBN(0x2cf423fc, 0x00e420ca), TOBN(0xeed26d80, 0xa03096dd),
+     TOBN(0xd7f60be7, 0xa4db09d2), TOBN(0xf47f569d, 0x960622f7),
+     TOBN(0xe5925fd7, 0x7296c729), TOBN(0xeff2db26, 0x26ca2715),
+     TOBN(0xa6fcd014, 0xb913e759), TOBN(0x53da4786, 0x8ff4de93),
+     TOBN(0x14616d79, 0xc32068e1), TOBN(0xb187d664, 0xccdf352e),
+     TOBN(0xf7afb650, 0x1dc90b59), TOBN(0x8170e943, 0x7daa1b26),
+     TOBN(0xc8e3bdd8, 0x700c0a84), TOBN(0x6e8d345f, 0x6482bdfa),
+     TOBN(0x84cfbfa1, 0xc5c5ea50), TOBN(0xd3baf14c, 0x67960681),
+     TOBN(0x26398403, 0x0dd50942), TOBN(0xe4b7839c, 0x4716a663),
+     TOBN(0xd5f1f794, 0xe7de6dc0), TOBN(0x5cd0f4d4, 0x622aa7ce),
+     TOBN(0x5295f3f1, 0x59acfeec), TOBN(0x8d933552, 0x953e0607),
+     TOBN(0xc7db8ec5, 0x776c5722), TOBN(0xdc467e62, 0x2b5f290c),
+     TOBN(0xd4297e70, 0x4ff425a9), TOBN(0x4be924c1, 0x0cf7bb72),
+     TOBN(0x0d5dc5ae, 0xa1892131), TOBN(0x8bf8a8e3, 0xa705c992),
+     TOBN(0x73a0b064, 0x7a305ac5), TOBN(0x00c9ca4e, 0x9a8c77a8),
+     TOBN(0x5dfee80f, 0x83774bdd), TOBN(0x63131602, 0x85734485),
+     TOBN(0xa1b524ae, 0x914a69a9), TOBN(0xebc2ffaf, 0xd4e300d7),
+     TOBN(0x52c93db7, 0x7cfa46a5), TOBN(0x71e6161f, 0x21653b50),
+     TOBN(0x3574fc57, 0xa4bc580a), TOBN(0xc09015dd, 0xe1bc1253),
+     TOBN(0x4b7b47b2, 0xd174d7aa), TOBN(0x4072d8e8, 0xf3a15d04),
+     TOBN(0xeeb7d47f, 0xd6fa07ed), TOBN(0x6f2b9ff9, 0xedbdafb1),
+     TOBN(0x18c51615, 0x3760fe8a), TOBN(0x7a96e6bf, 0xf06c6c13),
+     TOBN(0x4d7a0410, 0x0ea2d071), TOBN(0xa1914e9b, 0x0be2a5ce),
+     TOBN(0x5726e357, 0xd8a3c5cf), TOBN(0x1197ecc3, 0x2abb2b13),
+     TOBN(0x6c0d7f7f, 0x31ae88dd), TOBN(0x15b20d1a, 0xfdbb3efe),
+     TOBN(0xcd06aa26, 0x70584039), TOBN(0x2277c969, 0xa7dc9747),
+     TOBN(0xbca69587, 0x7855d815), TOBN(0x899ea238, 0x5188b32a),
+     TOBN(0x37d9228b, 0x760c1c9d), TOBN(0xc7efbb11, 0x9b5c18da),
+     TOBN(0x7f0d1bc8, 0x19f6dbc5), TOBN(0x4875384b, 0x07e6905b),
+     TOBN(0xc7c50baa, 0x3ba8cd86), TOBN(0xb0ce40fb, 0xc2905de0),
+     TOBN(0x70840673, 0x7a231952), TOBN(0xa912a262, 0xcf43de26),
+     TOBN(0x9c38ddcc, 0xeb5b76c1), TOBN(0x746f5285, 0x26fc0ab4),
+     TOBN(0x52a63a50, 0xd62c269f), TOBN(0x60049c55, 0x99458621),
+     TOBN(0xe7f48f82, 0x3c2f7c9e), TOBN(0x6bd99043, 0x917d5cf3),
+     TOBN(0xeb1317a8, 0x8701f469), TOBN(0xbd3fe2ed, 0x9a449fe0),
+     TOBN(0x421e79ca, 0x12ef3d36), TOBN(0x9ee3c36c, 0x3e7ea5de),
+     TOBN(0xe48198b5, 0xcdff36f7), TOBN(0xaff4f967, 0xc6b82228),
+     TOBN(0x15e19dd0, 0xc47adb7e), TOBN(0x45699b23, 0x032e7dfa),
+     TOBN(0x40680c8b, 0x1fae026a), TOBN(0x5a347a48, 0x550dbf4d),
+     TOBN(0xe652533b, 0x3cef0d7d), TOBN(0xd94f7b18, 0x2bbb4381),
+     TOBN(0x838752be, 0x0e80f500), TOBN(0x8e6e2488, 0x9e9c9bfb),
+     TOBN(0xc9751697, 0x16caca6a), TOBN(0x866c49d8, 0x38531ad9),
+     TOBN(0xc917e239, 0x7151ade1), TOBN(0x2d016ec1, 0x6037c407),
+     TOBN(0xa407ccc9, 0x00eac3f9), TOBN(0x835f6280, 0xe2ed4748),
+     TOBN(0xcc54c347, 0x1cc98e0d), TOBN(0x0e969937, 0xdcb572eb),
+     TOBN(0x1b16c8e8, 0x8f30c9cb), TOBN(0xa606ae75, 0x373c4661),
+     TOBN(0x47aa689b, 0x35502cab), TOBN(0xf89014ae, 0x4d9bb64f),
+     TOBN(0x202f6a9c, 0x31c71f7b), TOBN(0x01f95aa3, 0x296ffe5c),
+     TOBN(0x5fc06014, 0x53cec3a3), TOBN(0xeb991237, 0x5f498a45),
+     TOBN(0xae9a935e, 0x5d91ba87), TOBN(0xc6ac6281, 0x0b564a19),
+     TOBN(0x8a8fe81c, 0x3bd44e69), TOBN(0x7c8b467f, 0x9dd11d45),
+     TOBN(0xf772251f, 0xea5b8e69), TOBN(0xaeecb3bd, 0xc5b75fbc),
+     TOBN(0x1aca3331, 0x887ff0e5), TOBN(0xbe5d49ff, 0x19f0a131),
+     TOBN(0x582c13aa, 0xe5c8646f), TOBN(0xdbaa12e8, 0x20e19980),
+     TOBN(0x8f40f31a, 0xf7abbd94), TOBN(0x1f13f5a8, 0x1dfc7663),
+     TOBN(0x5d81f1ee, 0xaceb4fc0), TOBN(0x36256002, 0x5e6f0f42),
+     TOBN(0x4b67d6d7, 0x751370c8), TOBN(0x2608b698, 0x03e80589),
+     TOBN(0xcfc0d2fc, 0x05268301), TOBN(0xa6943d39, 0x40309212),
+     TOBN(0x192a90c2, 0x1fd0e1c2), TOBN(0xb209f113, 0x37f1dc76),
+     TOBN(0xefcc5e06, 0x97bf1298), TOBN(0xcbdb6730, 0x219d639e),
+     TOBN(0xd009c116, 0xb81e8c6f), TOBN(0xa3ffdde3, 0x1a7ce2e5),
+     TOBN(0xc53fbaaa, 0xa914d3ba), TOBN(0x836d500f, 0x88df85ee),
+     TOBN(0xd98dc71b, 0x66ee0751), TOBN(0x5a3d7005, 0x714516fd),
+     TOBN(0x21d3634d, 0x39eedbba), TOBN(0x35cd2e68, 0x0455a46d),
+     TOBN(0xc8cafe65, 0xf9d7eb0c), TOBN(0xbda3ce9e, 0x00cefb3e),
+     TOBN(0xddc17a60, 0x2c9cf7a4), TOBN(0x01572ee4, 0x7bcb8773),
+     TOBN(0xa92b2b01, 0x8c7548df), TOBN(0x732fd309, 0xa84600e3),
+     TOBN(0xe22109c7, 0x16543a40), TOBN(0x9acafd36, 0xfede3c6c),
+     TOBN(0xfb206852, 0x6824e614), TOBN(0x2a4544a9, 0xda25dca0),
+     TOBN(0x25985262, 0x91d60b06), TOBN(0x281b7be9, 0x28753545),
+     TOBN(0xec667b1a, 0x90f13b27), TOBN(0x33a83aff, 0x940e2eb4),
+     TOBN(0x80009862, 0xd5d721d5), TOBN(0x0c3357a3, 0x5bd3a182),
+     TOBN(0x27f3a83b, 0x7aa2cda4), TOBN(0xb58ae74e, 0xf6f83085),
+     TOBN(0x2a911a81, 0x2e6dad6b), TOBN(0xde286051, 0xf43d6c5b),
+     TOBN(0x4bdccc41, 0xf996c4d8), TOBN(0xe7312ec0, 0x0ae1e24e)}
+    ,
+    {TOBN(0xf8d112e7, 0x6e6485b3), TOBN(0x4d3e24db, 0x771c52f8),
+     TOBN(0x48e3ee41, 0x684a2f6d), TOBN(0x7161957d, 0x21d95551),
+     TOBN(0x19631283, 0xcdb12a6c), TOBN(0xbf3fa882, 0x2e50e164),
+     TOBN(0xf6254b63, 0x3166cc73), TOBN(0x3aefa7ae, 0xaee8cc38),
+     TOBN(0x79b0fe62, 0x3b36f9fd), TOBN(0x26543b23, 0xfde19fc0),
+     TOBN(0x136e64a0, 0x958482ef), TOBN(0x23f63771, 0x9b095825),
+     TOBN(0x14cfd596, 0xb6a1142e), TOBN(0x5ea6aac6, 0x335aac0b),
+     TOBN(0x86a0e8bd, 0xf3081dd5), TOBN(0x5fb89d79, 0x003dc12a),
+     TOBN(0xf615c33a, 0xf72e34d4), TOBN(0x0bd9ea40, 0x110eec35),
+     TOBN(0x1c12bc5b, 0xc1dea34e), TOBN(0x686584c9, 0x49ae4699),
+     TOBN(0x13ad95d3, 0x8c97b942), TOBN(0x4609561a, 0x4e5c7562),
+     TOBN(0x9e94a4ae, 0xf2737f89), TOBN(0xf57594c6, 0x371c78b6),
+     TOBN(0x0f0165fc, 0xe3779ee3), TOBN(0xe00e7f9d, 0xbd495d9e),
+     TOBN(0x1fa4efa2, 0x20284e7a), TOBN(0x4564bade, 0x47ac6219),
+     TOBN(0x90e6312a, 0xc4708e8e), TOBN(0x4f5725fb, 0xa71e9adf),
+     TOBN(0xe95f55ae, 0x3d684b9f), TOBN(0x47f7ccb1, 0x1e94b415),
+     TOBN(0x7322851b, 0x8d946581), TOBN(0xf0d13133, 0xbdf4a012),
+     TOBN(0xa3510f69, 0x6584dae0), TOBN(0x03a7c171, 0x3c9f6c6d),
+     TOBN(0x5be97f38, 0xe475381a), TOBN(0xca1ba422, 0x85823334),
+     TOBN(0xf83cc5c7, 0x0be17dda), TOBN(0x158b1494, 0x0b918c0f),
+     TOBN(0xda3a77e5, 0x522e6b69), TOBN(0x69c908c3, 0xbbcd6c18),
+     TOBN(0x1f1b9e48, 0xd924fd56), TOBN(0x37c64e36, 0xaa4bb3f7),
+     TOBN(0x5a4fdbdf, 0xee478d7d), TOBN(0xba75c8bc, 0x0193f7a0),
+     TOBN(0x84bc1e84, 0x56cd16df), TOBN(0x1fb08f08, 0x46fad151),
+     TOBN(0x8a7cabf9, 0x842e9f30), TOBN(0xa331d4bf, 0x5eab83af),
+     TOBN(0xd272cfba, 0x017f2a6a), TOBN(0x27560abc, 0x83aba0e3),
+     TOBN(0x94b83387, 0x0e3a6b75), TOBN(0x25c6aea2, 0x6b9f50f5),
+     TOBN(0x803d691d, 0xb5fdf6d0), TOBN(0x03b77509, 0xe6333514),
+     TOBN(0x36178903, 0x61a341c1), TOBN(0x3604dc60, 0x0cfd6142),
+     TOBN(0x022295eb, 0x8533316c), TOBN(0x3dbde4ac, 0x44af2922),
+     TOBN(0x898afc5d, 0x1c7eef69), TOBN(0x58896805, 0xd14f4fa1),
+     TOBN(0x05002160, 0x203c21ca), TOBN(0x6f0d1f30, 0x40ef730b),
+     TOBN(0x8e8c44d4, 0x196224f8), TOBN(0x75a4ab95, 0x374d079d),
+     TOBN(0x79085ecc, 0x7d48f123), TOBN(0x56f04d31, 0x1bf65ad8),
+     TOBN(0xe220bf1c, 0xbda602b2), TOBN(0x73ee1742, 0xf9612c69),
+     TOBN(0x76008fc8, 0x084fd06b), TOBN(0x4000ef9f, 0xf11380d1),
+     TOBN(0x48201b4b, 0x12cfe297), TOBN(0x3eee129c, 0x292f74e5),
+     TOBN(0xe1fe114e, 0xc9e874e8), TOBN(0x899b055c, 0x92c5fc41),
+     TOBN(0x4e477a64, 0x3a39c8cf), TOBN(0x82f09efe, 0x78963cc9),
+     TOBN(0x6fd3fd8f, 0xd333f863), TOBN(0x85132b2a, 0xdc949c63),
+     TOBN(0x7e06a3ab, 0x516eb17b), TOBN(0x73bec06f, 0xd2c7372b),
+     TOBN(0xe4f74f55, 0xba896da6), TOBN(0xbb4afef8, 0x8e9eb40f),
+     TOBN(0x2d75bec8, 0xe61d66b0), TOBN(0x02bda4b4, 0xef29300b),
+     TOBN(0x8bbaa8de, 0x026baa5a), TOBN(0xff54befd, 0xa07f4440),
+     TOBN(0xbd9b8b1d, 0xbe7a2af3), TOBN(0xec51caa9, 0x4fb74a72),
+     TOBN(0xb9937a4b, 0x63879697), TOBN(0x7c9a9d20, 0xec2687d5),
+     TOBN(0x1773e44f, 0x6ef5f014), TOBN(0x8abcf412, 0xe90c6900),
+     TOBN(0x387bd022, 0x8142161e), TOBN(0x50393755, 0xfcb6ff2a),
+     TOBN(0x9813fd56, 0xed6def63), TOBN(0x53cf6482, 0x7d53106c),
+     TOBN(0x991a35bd, 0x431f7ac1), TOBN(0xf1e274dd, 0x63e65faf),
+     TOBN(0xf63ffa3c, 0x44cc7880), TOBN(0x411a426b, 0x7c256981),
+     TOBN(0xb698b9fd, 0x93a420e0), TOBN(0x89fdddc0, 0xae53f8fe),
+     TOBN(0x766e0722, 0x32398baa), TOBN(0x205fee42, 0x5cfca031),
+     TOBN(0xa49f5341, 0x7a029cf2), TOBN(0xa88c68b8, 0x4023890d),
+     TOBN(0xbc275041, 0x7337aaa8), TOBN(0x9ed364ad, 0x0eb384f4),
+     TOBN(0xe0816f85, 0x29aba92f), TOBN(0x2e9e1941, 0x04e38a88),
+     TOBN(0x57eef44a, 0x3dafd2d5), TOBN(0x35d1fae5, 0x97ed98d8),
+     TOBN(0x50628c09, 0x2307f9b1), TOBN(0x09d84aae, 0xd6cba5c6),
+     TOBN(0x67071bc7, 0x88aaa691), TOBN(0x2dea57a9, 0xafe6cb03),
+     TOBN(0xdfe11bb4, 0x3d78ac01), TOBN(0x7286418c, 0x7fd7aa51),
+     TOBN(0xfabf7709, 0x77f7195a), TOBN(0x8ec86167, 0xadeb838f),
+     TOBN(0xea1285a8, 0xbb4f012d), TOBN(0xd6883503, 0x9a3eab3f),
+     TOBN(0xee5d24f8, 0x309004c2), TOBN(0xa96e4b76, 0x13ffe95e),
+     TOBN(0x0cdffe12, 0xbd223ea4), TOBN(0x8f5c2ee5, 0xb6739a53),
+     TOBN(0x5cb4aaa5, 0xdd968198), TOBN(0xfa131c52, 0x72413a6c),
+     TOBN(0x53d46a90, 0x9536d903), TOBN(0xb270f0d3, 0x48606d8e),
+     TOBN(0x518c7564, 0xa053a3bc), TOBN(0x088254b7, 0x1a86caef),
+     TOBN(0xb3ba8cb4, 0x0ab5efd0), TOBN(0x5c59900e, 0x4605945d),
+     TOBN(0xecace1dd, 0xa1887395), TOBN(0x40960f36, 0x932a65de),
+     TOBN(0x9611ff5c, 0x3aa95529), TOBN(0xc58215b0, 0x7c1e5a36),
+     TOBN(0xd48c9b58, 0xf0e1a524), TOBN(0xb406856b, 0xf590dfb8),
+     TOBN(0xc7605e04, 0x9cd95662), TOBN(0x0dd036ee, 0xa33ecf82),
+     TOBN(0xa50171ac, 0xc33156b3), TOBN(0xf09d24ea, 0x4a80172e),
+     TOBN(0x4e1f72c6, 0x76dc8eef), TOBN(0xe60caadc, 0x5e3d44ee),
+     TOBN(0x006ef8a6, 0x979b1d8f), TOBN(0x60908a1c, 0x97788d26),
+     TOBN(0x6e08f95b, 0x266feec0), TOBN(0x618427c2, 0x22e8c94e),
+     TOBN(0x3d613339, 0x59145a65), TOBN(0xcd9bc368, 0xfa406337),
+     TOBN(0x82d11be3, 0x2d8a52a0), TOBN(0xf6877b27, 0x97a1c590),
+     TOBN(0x837a819b, 0xf5cbdb25), TOBN(0x2a4fd1d8, 0xde090249),
+     TOBN(0x622a7de7, 0x74990e5f), TOBN(0x840fa5a0, 0x7945511b),
+     TOBN(0x30b974be, 0x6558842d), TOBN(0x70df8c64, 0x17f3d0a6),
+     TOBN(0x7c803520, 0x7542e46d), TOBN(0x7251fe7f, 0xe4ecc823),
+     TOBN(0xe59134cb, 0x5e9aac9a), TOBN(0x11bb0934, 0xf0045d71),
+     TOBN(0x53e5d9b5, 0xdbcb1d4e), TOBN(0x8d97a905, 0x92defc91),
+     TOBN(0xfe289327, 0x7946d3f9), TOBN(0xe132bd24, 0x07472273),
+     TOBN(0xeeeb510c, 0x1eb6ae86), TOBN(0x777708c5, 0xf0595067),
+     TOBN(0x18e2c8cd, 0x1297029e), TOBN(0x2c61095c, 0xbbf9305e),
+     TOBN(0xe466c258, 0x6b85d6d9), TOBN(0x8ac06c36, 0xda1ea530),
+     TOBN(0xa365dc39, 0xa1304668), TOBN(0xe4a9c885, 0x07f89606),
+     TOBN(0x65a4898f, 0xacc7228d), TOBN(0x3e2347ff, 0x84ca8303),
+     TOBN(0xa5f6fb77, 0xea7d23a3), TOBN(0x2fac257d, 0x672a71cd),
+     TOBN(0x6908bef8, 0x7e6a44d3), TOBN(0x8ff87566, 0x891d3d7a),
+     TOBN(0xe58e90b3, 0x6b0cf82e), TOBN(0x6438d246, 0x2615b5e7),
+     TOBN(0x07b1f8fc, 0x669c145a), TOBN(0xb0d8b2da, 0x36f1e1cb),
+     TOBN(0x54d5dadb, 0xd9184c4d), TOBN(0x3dbb18d5, 0xf93d9976),
+     TOBN(0x0a3e0f56, 0xd1147d47), TOBN(0x2afa8c8d, 0xa0a48609),
+     TOBN(0x275353e8, 0xbc36742c), TOBN(0x898f427e, 0xeea0ed90),
+     TOBN(0x26f4947e, 0x3e477b00), TOBN(0x8ad8848a, 0x308741e3),
+     TOBN(0x6c703c38, 0xd74a2a46), TOBN(0x5e3e05a9, 0x9ba17ba2),
+     TOBN(0xc1fa6f66, 0x4ab9a9e4), TOBN(0x474a2d9a, 0x3841d6ec),
+     TOBN(0x871239ad, 0x653ae326), TOBN(0x14bcf72a, 0xa74cbb43),
+     TOBN(0x8737650e, 0x20d4c083), TOBN(0x3df86536, 0x110ed4af),
+     TOBN(0xd2d86fe7, 0xb53ca555), TOBN(0x688cb00d, 0xabd5d538),
+     TOBN(0xcf81bda3, 0x1ad38468), TOBN(0x7ccfe3cc, 0xf01167b6),
+     TOBN(0xcf4f47e0, 0x6c4c1fe6), TOBN(0x557e1f1a, 0x298bbb79),
+     TOBN(0xf93b974f, 0x30d45a14), TOBN(0x174a1d2d, 0x0baf97c4),
+     TOBN(0x7a003b30, 0xc51fbf53), TOBN(0xd8940991, 0xee68b225),
+     TOBN(0x5b0aa7b7, 0x1c0f4173), TOBN(0x975797c9, 0xa20a7153),
+     TOBN(0x26e08c07, 0xe3533d77), TOBN(0xd7222e6a, 0x2e341c99),
+     TOBN(0x9d60ec3d, 0x8d2dc4ed), TOBN(0xbdfe0d8f, 0x7c476cf8),
+     TOBN(0x1fe59ab6, 0x1d056605), TOBN(0xa9ea9df6, 0x86a8551f),
+     TOBN(0x8489941e, 0x47fb8d8c), TOBN(0xfeb874eb, 0x4a7f1b10),
+     TOBN(0xfe5fea86, 0x7ee0d98f), TOBN(0x201ad34b, 0xdbf61864),
+     TOBN(0x45d8fe47, 0x37c031d4), TOBN(0xd5f49fae, 0x795f0822),
+     TOBN(0xdb0fb291, 0xc7f4a40c), TOBN(0x2e69d9c1, 0x730ddd92),
+     TOBN(0x754e1054, 0x49d76987), TOBN(0x8a24911d, 0x7662db87),
+     TOBN(0x61fc1810, 0x60a71676), TOBN(0xe852d1a8, 0xf66a8ad1),
+     TOBN(0x172bbd65, 0x6417231e), TOBN(0x0d6de7bd, 0x3babb11f),
+     TOBN(0x6fde6f88, 0xc8e347f8), TOBN(0x1c587547, 0x9bd99cc3),
+     TOBN(0x78e54ed0, 0x34076950), TOBN(0x97f0f334, 0x796e83ba),
+     TOBN(0xe4dbe1ce, 0x4924867a), TOBN(0xbd5f51b0, 0x60b84917),
+     TOBN(0x37530040, 0x3cb09a79), TOBN(0xdb3fe0f8, 0xff1743d8),
+     TOBN(0xed7894d8, 0x556fa9db), TOBN(0xfa262169, 0x23412fbf),
+     TOBN(0x563be0db, 0xba7b9291), TOBN(0x6ca8b8c0, 0x0c9fb234),
+     TOBN(0xed406aa9, 0xbd763802), TOBN(0xc21486a0, 0x65303da1),
+     TOBN(0x61ae291e, 0xc7e62ec4), TOBN(0x622a0492, 0xdf99333e),
+     TOBN(0x7fd80c9d, 0xbb7a8ee0), TOBN(0xdc2ed3bc, 0x6c01aedb),
+     TOBN(0x35c35a12, 0x08be74ec), TOBN(0xd540cb1a, 0x469f671f),
+     TOBN(0xd16ced4e, 0xcf84f6c7), TOBN(0x8561fb9c, 0x2d090f43),
+     TOBN(0x7e693d79, 0x6f239db4), TOBN(0xa736f928, 0x77bd0d94),
+     TOBN(0x07b4d929, 0x2c1950ee), TOBN(0xda177543, 0x56dc11b3),
+     TOBN(0xa5dfbbaa, 0x7a6a878e), TOBN(0x1c70cb29, 0x4decb08a),
+     TOBN(0xfba28c8b, 0x6f0f7c50), TOBN(0xa8eba2b8, 0x854dcc6d),
+     TOBN(0x5ff8e89a, 0x36b78642), TOBN(0x070c1c8e, 0xf6873adf),
+     TOBN(0xbbd3c371, 0x6484d2e4), TOBN(0xfb78318f, 0x0d414129),
+     TOBN(0x2621a39c, 0x6ad93b0b), TOBN(0x979d74c2, 0xa9e917f7),
+     TOBN(0xfc195647, 0x61fb0428), TOBN(0x4d78954a, 0xbee624d4),
+     TOBN(0xb94896e0, 0xb8ae86fd), TOBN(0x6667ac0c, 0xc91c8b13),
+     TOBN(0x9f180512, 0x43bcf832), TOBN(0xfbadf8b7, 0xa0010137),
+     TOBN(0xc69b4089, 0xb3ba8aa7), TOBN(0xfac4bacd, 0xe687ce85),
+     TOBN(0x9164088d, 0x977eab40), TOBN(0x51f4c5b6, 0x2760b390),
+     TOBN(0xd238238f, 0x340dd553), TOBN(0x358566c3, 0xdb1d31c9),
+     TOBN(0x3a5ad69e, 0x5068f5ff), TOBN(0xf31435fc, 0xdaff6b06),
+     TOBN(0xae549a5b, 0xd6debff0), TOBN(0x59e5f0b7, 0x75e01331),
+     TOBN(0x5d492fb8, 0x98559acf), TOBN(0x96018c2e, 0x4db79b50),
+     TOBN(0x55f4a48f, 0x609f66aa), TOBN(0x1943b3af, 0x4900a14f),
+     TOBN(0xc22496df, 0x15a40d39), TOBN(0xb2a44684, 0x4c20f7c5),
+     TOBN(0x76a35afa, 0x3b98404c), TOBN(0xbec75725, 0xff5d1b77),
+     TOBN(0xb67aa163, 0xbea06444), TOBN(0x27e95bb2, 0xf724b6f2),
+     TOBN(0x3c20e3e9, 0xd238c8ab), TOBN(0x1213754e, 0xddd6ae17),
+     TOBN(0x8c431020, 0x716e0f74), TOBN(0x6679c82e, 0xffc095c2),
+     TOBN(0x2eb3adf4, 0xd0ac2932), TOBN(0x2cc970d3, 0x01bb7a76),
+     TOBN(0x70c71f2f, 0x740f0e66), TOBN(0x545c616b, 0x2b6b23cc),
+     TOBN(0x4528cfcb, 0xb40a8bd7), TOBN(0xff839633, 0x2ab27722),
+     TOBN(0x049127d9, 0x025ac99a), TOBN(0xd314d4a0, 0x2b63e33b),
+     TOBN(0xc8c310e7, 0x28d84519), TOBN(0x0fcb8983, 0xb3bc84ba),
+     TOBN(0x2cc52261, 0x38634818), TOBN(0x501814f4, 0xb44c2e0b),
+     TOBN(0xf7e181aa, 0x54dfdba3), TOBN(0xcfd58ff0, 0xe759718c),
+     TOBN(0xf90cdb14, 0xd3b507a8), TOBN(0x57bd478e, 0xc50bdad8),
+     TOBN(0x29c197e2, 0x50e5f9aa), TOBN(0x4db6eef8, 0xe40bc855),
+     TOBN(0x2cc8f21a, 0xd1fc0654), TOBN(0xc71cc963, 0x81269d73),
+     TOBN(0xecfbb204, 0x077f49f9), TOBN(0xdde92571, 0xca56b793),
+     TOBN(0x9abed6a3, 0xf97ad8f7), TOBN(0xe6c19d3f, 0x924de3bd),
+     TOBN(0x8dce92f4, 0xa140a800), TOBN(0x85f44d1e, 0x1337af07),
+     TOBN(0x5953c08b, 0x09d64c52), TOBN(0xa1b5e49f, 0xf5df9749),
+     TOBN(0x336a8fb8, 0x52735f7d), TOBN(0xb332b6db, 0x9add676b),
+     TOBN(0x558b88a0, 0xb4511aa4), TOBN(0x09788752, 0xdbd5cc55),
+     TOBN(0x16b43b9c, 0xd8cd52bd), TOBN(0x7f0bc5a0, 0xc2a2696b),
+     TOBN(0x146e12d4, 0xc11f61ef), TOBN(0x9ce10754, 0x3a83e79e),
+     TOBN(0x08ec73d9, 0x6cbfca15), TOBN(0x09ff29ad, 0x5b49653f),
+     TOBN(0xe31b72bd, 0xe7da946e), TOBN(0xebf9eb3b, 0xee80a4f2),
+     TOBN(0xd1aabd08, 0x17598ce4), TOBN(0x18b5fef4, 0x53f37e80),
+     TOBN(0xd5d5cdd3, 0x5958cd79), TOBN(0x3580a1b5, 0x1d373114),
+     TOBN(0xa36e4c91, 0xfa935726), TOBN(0xa38c534d, 0xef20d760),
+     TOBN(0x7088e40a, 0x2ff5845b), TOBN(0xe5bb40bd, 0xbd78177f),
+     TOBN(0x4f06a7a8, 0x857f9920), TOBN(0xe3cc3e50, 0xe968f05d),
+     TOBN(0x1d68b7fe, 0xe5682d26), TOBN(0x5206f76f, 0xaec7f87c),
+     TOBN(0x41110530, 0x041951ab), TOBN(0x58ec52c1, 0xd4b5a71a),
+     TOBN(0xf3488f99, 0x0f75cf9a), TOBN(0xf411951f, 0xba82d0d5),
+     TOBN(0x27ee75be, 0x618895ab), TOBN(0xeae060d4, 0x6d8aab14),
+     TOBN(0x9ae1df73, 0x7fb54dc2), TOBN(0x1f3e391b, 0x25963649),
+     TOBN(0x242ec32a, 0xfe055081), TOBN(0x5bd450ef, 0x8491c9bd),
+     TOBN(0x367efc67, 0x981eb389), TOBN(0xed7e1928, 0x3a0550d5),
+     TOBN(0x362e776b, 0xab3ce75c), TOBN(0xe890e308, 0x1f24c523),
+     TOBN(0xb961b682, 0xfeccef76), TOBN(0x8b8e11f5, 0x8bba6d92),
+     TOBN(0x8f2ccc4c, 0x2b2375c4), TOBN(0x0d7f7a52, 0xe2f86cfa),
+     TOBN(0xfd94d30a, 0x9efe5633), TOBN(0x2d8d246b, 0x5451f934),
+     TOBN(0x2234c6e3, 0x244e6a00), TOBN(0xde2b5b0d, 0xddec8c50),
+     TOBN(0x2ce53c5a, 0xbf776f5b), TOBN(0x6f724071, 0x60357b05),
+     TOBN(0xb2593717, 0x71bf3f7a), TOBN(0x87d2501c, 0x440c4a9f),
+     TOBN(0x440552e1, 0x87b05340), TOBN(0xb7bf7cc8, 0x21624c32),
+     TOBN(0x4155a6ce, 0x22facddb), TOBN(0x5a4228cb, 0x889837ef),
+     TOBN(0xef87d6d6, 0xfd4fd671), TOBN(0xa233687e, 0xc2daa10e),
+     TOBN(0x75622244, 0x03c0eb96), TOBN(0x7632d184, 0x8bf19be6),
+     TOBN(0x05d0f8e9, 0x40735ff4), TOBN(0x3a3e6e13, 0xc00931f1),
+     TOBN(0x31ccde6a, 0xdafe3f18), TOBN(0xf381366a, 0xcfe51207),
+     TOBN(0x24c222a9, 0x60167d92), TOBN(0x62f9d6f8, 0x7529f18c),
+     TOBN(0x412397c0, 0x0353b114), TOBN(0x334d89dc, 0xef808043),
+     TOBN(0xd9ec63ba, 0x2a4383ce), TOBN(0xcec8e937, 0x5cf92ba0),
+     TOBN(0xfb8b4288, 0xc8be74c0), TOBN(0x67d6912f, 0x105d4391),
+     TOBN(0x7b996c46, 0x1b913149), TOBN(0x36aae2ef, 0x3a4e02da),
+     TOBN(0xb68aa003, 0x972de594), TOBN(0x284ec70d, 0x4ec6d545),
+     TOBN(0xf3d2b2d0, 0x61391d54), TOBN(0x69c5d5d6, 0xfe114e92),
+     TOBN(0xbe0f00b5, 0xb4482dff), TOBN(0xe1596fa5, 0xf5bf33c5),
+     TOBN(0x10595b56, 0x96a71cba), TOBN(0x944938b2, 0xfdcadeb7),
+     TOBN(0xa282da4c, 0xfccd8471), TOBN(0x98ec05f3, 0x0d37bfe1),
+     TOBN(0xe171ce1b, 0x0698304a), TOBN(0x2d691444, 0x21bdf79b),
+     TOBN(0xd0cd3b74, 0x1b21dec1), TOBN(0x712ecd8b, 0x16a15f71),
+     TOBN(0x8d4c00a7, 0x00fd56e1), TOBN(0x02ec9692, 0xf9527c18),
+     TOBN(0x21c44937, 0x4a3e42e1), TOBN(0x9176fbab, 0x1392ae0a),
+     TOBN(0x8726f1ba, 0x44b7b618), TOBN(0xb4d7aae9, 0xf1de491c),
+     TOBN(0xf91df7b9, 0x07b582c0), TOBN(0x7e116c30, 0xef60aa3a),
+     TOBN(0x99270f81, 0x466265d7), TOBN(0xb15b6fe2, 0x4df7adf0),
+     TOBN(0xfe33b2d3, 0xf9738f7f), TOBN(0x48553ab9, 0xd6d70f95),
+     TOBN(0x2cc72ac8, 0xc21e94db), TOBN(0x795ac38d, 0xbdc0bbee),
+     TOBN(0x0a1be449, 0x2e40478f), TOBN(0x81bd3394, 0x052bde55),
+     TOBN(0x63c8dbe9, 0x56b3c4f2), TOBN(0x017a99cf, 0x904177cc),
+     TOBN(0x947bbddb, 0x4d010fc1), TOBN(0xacf9b00b, 0xbb2c9b21),
+     TOBN(0x2970bc8d, 0x47173611), TOBN(0x1a4cbe08, 0xac7d756f),
+     TOBN(0x06d9f4aa, 0x67d541a2), TOBN(0xa3e8b689, 0x59c2cf44),
+     TOBN(0xaad066da, 0x4d88f1dd), TOBN(0xc604f165, 0x7ad35dea),
+     TOBN(0x7edc0720, 0x4478ca67), TOBN(0xa10dfae0, 0xba02ce06),
+     TOBN(0xeceb1c76, 0xaf36f4e4), TOBN(0x994b2292, 0xaf3f8f48),
+     TOBN(0xbf9ed77b, 0x77c8a68c), TOBN(0x74f544ea, 0x51744c9d),
+     TOBN(0x82d05bb9, 0x8113a757), TOBN(0x4ef2d2b4, 0x8a9885e4),
+     TOBN(0x1e332be5, 0x1aa7865f), TOBN(0x22b76b18, 0x290d1a52),
+     TOBN(0x308a2310, 0x44351683), TOBN(0x9d861896, 0xa3f22840),
+     TOBN(0x5959ddcd, 0x841ed947), TOBN(0x0def0c94, 0x154b73bf),
+     TOBN(0xf0105417, 0x4c7c15e0), TOBN(0x539bfb02, 0x3a277c32),
+     TOBN(0xe699268e, 0xf9dccf5f), TOBN(0x9f5796a5, 0x0247a3bd),
+     TOBN(0x8b839de8, 0x4f157269), TOBN(0xc825c1e5, 0x7a30196b),
+     TOBN(0x6ef0aabc, 0xdc8a5a91), TOBN(0xf4a8ce6c, 0x498b7fe6),
+     TOBN(0x1cce35a7, 0x70cbac78), TOBN(0x83488e9b, 0xf6b23958),
+     TOBN(0x0341a070, 0xd76cb011), TOBN(0xda6c9d06, 0xae1b2658),
+     TOBN(0xb701fb30, 0xdd648c52), TOBN(0x994ca02c, 0x52fb9fd1),
+     TOBN(0x06933117, 0x6f563086), TOBN(0x3d2b8100, 0x17856bab),
+     TOBN(0xe89f48c8, 0x5963a46e), TOBN(0x658ab875, 0xa99e61c7),
+     TOBN(0x6e296f87, 0x4b8517b4), TOBN(0x36c4fcdc, 0xfc1bc656),
+     TOBN(0xde5227a1, 0xa3906def), TOBN(0x9fe95f57, 0x62418945),
+     TOBN(0x20c91e81, 0xfdd96cde), TOBN(0x5adbe47e, 0xda4480de),
+     TOBN(0xa009370f, 0x396de2b6), TOBN(0x98583d4b, 0xf0ecc7bd),
+     TOBN(0xf44f6b57, 0xe51d0672), TOBN(0x03d6b078, 0x556b1984),
+     TOBN(0x27dbdd93, 0xb0b64912), TOBN(0x9b3a3434, 0x15687b09),
+     TOBN(0x0dba6461, 0x51ec20a9), TOBN(0xec93db7f, 0xff28187c),
+     TOBN(0x00ff8c24, 0x66e48bdd), TOBN(0x2514f2f9, 0x11ccd78e),
+     TOBN(0xeba11f4f, 0xe1250603), TOBN(0x8a22cd41, 0x243fa156),
+     TOBN(0xa4e58df4, 0xb283e4c6), TOBN(0x78c29859, 0x8b39783f),
+     TOBN(0x5235aee2, 0xa5259809), TOBN(0xc16284b5, 0x0e0227dd),
+     TOBN(0xa5f57916, 0x1338830d), TOBN(0x6d4b8a6b, 0xd2123fca),
+     TOBN(0x236ea68a, 0xf9c546f8), TOBN(0xc1d36873, 0xfa608d36),
+     TOBN(0xcd76e495, 0x8d436d13), TOBN(0xd4d9c221, 0x8fb080af),
+     TOBN(0x665c1728, 0xe8ad3fb5), TOBN(0xcf1ebe4d, 0xb3d572e0),
+     TOBN(0xa7a8746a, 0x584c5e20), TOBN(0x267e4ea1, 0xb9dc7035),
+     TOBN(0x593a15cf, 0xb9548c9b), TOBN(0x5e6e2135, 0x4bd012f3),
+     TOBN(0xdf31cc6a, 0x8c8f936e), TOBN(0x8af84d04, 0xb5c241dc),
+     TOBN(0x63990a6f, 0x345efb86), TOBN(0x6fef4e61, 0xb9b962cb)}
+    ,
+    {TOBN(0xf6368f09, 0x25722608), TOBN(0x131260db, 0x131cf5c6),
+     TOBN(0x40eb353b, 0xfab4f7ac), TOBN(0x85c78880, 0x37eee829),
+     TOBN(0x4c1581ff, 0xc3bdf24e), TOBN(0x5bff75cb, 0xf5c3c5a8),
+     TOBN(0x35e8c83f, 0xa14e6f40), TOBN(0xb81d1c0f, 0x0295e0ca),
+     TOBN(0xfcde7cc8, 0xf43a730f), TOBN(0xe89b6f3c, 0x33ab590e),
+     TOBN(0xc823f529, 0xad03240b), TOBN(0x82b79afe, 0x98bea5db),
+     TOBN(0x568f2856, 0x962fe5de), TOBN(0x0c590adb, 0x60c591f3),
+     TOBN(0x1fc74a14, 0x4a28a858), TOBN(0x3b662498, 0xb3203f4c),
+     TOBN(0x91e3cf0d, 0x6c39765a), TOBN(0xa2db3acd, 0xac3cca0b),
+     TOBN(0x288f2f08, 0xcb953b50), TOBN(0x2414582c, 0xcf43cf1a),
+     TOBN(0x8dec8bbc, 0x60eee9a8), TOBN(0x54c79f02, 0x729aa042),
+     TOBN(0xd81cd5ec, 0x6532f5d5), TOBN(0xa672303a, 0xcf82e15f),
+     TOBN(0x376aafa8, 0x719c0563), TOBN(0xcd8ad2dc, 0xbc5fc79f),
+     TOBN(0x303fdb9f, 0xcb750cd3), TOBN(0x14ff052f, 0x4418b08e),
+     TOBN(0xf75084cf, 0x3e2d6520), TOBN(0x7ebdf0f8, 0x144ed509),
+     TOBN(0xf43bf0f2, 0xd3f25b98), TOBN(0x86ad71cf, 0xa354d837),
+     TOBN(0xb827fe92, 0x26f43572), TOBN(0xdfd3ab5b, 0x5d824758),
+     TOBN(0x315dd23a, 0x539094c1), TOBN(0x85c0e37a, 0x66623d68),
+     TOBN(0x575c7972, 0x7be19ae0), TOBN(0x616a3396, 0xdf0d36b5),
+     TOBN(0xa1ebb3c8, 0x26b1ff7e), TOBN(0x635b9485, 0x140ad453),
+     TOBN(0x92bf3cda, 0xda430c0b), TOBN(0x4702850e, 0x3a96dac6),
+     TOBN(0xc91cf0a5, 0x15ac326a), TOBN(0x95de4f49, 0xab8c25e4),
+     TOBN(0xb01bad09, 0xe265c17c), TOBN(0x24e45464, 0x087b3881),
+     TOBN(0xd43e583c, 0xe1fac5ca), TOBN(0xe17cb318, 0x6ead97a6),
+     TOBN(0x6cc39243, 0x74dcec46), TOBN(0x33cfc02d, 0x54c2b73f),
+     TOBN(0x82917844, 0xf26cd99c), TOBN(0x8819dd95, 0xd1773f89),
+     TOBN(0x09572aa6, 0x0871f427), TOBN(0x8e0cf365, 0xf6f01c34),
+     TOBN(0x7fa52988, 0xbff1f5af), TOBN(0x4eb357ea, 0xe75e8e50),
+     TOBN(0xd9d0c8c4, 0x868af75d), TOBN(0xd7325cff, 0x45c8c7ea),
+     TOBN(0xab471996, 0xcc81ecb0), TOBN(0xff5d55f3, 0x611824ed),
+     TOBN(0xbe314541, 0x1977a0ee), TOBN(0x5085c4c5, 0x722038c6),
+     TOBN(0x2d5335bf, 0xf94bb495), TOBN(0x894ad8a6, 0xc8e2a082),
+     TOBN(0x5c3e2341, 0xada35438), TOBN(0xf4a9fc89, 0x049b8c4e),
+     TOBN(0xbeeb355a, 0x9f17cf34), TOBN(0x3f311e0e, 0x6c91fe10),
+     TOBN(0xc2d20038, 0x92ab9891), TOBN(0x257bdcc1, 0x3e8ce9a9),
+     TOBN(0x1b2d9789, 0x88c53bee), TOBN(0x927ce89a, 0xcdba143a),
+     TOBN(0xb0a32cca, 0x523db280), TOBN(0x5c889f8a, 0x50d43783),
+     TOBN(0x503e04b3, 0x4897d16f), TOBN(0x8cdb6e78, 0x08f5f2e8),
+     TOBN(0x6ab91cf0, 0x179c8e74), TOBN(0xd8874e52, 0x48211d60),
+     TOBN(0xf948d4d5, 0xea851200), TOBN(0x4076d41e, 0xe6f9840a),
+     TOBN(0xc20e263c, 0x47b517ea), TOBN(0x79a448fd, 0x30685e5e),
+     TOBN(0xe55f6f78, 0xf90631a0), TOBN(0x88a790b1, 0xa79e6346),
+     TOBN(0x62160c7d, 0x80969fe8), TOBN(0x54f92fd4, 0x41491bb9),
+     TOBN(0xa6645c23, 0x5c957526), TOBN(0xf44cc5ae, 0xbea3ce7b),
+     TOBN(0xf7628327, 0x8b1e68b7), TOBN(0xc731ad7a, 0x303f29d3),
+     TOBN(0xfe5a9ca9, 0x57d03ecb), TOBN(0x96c0d50c, 0x41bc97a7),
+     TOBN(0xc4669fe7, 0x9b4f7f24), TOBN(0xfdd781d8, 0x3d9967ef),
+     TOBN(0x7892c7c3, 0x5d2c208d), TOBN(0x8bf64f7c, 0xae545cb3),
+     TOBN(0xc01f862c, 0x467be912), TOBN(0xf4c85ee9, 0xc73d30cc),
+     TOBN(0x1fa6f4be, 0x6ab83ec7), TOBN(0xa07a3c1c, 0x4e3e3cf9),
+     TOBN(0x87f8ef45, 0x0c00beb3), TOBN(0x30e2c2b3, 0x000d4c3e),
+     TOBN(0x1aa00b94, 0xfe08bf5b), TOBN(0x32c133aa, 0x9224ef52),
+     TOBN(0x38df16bb, 0x32e5685d), TOBN(0x68a9e069, 0x58e6f544),
+     TOBN(0x495aaff7, 0xcdc5ebc6), TOBN(0xf894a645, 0x378b135f),
+     TOBN(0xf316350a, 0x09e27ecf), TOBN(0xeced201e, 0x58f7179d),
+     TOBN(0x2eec273c, 0xe97861ba), TOBN(0x47ec2cae, 0xd693be2e),
+     TOBN(0xfa4c97c4, 0xf68367ce), TOBN(0xe4f47d0b, 0xbe5a5755),
+     TOBN(0x17de815d, 0xb298a979), TOBN(0xd7eca659, 0xc177dc7d),
+     TOBN(0x20fdbb71, 0x49ded0a3), TOBN(0x4cb2aad4, 0xfb34d3c5),
+     TOBN(0x2cf31d28, 0x60858a33), TOBN(0x3b6873ef, 0xa24aa40f),
+     TOBN(0x540234b2, 0x2c11bb37), TOBN(0x2d0366dd, 0xed4c74a3),
+     TOBN(0xf9a968da, 0xeec5f25d), TOBN(0x36601068, 0x67b63142),
+     TOBN(0x07cd6d2c, 0x68d7b6d4), TOBN(0xa8f74f09, 0x0c842942),
+     TOBN(0xe2751404, 0x7768b1ee), TOBN(0x4b5f7e89, 0xfe62aee4),
+     TOBN(0xc6a77177, 0x89070d26), TOBN(0xa1f28e4e, 0xdd1c8bc7),
+     TOBN(0xea5f4f06, 0x469e1f17), TOBN(0x78fc242a, 0xfbdb78e0),
+     TOBN(0xc9c7c592, 0x8b0588f1), TOBN(0xb6b7a0fd, 0x1535921e),
+     TOBN(0xcc5bdb91, 0xbde5ae35), TOBN(0xb42c485e, 0x12ff1864),
+     TOBN(0xa1113e13, 0xdbab98aa), TOBN(0xde9d469b, 0xa17b1024),
+     TOBN(0x23f48b37, 0xc0462d3a), TOBN(0x3752e537, 0x7c5c078d),
+     TOBN(0xe3a86add, 0x15544eb9), TOBN(0xf013aea7, 0x80fba279),
+     TOBN(0x8b5bb76c, 0xf22001b5), TOBN(0xe617ba14, 0xf02891ab),
+     TOBN(0xd39182a6, 0x936219d3), TOBN(0x5ce1f194, 0xae51cb19),
+     TOBN(0xc78f8598, 0xbf07a74c), TOBN(0x6d7158f2, 0x22cbf1bc),
+     TOBN(0x3b846b21, 0xe300ce18), TOBN(0x35fba630, 0x2d11275d),
+     TOBN(0x5fe25c36, 0xa0239b9b), TOBN(0xd8beb35d, 0xdf05d940),
+     TOBN(0x4db02bb0, 0x1f7e320d), TOBN(0x0641c364, 0x6da320ea),
+     TOBN(0x6d95fa5d, 0x821389a3), TOBN(0x92699748, 0x8fcd8e3d),
+     TOBN(0x316fef17, 0xceb6c143), TOBN(0x67fcb841, 0xd933762b),
+     TOBN(0xbb837e35, 0x118b17f8), TOBN(0x4b92552f, 0x9fd24821),
+     TOBN(0xae6bc70e, 0x46aca793), TOBN(0x1cf0b0e4, 0xe579311b),
+     TOBN(0x8dc631be, 0x5802f716), TOBN(0x099bdc6f, 0xbddbee4d),
+     TOBN(0xcc352bb2, 0x0caf8b05), TOBN(0xf74d505a, 0x72d63df2),
+     TOBN(0xb9876d4b, 0x91c4f408), TOBN(0x1ce18473, 0x9e229b2d),
+     TOBN(0x49507597, 0x83abdb4a), TOBN(0x850fbcb6, 0xdee84b18),
+     TOBN(0x6325236e, 0x609e67dc), TOBN(0x04d831d9, 0x9336c6d8),
+     TOBN(0x8deaae3b, 0xfa12d45d), TOBN(0xe425f8ce, 0x4746e246),
+     TOBN(0x8004c175, 0x24f5f31e), TOBN(0xaca16d8f, 0xad62c3b7),
+     TOBN(0x0dc15a6a, 0x9152f934), TOBN(0xf1235e5d, 0xed0e12c1),
+     TOBN(0xc33c06ec, 0xda477dac), TOBN(0x76be8732, 0xb2ea0006),
+     TOBN(0xcf3f7831, 0x0c0cd313), TOBN(0x3c524553, 0xa614260d),
+     TOBN(0x31a756f8, 0xcab22d15), TOBN(0x03ee10d1, 0x77827a20),
+     TOBN(0xd1e059b2, 0x1994ef20), TOBN(0x2a653b69, 0x638ae318),
+     TOBN(0x70d5eb58, 0x2f699010), TOBN(0x279739f7, 0x09f5f84a),
+     TOBN(0x5da4663c, 0x8b799336), TOBN(0xfdfdf14d, 0x203c37eb),
+     TOBN(0x32d8a9dc, 0xa1dbfb2d), TOBN(0xab40cff0, 0x77d48f9b),
+     TOBN(0xc018b383, 0xd20b42d5), TOBN(0xf9a810ef, 0x9f78845f),
+     TOBN(0x40af3753, 0xbdba9df0), TOBN(0xb90bdcfc, 0x131dfdf9),
+     TOBN(0x18720591, 0xf01ab782), TOBN(0xc823f211, 0x6af12a88),
+     TOBN(0xa51b80f3, 0x0dc14401), TOBN(0xde248f77, 0xfb2dfbe3),
+     TOBN(0xef5a44e5, 0x0cafe751), TOBN(0x73997c9c, 0xd4dcd221),
+     TOBN(0x32fd86d1, 0xde854024), TOBN(0xd5b53adc, 0xa09b84bb),
+     TOBN(0x008d7a11, 0xdcedd8d1), TOBN(0x406bd1c8, 0x74b32c84),
+     TOBN(0x5d4472ff, 0x05dde8b1), TOBN(0x2e25f2cd, 0xfce2b32f),
+     TOBN(0xbec0dd5e, 0x29dfc254), TOBN(0x4455fcf6, 0x2b98b267),
+     TOBN(0x0b4d43a5, 0xc72df2ad), TOBN(0xea70e6be, 0x48a75397),
+     TOBN(0x2aad6169, 0x5820f3bf), TOBN(0xf410d2dd, 0x9e37f68f),
+     TOBN(0x70fb7dba, 0x7be5ac83), TOBN(0x636bb645, 0x36ec3eec),
+     TOBN(0x27104ea3, 0x9754e21c), TOBN(0xbc87a3e6, 0x8d63c373),
+     TOBN(0x483351d7, 0x4109db9a), TOBN(0x0fa724e3, 0x60134da7),
+     TOBN(0x9ff44c29, 0xb0720b16), TOBN(0x2dd0cf13, 0x06aceead),
+     TOBN(0x5942758c, 0xe26929a6), TOBN(0x96c5db92, 0xb766a92b),
+     TOBN(0xcec7d4c0, 0x5f18395e), TOBN(0xd3f22744, 0x1f80d032),
+     TOBN(0x7a68b37a, 0xcb86075b), TOBN(0x074764dd, 0xafef92db),
+     TOBN(0xded1e950, 0x7bc7f389), TOBN(0xc580c850, 0xb9756460),
+     TOBN(0xaeeec2a4, 0x7da48157), TOBN(0x3f0b4e7f, 0x82c587b3),
+     TOBN(0x231c6de8, 0xa9f19c53), TOBN(0x5717bd73, 0x6974e34e),
+     TOBN(0xd9e1d216, 0xf1508fa9), TOBN(0x9f112361, 0xdadaa124),
+     TOBN(0x80145e31, 0x823b7348), TOBN(0x4dd8f0d5, 0xac634069),
+     TOBN(0xe3d82fc7, 0x2297c258), TOBN(0x276fcfee, 0x9cee7431),
+     TOBN(0x8eb61b5e, 0x2bc0aea9), TOBN(0x4f668fd5, 0xde329431),
+     TOBN(0x03a32ab1, 0x38e4b87e), TOBN(0xe1374517, 0x73d0ef0b),
+     TOBN(0x1a46f7e6, 0x853ac983), TOBN(0xc3bdf42e, 0x68e78a57),
+     TOBN(0xacf20785, 0x2ea96dd1), TOBN(0xa10649b9, 0xf1638460),
+     TOBN(0xf2369f0b, 0x879fbbed), TOBN(0x0ff0ae86, 0xda9d1869),
+     TOBN(0x5251d759, 0x56766f45), TOBN(0x4984d8c0, 0x2be8d0fc),
+     TOBN(0x7ecc95a6, 0xd21008f0), TOBN(0x29bd54a0, 0x3a1a1c49),
+     TOBN(0xab9828c5, 0xd26c50f3), TOBN(0x32c0087c, 0x51d0d251),
+     TOBN(0x9bac3ce6, 0x0c1cdb26), TOBN(0xcd94d947, 0x557ca205),
+     TOBN(0x1b1bd598, 0x9db1fdcd), TOBN(0x0eda0108, 0xa3d8b149),
+     TOBN(0x95066610, 0x56152fcc), TOBN(0xc2f037e6, 0xe7192b33),
+     TOBN(0xdeffb41a, 0xc92e05a4), TOBN(0x1105f6c2, 0xc2f6c62e),
+     TOBN(0x68e73500, 0x8733913c), TOBN(0xcce86163, 0x3f3adc40),
+     TOBN(0xf407a942, 0x38a278e9), TOBN(0xd13c1b9d, 0x2ab21292),
+     TOBN(0x93ed7ec7, 0x1c74cf5c), TOBN(0x8887dc48, 0xf1a4c1b4),
+     TOBN(0x3830ff30, 0x4b3a11f1), TOBN(0x358c5a3c, 0x58937cb6),
+     TOBN(0x027dc404, 0x89022829), TOBN(0x40e93977, 0x3b798f79),
+     TOBN(0x90ad3337, 0x38be6ead), TOBN(0x9c23f6bc, 0xf34c0a5d),
+     TOBN(0xd1711a35, 0xfbffd8bb), TOBN(0x60fcfb49, 0x1949d3dd),
+     TOBN(0x09c8ef4b, 0x7825d93a), TOBN(0x24233cff, 0xa0a8c968),
+     TOBN(0x67ade46c, 0xe6d982af), TOBN(0xebb6bf3e, 0xe7544d7c),
+     TOBN(0xd6b9ba76, 0x3d8bd087), TOBN(0x46fe382d, 0x4dc61280),
+     TOBN(0xbd39a7e8, 0xb5bdbd75), TOBN(0xab381331, 0xb8f228fe),
+     TOBN(0x0709a77c, 0xce1c4300), TOBN(0x6a247e56, 0xf337ceac),
+     TOBN(0x8f34f21b, 0x636288be), TOBN(0x9dfdca74, 0xc8a7c305),
+     TOBN(0x6decfd1b, 0xea919e04), TOBN(0xcdf2688d, 0x8e1991f8),
+     TOBN(0xe607df44, 0xd0f8a67e), TOBN(0xd985df4b, 0x0b58d010),
+     TOBN(0x57f834c5, 0x0c24f8f4), TOBN(0xe976ef56, 0xa0bf01ae),
+     TOBN(0x536395ac, 0xa1c32373), TOBN(0x351027aa, 0x734c0a13),
+     TOBN(0xd2f1b5d6, 0x5e6bd5bc), TOBN(0x2b539e24, 0x223debed),
+     TOBN(0xd4994cec, 0x0eaa1d71), TOBN(0x2a83381d, 0x661dcf65),
+     TOBN(0x5f1aed2f, 0x7b54c740), TOBN(0x0bea3fa5, 0xd6dda5ee),
+     TOBN(0x9d4fb684, 0x36cc6134), TOBN(0x8eb9bbf3, 0xc0a443dd),
+     TOBN(0xfc500e2e, 0x383b7d2a), TOBN(0x7aad621c, 0x5b775257),
+     TOBN(0x69284d74, 0x0a8f7cc0), TOBN(0xe820c2ce, 0x07562d65),
+     TOBN(0xbf9531b9, 0x499758ee), TOBN(0x73e95ca5, 0x6ee0cc2d),
+     TOBN(0xf61790ab, 0xfbaf50a5), TOBN(0xdf55e76b, 0x684e0750),
+     TOBN(0xec516da7, 0xf176b005), TOBN(0x575553bb, 0x7a2dddc7),
+     TOBN(0x37c87ca3, 0x553afa73), TOBN(0x315f3ffc, 0x4d55c251),
+     TOBN(0xe846442a, 0xaf3e5d35), TOBN(0x61b91149, 0x6495ff28),
+     TOBN(0x23cc95d3, 0xfa326dc3), TOBN(0x1df4da1f, 0x18fc2cea),
+     TOBN(0x24bf9adc, 0xd0a37d59), TOBN(0xb6710053, 0x320d6e1e),
+     TOBN(0x96f9667e, 0x618344d1), TOBN(0xcc7ce042, 0xa06445af),
+     TOBN(0xa02d8514, 0xd68dbc3a), TOBN(0x4ea109e4, 0x280b5a5b),
+     TOBN(0x5741a7ac, 0xb40961bf), TOBN(0x4ada5937, 0x6aa56bfa),
+     TOBN(0x7feb9145, 0x02b765d1), TOBN(0x561e97be, 0xe6ad1582),
+     TOBN(0xbbc4a5b6, 0xda3982f5), TOBN(0x0c2659ed, 0xb546f468),
+     TOBN(0xb8e7e6aa, 0x59612d20), TOBN(0xd83dfe20, 0xac19e8e0),
+     TOBN(0x8530c45f, 0xb835398c), TOBN(0x6106a8bf, 0xb38a41c2),
+     TOBN(0x21e8f9a6, 0x35f5dcdb), TOBN(0x39707137, 0xcae498ed),
+     TOBN(0x70c23834, 0xd8249f00), TOBN(0x9f14b58f, 0xab2537a0),
+     TOBN(0xd043c365, 0x5f61c0c2), TOBN(0xdc5926d6, 0x09a194a7),
+     TOBN(0xddec0339, 0x8e77738a), TOBN(0xd07a63ef, 0xfba46426),
+     TOBN(0x2e58e79c, 0xee7f6e86), TOBN(0xe59b0459, 0xff32d241),
+     TOBN(0xc5ec84e5, 0x20fa0338), TOBN(0x97939ac8, 0xeaff5ace),
+     TOBN(0x0310a4e3, 0xb4a38313), TOBN(0x9115fba2, 0x8f9d9885),
+     TOBN(0x8dd710c2, 0x5fadf8c3), TOBN(0x66be38a2, 0xce19c0e2),
+     TOBN(0xd42a279c, 0x4cfe5022), TOBN(0x597bb530, 0x0e24e1b8),
+     TOBN(0x3cde86b7, 0xc153ca7f), TOBN(0xa8d30fb3, 0x707d63bd),
+     TOBN(0xac905f92, 0xbd60d21e), TOBN(0x98e7ffb6, 0x7b9a54ab),
+     TOBN(0xd7147df8, 0xe9726a30), TOBN(0xb5e216ff, 0xafce3533),
+     TOBN(0xb550b799, 0x2ff1ec40), TOBN(0x6b613b87, 0xa1e953fd),
+     TOBN(0x87b88dba, 0x792d5610), TOBN(0x2ee1270a, 0xa190fbe1),
+     TOBN(0x02f4e2dc, 0x2ef581da), TOBN(0x016530e4, 0xeff82a95),
+     TOBN(0xcbb93dfd, 0x8fd6ee89), TOBN(0x16d3d986, 0x46848fff),
+     TOBN(0x600eff24, 0x1da47adf), TOBN(0x1b9754a0, 0x0ad47a71),
+     TOBN(0x8f9266df, 0x70c33b98), TOBN(0xaadc87ae, 0xdf34186e),
+     TOBN(0x0d2ce8e1, 0x4ad24132), TOBN(0x8a47cbfc, 0x19946eba),
+     TOBN(0x47feeb66, 0x62b5f3af), TOBN(0xcefab561, 0x0abb3734),
+     TOBN(0x449de60e, 0x19f35cb1), TOBN(0x39f8db14, 0x157f0eb9),
+     TOBN(0xffaecc5b, 0x3c61bfd6), TOBN(0xa5a4d41d, 0x41216703),
+     TOBN(0x7f8fabed, 0x224e1cc2), TOBN(0x0d5a8186, 0x871ad953),
+     TOBN(0xf10774f7, 0xd22da9a9), TOBN(0x45b8a678, 0xcc8a9b0d),
+     TOBN(0xd9c2e722, 0xbdc32cff), TOBN(0xbf71b5f5, 0x337202a5),
+     TOBN(0x95c57f2f, 0x69fc4db9), TOBN(0xb6dad34c, 0x765d01e1),
+     TOBN(0x7e0bd13f, 0xcb904635), TOBN(0x61751253, 0x763a588c),
+     TOBN(0xd85c2997, 0x81af2c2d), TOBN(0xc0f7d9c4, 0x81b9d7da),
+     TOBN(0x838a34ae, 0x08533e8d), TOBN(0x15c4cb08, 0x311d8311),
+     TOBN(0x97f83285, 0x8e121e14), TOBN(0xeea7dc1e, 0x85000a5f),
+     TOBN(0x0c6059b6, 0x5d256274), TOBN(0xec9beace, 0xb95075c0),
+     TOBN(0x173daad7, 0x1df97828), TOBN(0xbf851cb5, 0xa8937877),
+     TOBN(0xb083c594, 0x01646f3c), TOBN(0x3bad30cf, 0x50c6d352),
+     TOBN(0xfeb2b202, 0x496bbcea), TOBN(0x3cf9fd4f, 0x18a1e8ba),
+     TOBN(0xd26de7ff, 0x1c066029), TOBN(0x39c81e9e, 0x4e9ed4f8),
+     TOBN(0xd8be0cb9, 0x7b390d35), TOBN(0x01df2bbd, 0x964aab27),
+     TOBN(0x3e8c1a65, 0xc3ef64f8), TOBN(0x567291d1, 0x716ed1dd),
+     TOBN(0x95499c6c, 0x5f5406d3), TOBN(0x71fdda39, 0x5ba8e23f),
+     TOBN(0xcfeb320e, 0xd5096ece), TOBN(0xbe7ba92b, 0xca66dd16),
+     TOBN(0x4608d36b, 0xc6fb5a7d), TOBN(0xe3eea15a, 0x6d2dd0e0),
+     TOBN(0x75b0a3eb, 0x8f97a36a), TOBN(0xf59814cc, 0x1c83de1e),
+     TOBN(0x56c9c5b0, 0x1c33c23f), TOBN(0xa96c1da4, 0x6faa4136),
+     TOBN(0x46bf2074, 0xde316551), TOBN(0x3b866e7b, 0x1f756c8f),
+     TOBN(0x727727d8, 0x1495ed6b), TOBN(0xb2394243, 0xb682dce7),
+     TOBN(0x8ab8454e, 0x758610f3), TOBN(0xc243ce84, 0x857d72a4),
+     TOBN(0x7b320d71, 0xdbbf370f), TOBN(0xff9afa37, 0x78e0f7ca),
+     TOBN(0x0119d1e0, 0xea7b523f), TOBN(0xb997f8cb, 0x058c7d42),
+     TOBN(0x285bcd2a, 0x37bbb184), TOBN(0x51dcec49, 0xa45d1fa6),
+     TOBN(0x6ade3b64, 0xe29634cb), TOBN(0x080c94a7, 0x26b86ef1),
+     TOBN(0xba583db1, 0x2283fbe3), TOBN(0x902bddc8, 0x5a9315ed),
+     TOBN(0x07c1ccb3, 0x86964bec), TOBN(0x78f4eacf, 0xb6258301),
+     TOBN(0x4bdf3a49, 0x56f90823), TOBN(0xba0f5080, 0x741d777b),
+     TOBN(0x091d71c3, 0xf38bf760), TOBN(0x9633d50f, 0x9b625b02),
+     TOBN(0x03ecb743, 0xb8c9de61), TOBN(0xb4751254, 0x5de74720),
+     TOBN(0x9f9defc9, 0x74ce1cb2), TOBN(0x774a4f6a, 0x00bd32ef),
+     TOBN(0xaca385f7, 0x73848f22), TOBN(0x53dad716, 0xf3f8558e),
+     TOBN(0xab7b34b0, 0x93c471f9), TOBN(0xf530e069, 0x19644bc7),
+     TOBN(0x3d9fb1ff, 0xdd59d31a), TOBN(0x4382e0df, 0x08daa795),
+     TOBN(0x165c6f4b, 0xd5cc88d7), TOBN(0xeaa392d5, 0x4a18c900),
+     TOBN(0x94203c67, 0x648024ee), TOBN(0x188763f2, 0x8c2fabcd),
+     TOBN(0xa80f87ac, 0xbbaec835), TOBN(0x632c96e0, 0xf29d8d54),
+     TOBN(0x29b0a60e, 0x4c00a95e), TOBN(0x2ef17f40, 0xe011e9fa),
+     TOBN(0xf6c0e1d1, 0x15b77223), TOBN(0xaaec2c62, 0x14b04e32),
+     TOBN(0xd35688d8, 0x3d84e58c), TOBN(0x2af5094c, 0x958571db),
+     TOBN(0x4fff7e19, 0x760682a6), TOBN(0x4cb27077, 0xe39a407c),
+     TOBN(0x0f59c547, 0x4ff0e321), TOBN(0x169f34a6, 0x1b34c8ff),
+     TOBN(0x2bff1096, 0x52bc1ba7), TOBN(0xa25423b7, 0x83583544),
+     TOBN(0x5d55d5d5, 0x0ac8b782), TOBN(0xff6622ec, 0x2db3c892),
+     TOBN(0x48fce741, 0x6b8bb642), TOBN(0x31d6998c, 0x69d7e3dc),
+     TOBN(0xdbaf8004, 0xcadcaed0), TOBN(0x801b0142, 0xd81d053c),
+     TOBN(0x94b189fc, 0x59630ec6), TOBN(0x120e9934, 0xaf762c8e),
+     TOBN(0x53a29aa4, 0xfdc6a404), TOBN(0x19d8e01e, 0xa1909948),
+     TOBN(0x3cfcabf1, 0xd7e89681), TOBN(0x3321a50d, 0x4e132d37),
+     TOBN(0xd0496863, 0xe9a86111), TOBN(0x8c0cde61, 0x06a3bc65),
+     TOBN(0xaf866c49, 0xfc9f8eef), TOBN(0x2066350e, 0xff7f5141),
+     TOBN(0x4f8a4689, 0xe56ddfbd), TOBN(0xea1b0c07, 0xfe32983a),
+     TOBN(0x2b317462, 0x873cb8cb), TOBN(0x658deddc, 0x2d93229f),
+     TOBN(0x65efaf4d, 0x0f64ef58), TOBN(0xfe43287d, 0x730cc7a8),
+     TOBN(0xaebc0c72, 0x3d047d70), TOBN(0x92efa539, 0xd92d26c9),
+     TOBN(0x06e78457, 0x94b56526), TOBN(0x415cb80f, 0x0961002d),
+     TOBN(0x89e5c565, 0x76dcb10f), TOBN(0x8bbb6982, 0xff9259fe),
+     TOBN(0x4fe8795b, 0x9abc2668), TOBN(0xb5d4f534, 0x1e678fb1),
+     TOBN(0x6601f3be, 0x7b7da2b9), TOBN(0x98da59e2, 0xa13d6805),
+     TOBN(0x190d8ea6, 0x01799a52), TOBN(0xa20cec41, 0xb86d2952),
+     TOBN(0x3062ffb2, 0x7fff2a7c), TOBN(0x741b32e5, 0x79f19d37),
+     TOBN(0xf80d8181, 0x4eb57d47), TOBN(0x7a2d0ed4, 0x16aef06b),
+     TOBN(0x09735fb0, 0x1cecb588), TOBN(0x1641caaa, 0xc6061f5b)}
+    ,
+    {TOBN(0x7f99824f, 0x20151427), TOBN(0x206828b6, 0x92430206),
+     TOBN(0xaa9097d7, 0xe1112357), TOBN(0xacf9a2f2, 0x09e414ec),
+     TOBN(0xdbdac9da, 0x27915356), TOBN(0x7e0734b7, 0x001efee3),
+     TOBN(0x54fab5bb, 0xd2b288e2), TOBN(0x4c630fc4, 0xf62dd09c),
+     TOBN(0x8537107a, 0x1ac2703b), TOBN(0xb49258d8, 0x6bc857b5),
+     TOBN(0x57df14de, 0xbcdaccd1), TOBN(0x24ab68d7, 0xc4ae8529),
+     TOBN(0x7ed8b5d4, 0x734e59d0), TOBN(0x5f8740c8, 0xc495cc80),
+     TOBN(0x84aedd5a, 0x291db9b3), TOBN(0x80b360f8, 0x4fb995be),
+     TOBN(0xae915f5d, 0x5fa067d1), TOBN(0x4134b57f, 0x9668960c),
+     TOBN(0xbd3656d6, 0xa48edaac), TOBN(0xdac1e3e4, 0xfc1d7436),
+     TOBN(0x674ff869, 0xd81fbb26), TOBN(0x449ed3ec, 0xb26c33d4),
+     TOBN(0x85138705, 0xd94203e8), TOBN(0xccde538b, 0xbeeb6f4a),
+     TOBN(0x55d5c68d, 0xa61a76fa), TOBN(0x598b441d, 0xca1554dc),
+     TOBN(0xd39923b9, 0x773b279c), TOBN(0x33331d3c, 0x36bf9efc),
+     TOBN(0x2d4c848e, 0x298de399), TOBN(0xcfdb8e77, 0xa1a27f56),
+     TOBN(0x94c855ea, 0x57b8ab70), TOBN(0xdcdb9dae, 0x6f7879ba),
+     TOBN(0x7bdff8c2, 0x019f2a59), TOBN(0xb3ce5bb3, 0xcb4fbc74),
+     TOBN(0xea907f68, 0x8a9173dd), TOBN(0x6cd3d0d3, 0x95a75439),
+     TOBN(0x92ecc4d6, 0xefed021c), TOBN(0x09a9f9b0, 0x6a77339a),
+     TOBN(0x87ca6b15, 0x7188c64a), TOBN(0x10c29968, 0x44899158),
+     TOBN(0x5859a229, 0xed6e82ef), TOBN(0x16f338e3, 0x65ebaf4e),
+     TOBN(0x0cd31387, 0x5ead67ae), TOBN(0x1c73d228, 0x54ef0bb4),
+     TOBN(0x4cb55131, 0x74a5c8c7), TOBN(0x01cd2970, 0x7f69ad6a),
+     TOBN(0xa04d00dd, 0xe966f87e), TOBN(0xd96fe447, 0x0b7b0321),
+     TOBN(0x342ac06e, 0x88fbd381), TOBN(0x02cd4a84, 0x5c35a493),
+     TOBN(0xe8fa89de, 0x54f1bbcd), TOBN(0x341d6367, 0x2575ed4c),
+     TOBN(0xebe357fb, 0xd238202b), TOBN(0x600b4d1a, 0xa984ead9),
+     TOBN(0xc35c9f44, 0x52436ea0), TOBN(0x96fe0a39, 0xa370751b),
+     TOBN(0x4c4f0736, 0x7f636a38), TOBN(0x9f943fb7, 0x0e76d5cb),
+     TOBN(0xb03510ba, 0xa8b68b8b), TOBN(0xc246780a, 0x9ed07a1f),
+     TOBN(0x3c051415, 0x6d549fc2), TOBN(0xc2953f31, 0x607781ca),
+     TOBN(0x955e2c69, 0xd8d95413), TOBN(0xb300fadc, 0x7bd282e3),
+     TOBN(0x81fe7b50, 0x87e9189f), TOBN(0xdb17375c, 0xf42dda27),
+     TOBN(0x22f7d896, 0xcf0a5904), TOBN(0xa0e57c5a, 0xebe348e6),
+     TOBN(0xa61011d3, 0xf40e3c80), TOBN(0xb1189321, 0x8db705c5),
+     TOBN(0x4ed9309e, 0x50fedec3), TOBN(0xdcf14a10, 0x4d6d5c1d),
+     TOBN(0x056c265b, 0x55691342), TOBN(0xe8e08504, 0x91049dc7),
+     TOBN(0x131329f5, 0xc9bae20a), TOBN(0x96c8b3e8, 0xd9dccdb4),
+     TOBN(0x8c5ff838, 0xfb4ee6b4), TOBN(0xfc5a9aeb, 0x41e8ccf0),
+     TOBN(0x7417b764, 0xfae050c6), TOBN(0x0953c3d7, 0x00452080),
+     TOBN(0x21372682, 0x38dfe7e8), TOBN(0xea417e15, 0x2bb79d4b),
+     TOBN(0x59641f1c, 0x76e7cf2d), TOBN(0x271e3059, 0xea0bcfcc),
+     TOBN(0x624c7dfd, 0x7253ecbd), TOBN(0x2f552e25, 0x4fca6186),
+     TOBN(0xcbf84ecd, 0x4d866e9c), TOBN(0x73967709, 0xf68d4610),
+     TOBN(0xa14b1163, 0xc27901b4), TOBN(0xfd9236e0, 0x899b8bf3),
+     TOBN(0x42b091ec, 0xcbc6da0a), TOBN(0xbb1dac6f, 0x5ad1d297),
+     TOBN(0x80e61d53, 0xa91cf76e), TOBN(0x4110a412, 0xd31f1ee7),
+     TOBN(0x2d87c3ba, 0x13efcf77), TOBN(0x1f374bb4, 0xdf450d76),
+     TOBN(0x5e78e2f2, 0x0d188dab), TOBN(0xe3968ed0, 0xf4b885ef),
+     TOBN(0x46c0568e, 0x7314570f), TOBN(0x31616338, 0x01170521),
+     TOBN(0x18e1e7e2, 0x4f0c8afe), TOBN(0x4caa75ff, 0xdeea78da),
+     TOBN(0x82db67f2, 0x7c5d8a51), TOBN(0x36a44d86, 0x6f505370),
+     TOBN(0xd72c5bda, 0x0333974f), TOBN(0x5db516ae, 0x27a70146),
+     TOBN(0x34705281, 0x210ef921), TOBN(0xbff17a8f, 0x0c9c38e5),
+     TOBN(0x78f4814e, 0x12476da1), TOBN(0xc1e16613, 0x33c16980),
+     TOBN(0x9e5b386f, 0x424d4bca), TOBN(0x4c274e87, 0xc85740de),
+     TOBN(0xb6a9b88d, 0x6c2f5226), TOBN(0x14d1b944, 0x550d7ca8),
+     TOBN(0x580c85fc, 0x1fc41709), TOBN(0xc1da368b, 0x54c6d519),
+     TOBN(0x2b0785ce, 0xd5113cf7), TOBN(0x0670f633, 0x5a34708f),
+     TOBN(0x46e23767, 0x15cc3f88), TOBN(0x1b480cfa, 0x50c72c8f),
+     TOBN(0x20288602, 0x4147519a), TOBN(0xd0981eac, 0x26b372f0),
+     TOBN(0xa9d4a7ca, 0xa785ebc8), TOBN(0xd953c50d, 0xdbdf58e9),
+     TOBN(0x9d6361cc, 0xfd590f8f), TOBN(0x72e9626b, 0x44e6c917),
+     TOBN(0x7fd96110, 0x22eb64cf), TOBN(0x863ebb7e, 0x9eb288f3),
+     TOBN(0x6e6ab761, 0x6aca8ee7), TOBN(0x97d10b39, 0xd7b40358),
+     TOBN(0x1687d377, 0x1e5feb0d), TOBN(0xc83e50e4, 0x8265a27a),
+     TOBN(0x8f75a9fe, 0xc954b313), TOBN(0xcc2e8f47, 0x310d1f61),
+     TOBN(0xf5ba81c5, 0x6557d0e0), TOBN(0x25f9680c, 0x3eaf6207),
+     TOBN(0xf95c6609, 0x4354080b), TOBN(0x5225bfa5, 0x7bf2fe1c),
+     TOBN(0xc5c004e2, 0x5c7d98fa), TOBN(0x3561bf1c, 0x019aaf60),
+     TOBN(0x5e6f9f17, 0xba151474), TOBN(0xdec2f934, 0xb04f6eca),
+     TOBN(0x64e368a1, 0x269acb1e), TOBN(0x1332d9e4, 0x0cdda493),
+     TOBN(0x60d6cf69, 0xdf23de05), TOBN(0x66d17da2, 0x009339a0),
+     TOBN(0x9fcac985, 0x0a693923), TOBN(0xbcf057fc, 0xed7c6a6d),
+     TOBN(0xc3c5c8c5, 0xf0b5662c), TOBN(0x25318dd8, 0xdcba4f24),
+     TOBN(0x60e8cb75, 0x082b69ff), TOBN(0x7c23b3ee, 0x1e728c01),
+     TOBN(0x15e10a0a, 0x097e4403), TOBN(0xcb3d0a86, 0x19854665),
+     TOBN(0x88d8e211, 0xd67d4826), TOBN(0xb39af66e, 0x0b9d2839),
+     TOBN(0xa5f94588, 0xbd475ca8), TOBN(0xe06b7966, 0xc077b80b),
+     TOBN(0xfedb1485, 0xda27c26c), TOBN(0xd290d33a, 0xfe0fd5e0),
+     TOBN(0xa40bcc47, 0xf34fb0fa), TOBN(0xb4760cc8, 0x1fb1ab09),
+     TOBN(0x8fca0993, 0xa273bfe3), TOBN(0x13e4fe07, 0xf70b213c),
+     TOBN(0x3bcdb992, 0xfdb05163), TOBN(0x8c484b11, 0x0c2b19b6),
+     TOBN(0x1acb815f, 0xaaf2e3e2), TOBN(0xc6905935, 0xb89ff1b4),
+     TOBN(0xb2ad6f9d, 0x586e74e1), TOBN(0x488883ad, 0x67b80484),
+     TOBN(0x758aa2c7, 0x369c3ddb), TOBN(0x8ab74e69, 0x9f9afd31),
+     TOBN(0x10fc2d28, 0x5e21beb1), TOBN(0x3484518a, 0x318c42f9),
+     TOBN(0x377427dc, 0x53cf40c3), TOBN(0x9de0781a, 0x391bc1d9),
+     TOBN(0x8faee858, 0x693807e1), TOBN(0xa3865327, 0x4e81ccc7),
+     TOBN(0x02c30ff2, 0x6f835b84), TOBN(0xb604437b, 0x0d3d38d4),
+     TOBN(0xb3fc8a98, 0x5ca1823d), TOBN(0xb82f7ec9, 0x03be0324),
+     TOBN(0xee36d761, 0xcf684a33), TOBN(0x5a01df0e, 0x9f29bf7d),
+     TOBN(0x686202f3, 0x1306583d), TOBN(0x05b10da0, 0x437c622e),
+     TOBN(0xbf9aaa0f, 0x076a7bc8), TOBN(0x25e94efb, 0x8f8f4e43),
+     TOBN(0x8a35c9b7, 0xfa3dc26d), TOBN(0xe0e5fb93, 0x96ff03c5),
+     TOBN(0xa77e3843, 0xebc394ce), TOBN(0xcede6595, 0x8361de60),
+     TOBN(0xd27c22f6, 0xa1993545), TOBN(0xab01cc36, 0x24d671ba),
+     TOBN(0x63fa2877, 0xa169c28e), TOBN(0x925ef904, 0x2eb08376),
+     TOBN(0x3b2fa3cf, 0x53aa0b32), TOBN(0xb27beb5b, 0x71c49d7a),
+     TOBN(0xb60e1834, 0xd105e27f), TOBN(0xd6089788, 0x4f68570d),
+     TOBN(0x23094ce0, 0xd6fbc2ac), TOBN(0x738037a1, 0x815ff551),
+     TOBN(0xda73b1bb, 0x6bef119c), TOBN(0xdcf6c430, 0xeef506ba),
+     TOBN(0x00e4fe7b, 0xe3ef104a), TOBN(0xebdd9a2c, 0x0a065628),
+     TOBN(0x853a81c3, 0x8792043e), TOBN(0x22ad6ece, 0xb3b59108),
+     TOBN(0x9fb813c0, 0x39cd297d), TOBN(0x8ec7e16e, 0x05bda5d9),
+     TOBN(0x2834797c, 0x0d104b96), TOBN(0xcc11a2e7, 0x7c511510),
+     TOBN(0x96ca5a53, 0x96ee6380), TOBN(0x054c8655, 0xcea38742),
+     TOBN(0xb5946852, 0xd54dfa7d), TOBN(0x97c422e7, 0x1f4ab207),
+     TOBN(0xbf907509, 0x0c22b540), TOBN(0x2cde42aa, 0xb7c267d4),
+     TOBN(0xba18f9ed, 0x5ab0d693), TOBN(0x3ba62aa6, 0x6e4660d9),
+     TOBN(0xb24bf97b, 0xab9ea96a), TOBN(0x5d039642, 0xe3b60e32),
+     TOBN(0x4e6a4506, 0x7c4d9bd5), TOBN(0x666c5b9e, 0x7ed4a6a4),
+     TOBN(0xfa3fdcd9, 0x8edbd7cc), TOBN(0x4660bb87, 0xc6ccd753),
+     TOBN(0x9ae90820, 0x21e6b64f), TOBN(0x8a56a713, 0xb36bfb3f),
+     TOBN(0xabfce096, 0x5726d47f), TOBN(0x9eed01b2, 0x0b1a9a7f),
+     TOBN(0x30e9cad4, 0x4eb74a37), TOBN(0x7b2524cc, 0x53e9666d),
+     TOBN(0x6a29683b, 0x8f4b002f), TOBN(0xc2200d7a, 0x41f4fc20),
+     TOBN(0xcf3af47a, 0x3a338acc), TOBN(0x6539a4fb, 0xe7128975),
+     TOBN(0xcec31c14, 0xc33c7fcf), TOBN(0x7eb6799b, 0xc7be322b),
+     TOBN(0x119ef4e9, 0x6646f623), TOBN(0x7b7a26a5, 0x54d7299b),
+     TOBN(0xcb37f08d, 0x403f46f2), TOBN(0x94b8fc43, 0x1a0ec0c7),
+     TOBN(0xbb8514e3, 0xc332142f), TOBN(0xf3ed2c33, 0xe80d2a7a),
+     TOBN(0x8d2080af, 0xb639126c), TOBN(0xf7b6be60, 0xe3553ade),
+     TOBN(0x3950aa9f, 0x1c7e2b09), TOBN(0x847ff958, 0x6410f02b),
+     TOBN(0x877b7cf5, 0x678a31b0), TOBN(0xd50301ae, 0x3998b620),
+     TOBN(0x734257c5, 0xc00fb396), TOBN(0xf9fb18a0, 0x04e672a6),
+     TOBN(0xff8bd8eb, 0xe8758851), TOBN(0x1e64e4c6, 0x5d99ba44),
+     TOBN(0x4b8eaedf, 0x7dfd93b7), TOBN(0xba2f2a98, 0x04e76b8c),
+     TOBN(0x7d790cba, 0xe8053433), TOBN(0xc8e725a0, 0x3d2c9585),
+     TOBN(0x58c5c476, 0xcdd8f5ed), TOBN(0xd106b952, 0xefa9fe1d),
+     TOBN(0x3c5c775b, 0x0eff13a9), TOBN(0x242442ba, 0xe057b930),
+     TOBN(0xe9f458d4, 0xc9b70cbd), TOBN(0x69b71448, 0xa3cdb89a),
+     TOBN(0x41ee46f6, 0x0e2ed742), TOBN(0x573f1045, 0x40067493),
+     TOBN(0xb1e154ff, 0x9d54c304), TOBN(0x2ad0436a, 0x8d3a7502),
+     TOBN(0xee4aaa2d, 0x431a8121), TOBN(0xcd38b3ab, 0x886f11ed),
+     TOBN(0x57d49ea6, 0x034a0eb7), TOBN(0xd2b773bd, 0xf7e85e58),
+     TOBN(0x4a559ac4, 0x9b5c1f14), TOBN(0xc444be1a, 0x3e54df2b),
+     TOBN(0x13aad704, 0xeda41891), TOBN(0xcd927bec, 0x5eb5c788),
+     TOBN(0xeb3c8516, 0xe48c8a34), TOBN(0x1b7ac812, 0x4b546669),
+     TOBN(0x1815f896, 0x594df8ec), TOBN(0x87c6a79c, 0x79227865),
+     TOBN(0xae02a2f0, 0x9b56ddbd), TOBN(0x1339b5ac, 0x8a2f1cf3),
+     TOBN(0xf2b569c7, 0x839dff0d), TOBN(0xb0b9e864, 0xfee9a43d),
+     TOBN(0x4ff8ca41, 0x77bb064e), TOBN(0x145a2812, 0xfd249f63),
+     TOBN(0x3ab7beac, 0xf86f689a), TOBN(0x9bafec27, 0x01d35f5e),
+     TOBN(0x28054c65, 0x4265aa91), TOBN(0xa4b18304, 0x035efe42),
+     TOBN(0x6887b0e6, 0x9639dec7), TOBN(0xf4b8f6ad, 0x3d52aea5),
+     TOBN(0xfb9293cc, 0x971a8a13), TOBN(0x3f159e5d, 0x4c934d07),
+     TOBN(0x2c50e9b1, 0x09acbc29), TOBN(0x08eb65e6, 0x7154d129),
+     TOBN(0x4feff589, 0x30b75c3e), TOBN(0x0bb82fe2, 0x94491c93),
+     TOBN(0xd8ac377a, 0x89af62bb), TOBN(0xd7b51490, 0x9685e49f),
+     TOBN(0xabca9a7b, 0x04497f19), TOBN(0x1b35ed0a, 0x1a7ad13f),
+     TOBN(0x6b601e21, 0x3ec86ed6), TOBN(0xda91fcb9, 0xce0c76f1),
+     TOBN(0x9e28507b, 0xd7ab27e1), TOBN(0x7c19a555, 0x63945b7b),
+     TOBN(0x6b43f0a1, 0xaafc9827), TOBN(0x443b4fbd, 0x3aa55b91),
+     TOBN(0x962b2e65, 0x6962c88f), TOBN(0x139da8d4, 0xce0db0ca),
+     TOBN(0xb93f05dd, 0x1b8d6c4f), TOBN(0x779cdff7, 0x180b9824),
+     TOBN(0xbba23fdd, 0xae57c7b7), TOBN(0x345342f2, 0x1b932522),
+     TOBN(0xfd9c80fe, 0x556d4aa3), TOBN(0xa03907ba, 0x6525bb61),
+     TOBN(0x38b010e1, 0xff218933), TOBN(0xc066b654, 0xaa52117b),
+     TOBN(0x8e141920, 0x94f2e6ea), TOBN(0x66a27dca, 0x0d32f2b2),
+     TOBN(0x69c7f993, 0x048b3717), TOBN(0xbf5a989a, 0xb178ae1c),
+     TOBN(0x49fa9058, 0x564f1d6b), TOBN(0x27ec6e15, 0xd31fde4e),
+     TOBN(0x4cce0373, 0x7276e7fc), TOBN(0x64086d79, 0x89d6bf02),
+     TOBN(0x5a72f046, 0x4ccdd979), TOBN(0x909c3566, 0x47775631),
+     TOBN(0x1c07bc6b, 0x75dd7125), TOBN(0xb4c6bc97, 0x87a0428d),
+     TOBN(0x507ece52, 0xfdeb6b9d), TOBN(0xfca56512, 0xb2c95432),
+     TOBN(0x15d97181, 0xd0e8bd06), TOBN(0x384dd317, 0xc6bb46ea),
+     TOBN(0x5441ea20, 0x3952b624), TOBN(0xbcf70dee, 0x4e7dc2fb),
+     TOBN(0x372b016e, 0x6628e8c3), TOBN(0x07a0d667, 0xb60a7522),
+     TOBN(0xcf05751b, 0x0a344ee2), TOBN(0x0ec09a48, 0x118bdeec),
+     TOBN(0x6e4b3d4e, 0xd83dce46), TOBN(0x43a6316d, 0x99d2fc6e),
+     TOBN(0xa99d8989, 0x56cf044c), TOBN(0x7c7f4454, 0xae3e5fb7),
+     TOBN(0xb2e6b121, 0xfbabbe92), TOBN(0x281850fb, 0xe1330076),
+     TOBN(0x093581ec, 0x97890015), TOBN(0x69b1dded, 0x75ff77f5),
+     TOBN(0x7cf0b18f, 0xab105105), TOBN(0x953ced31, 0xa89ccfef),
+     TOBN(0x3151f85f, 0xeb914009), TOBN(0x3c9f1b87, 0x88ed48ad),
+     TOBN(0xc9aba1a1, 0x4a7eadcb), TOBN(0x928e7501, 0x522e71cf),
+     TOBN(0xeaede727, 0x3a2e4f83), TOBN(0x467e10d1, 0x1ce3bbd3),
+     TOBN(0xf3442ac3, 0xb955dcf0), TOBN(0xba96307d, 0xd3d5e527),
+     TOBN(0xf763a10e, 0xfd77f474), TOBN(0x5d744bd0, 0x6a6e1ff0),
+     TOBN(0xd287282a, 0xa777899e), TOBN(0xe20eda8f, 0xd03f3cde),
+     TOBN(0x6a7e75bb, 0x50b07d31), TOBN(0x0b7e2a94, 0x6f379de4),
+     TOBN(0x31cb64ad, 0x19f593cf), TOBN(0x7b1a9e4f, 0x1e76ef1d),
+     TOBN(0xe18c9c9d, 0xb62d609c), TOBN(0x439bad6d, 0xe779a650),
+     TOBN(0x219d9066, 0xe032f144), TOBN(0x1db632b8, 0xe8b2ec6a),
+     TOBN(0xff0d0fd4, 0xfda12f78), TOBN(0x56fb4c2d, 0x2a25d265),
+     TOBN(0x5f4e2ee1, 0x255a03f1), TOBN(0x61cd6af2, 0xe96af176),
+     TOBN(0xe0317ba8, 0xd068bc97), TOBN(0x927d6bab, 0x264b988e),
+     TOBN(0xa18f07e0, 0xe90fb21e), TOBN(0x00fd2b80, 0xbba7fca1),
+     TOBN(0x20387f27, 0x95cd67b5), TOBN(0x5b89a4e7, 0xd39707f7),
+     TOBN(0x8f83ad3f, 0x894407ce), TOBN(0xa0025b94, 0x6c226132),
+     TOBN(0xc79563c7, 0xf906c13b), TOBN(0x5f548f31, 0x4e7bb025),
+     TOBN(0x2b4c6b8f, 0xeac6d113), TOBN(0xa67e3f9c, 0x0e813c76),
+     TOBN(0x3982717c, 0x3fe1f4b9), TOBN(0x58865819, 0x26d8050e),
+     TOBN(0x99f3640c, 0xf7f06f20), TOBN(0xdc610216, 0x2a66ebc2),
+     TOBN(0x52f2c175, 0x767a1e08), TOBN(0x05660e1a, 0x5999871b),
+     TOBN(0x6b0f1762, 0x6d3c4693), TOBN(0xf0e7d627, 0x37ed7bea),
+     TOBN(0xc51758c7, 0xb75b226d), TOBN(0x40a88628, 0x1f91613b),
+     TOBN(0x889dbaa7, 0xbbb38ce0), TOBN(0xe0404b65, 0xbddcad81),
+     TOBN(0xfebccd3a, 0x8bc9671f), TOBN(0xfbf9a357, 0xee1f5375),
+     TOBN(0x5dc169b0, 0x28f33398), TOBN(0xb07ec11d, 0x72e90f65),
+     TOBN(0xae7f3b4a, 0xfaab1eb1), TOBN(0xd970195e, 0x5f17538a),
+     TOBN(0x52b05cbe, 0x0181e640), TOBN(0xf5debd62, 0x2643313d),
+     TOBN(0x76148154, 0x5df31f82), TOBN(0x23e03b33, 0x3a9e13c5),
+     TOBN(0xff758949, 0x4fde0c1f), TOBN(0xbf8a1abe, 0xe5b6ec20),
+     TOBN(0x702278fb, 0x87e1db6c), TOBN(0xc447ad7a, 0x35ed658f),
+     TOBN(0x48d4aa38, 0x03d0ccf2), TOBN(0x80acb338, 0x819a7c03),
+     TOBN(0x9bc7c89e, 0x6e17cecc), TOBN(0x46736b8b, 0x03be1d82),
+     TOBN(0xd65d7b60, 0xc0432f96), TOBN(0xddebe7a3, 0xdeb5442f),
+     TOBN(0x79a25307, 0x7dff69a2), TOBN(0x37a56d94, 0x02cf3122),
+     TOBN(0x8bab8aed, 0xf2350d0a), TOBN(0x13c3f276, 0x037b0d9a),
+     TOBN(0xc664957c, 0x44c65cae), TOBN(0x88b44089, 0xc2e71a88),
+     TOBN(0xdb88e5a3, 0x5cb02664), TOBN(0x5d4c0bf1, 0x8686c72e),
+     TOBN(0xea3d9b62, 0xa682d53e), TOBN(0x9b605ef4, 0x0b2ad431),
+     TOBN(0x71bac202, 0xc69645d0), TOBN(0xa115f03a, 0x6a1b66e7),
+     TOBN(0xfe2c563a, 0x158f4dc4), TOBN(0xf715b3a0, 0x4d12a78c),
+     TOBN(0x8f7f0a48, 0xd413213a), TOBN(0x2035806d, 0xc04becdb),
+     TOBN(0xecd34a99, 0x5d8587f5), TOBN(0x4d8c3079, 0x9f6d3a71),
+     TOBN(0x1b2a2a67, 0x8d95a8f6), TOBN(0xc58c9d7d, 0xf2110d0d),
+     TOBN(0xdeee81d5, 0xcf8fba3f), TOBN(0xa42be3c0, 0x0c7cdf68),
+     TOBN(0x2126f742, 0xd43b5eaa), TOBN(0x054a0766, 0xdfa59b85),
+     TOBN(0x9d0d5e36, 0x126bfd45), TOBN(0xa1f8fbd7, 0x384f8a8f),
+     TOBN(0x317680f5, 0xd563fccc), TOBN(0x48ca5055, 0xf280a928),
+     TOBN(0xe00b81b2, 0x27b578cf), TOBN(0x10aad918, 0x2994a514),
+     TOBN(0xd9e07b62, 0xb7bdc953), TOBN(0x9f0f6ff2, 0x5bc086dd),
+     TOBN(0x09d1ccff, 0x655eee77), TOBN(0x45475f79, 0x5bef7df1),
+     TOBN(0x3faa28fa, 0x86f702cc), TOBN(0x92e60905, 0x0f021f07),
+     TOBN(0xe9e62968, 0x7f8fa8c6), TOBN(0xbd71419a, 0xf036ea2c),
+     TOBN(0x171ee1cc, 0x6028da9a), TOBN(0x5352fe1a, 0xc251f573),
+     TOBN(0xf8ff236e, 0x3fa997f4), TOBN(0xd831b6c9, 0xa5749d5f),
+     TOBN(0x7c872e1d, 0xe350e2c2), TOBN(0xc56240d9, 0x1e0ce403),
+     TOBN(0xf9deb077, 0x6974f5cb), TOBN(0x7d50ba87, 0x961c3728),
+     TOBN(0xd6f89426, 0x5a3a2518), TOBN(0xcf817799, 0xc6303d43),
+     TOBN(0x510a0471, 0x619e5696), TOBN(0xab049ff6, 0x3a5e307b),
+     TOBN(0xe4cdf9b0, 0xfeb13ec7), TOBN(0xd5e97117, 0x9d8ff90c),
+     TOBN(0xf6f64d06, 0x9afa96af), TOBN(0x00d0bf5e, 0x9d2012a2),
+     TOBN(0xe63f301f, 0x358bcdc0), TOBN(0x07689e99, 0x0a9d47f8),
+     TOBN(0x1f689e2f, 0x4f43d43a), TOBN(0x4d542a16, 0x90920904),
+     TOBN(0xaea293d5, 0x9ca0a707), TOBN(0xd061fe45, 0x8ac68065),
+     TOBN(0x1033bf1b, 0x0090008c), TOBN(0x29749558, 0xc08a6db6),
+     TOBN(0x74b5fc59, 0xc1d5d034), TOBN(0xf712e9f6, 0x67e215e0),
+     TOBN(0xfd520cbd, 0x860200e6), TOBN(0x0229acb4, 0x3ea22588),
+     TOBN(0x9cd1e14c, 0xfff0c82e), TOBN(0x87684b62, 0x59c69e73),
+     TOBN(0xda85e61c, 0x96ccb989), TOBN(0x2d5dbb02, 0xa3d06493),
+     TOBN(0xf22ad33a, 0xe86b173c), TOBN(0xe8e41ea5, 0xa79ff0e3),
+     TOBN(0x01d2d725, 0xdd0d0c10), TOBN(0x31f39088, 0x032d28f9),
+     TOBN(0x7b3f71e1, 0x7829839e), TOBN(0x0cf691b4, 0x4502ae58),
+     TOBN(0xef658dbd, 0xbefc6115), TOBN(0xa5cd6ee5, 0xb3ab5314),
+     TOBN(0x206c8d7b, 0x5f1d2347), TOBN(0x794645ba, 0x4cc2253a),
+     TOBN(0xd517d8ff, 0x58389e08), TOBN(0x4fa20dee, 0x9f847288),
+     TOBN(0xeba072d8, 0xd797770a), TOBN(0x7360c91d, 0xbf429e26),
+     TOBN(0x7200a3b3, 0x80af8279), TOBN(0x6a1c9150, 0x82dadce3),
+     TOBN(0x0ee6d3a7, 0xc35d8794), TOBN(0x042e6558, 0x0356bae5),
+     TOBN(0x9f59698d, 0x643322fd), TOBN(0x9379ae15, 0x50a61967),
+     TOBN(0x64b9ae62, 0xfcc9981e), TOBN(0xaed3d631, 0x6d2934c6),
+     TOBN(0x2454b302, 0x5e4e65eb), TOBN(0xab09f647, 0xf9950428)}
+    ,
+    {TOBN(0xb2083a12, 0x22248acc), TOBN(0x1f6ec0ef, 0x3264e366),
+     TOBN(0x5659b704, 0x5afdee28), TOBN(0x7a823a40, 0xe6430bb5),
+     TOBN(0x24592a04, 0xe1900a79), TOBN(0xcde09d4a, 0xc9ee6576),
+     TOBN(0x52b6463f, 0x4b5ea54a), TOBN(0x1efe9ed3, 0xd3ca65a7),
+     TOBN(0xe27a6dbe, 0x305406dd), TOBN(0x8eb7dc7f, 0xdd5d1957),
+     TOBN(0xf54a6876, 0x387d4d8f), TOBN(0x9c479409, 0xc7762de4),
+     TOBN(0xbe4d5b5d, 0x99b30778), TOBN(0x25380c56, 0x6e793682),
+     TOBN(0x602d37f3, 0xdac740e3), TOBN(0x140deabe, 0x1566e4ae),
+     TOBN(0x4481d067, 0xafd32acf), TOBN(0xd8f0fcca, 0xe1f71ccf),
+     TOBN(0xd208dd0c, 0xb596f2da), TOBN(0xd049d730, 0x9aad93f9),
+     TOBN(0xc79f263d, 0x42ab580e), TOBN(0x09411bb1, 0x23f707b4),
+     TOBN(0x8cfde1ff, 0x835e0eda), TOBN(0x72707490, 0x90f03402),
+     TOBN(0xeaee6126, 0xc49a861e), TOBN(0x024f3b65, 0xe14f0d06),
+     TOBN(0x51a3f1e8, 0xc69bfc17), TOBN(0xc3c3a8e9, 0xa7686381),
+     TOBN(0x3400752c, 0xb103d4c8), TOBN(0x02bc4613, 0x9218b36b),
+     TOBN(0xc67f75eb, 0x7651504a), TOBN(0xd6848b56, 0xd02aebfa),
+     TOBN(0xbd9802e6, 0xc30fa92b), TOBN(0x5a70d96d, 0x9a552784),
+     TOBN(0x9085c4ea, 0x3f83169b), TOBN(0xfa9423bb, 0x06908228),
+     TOBN(0x2ffebe12, 0xfe97a5b9), TOBN(0x85da6049, 0x71b99118),
+     TOBN(0x9cbc2f7f, 0x63178846), TOBN(0xfd96bc70, 0x9153218e),
+     TOBN(0x958381db, 0x1782269b), TOBN(0xae34bf79, 0x2597e550),
+     TOBN(0xbb5c6064, 0x5f385153), TOBN(0x6f0e96af, 0xe3088048),
+     TOBN(0xbf6a0215, 0x77884456), TOBN(0xb3b5688c, 0x69310ea7),
+     TOBN(0x17c94295, 0x04fad2de), TOBN(0xe020f0e5, 0x17896d4d),
+     TOBN(0x730ba0ab, 0x0976505f), TOBN(0x567f6813, 0x095e2ec5),
+     TOBN(0x47062010, 0x6331ab71), TOBN(0x72cfa977, 0x41d22b9f),
+     TOBN(0x33e55ead, 0x8a2373da), TOBN(0xa8d0d5f4, 0x7ba45a68),
+     TOBN(0xba1d8f9c, 0x03029d15), TOBN(0x8f34f1cc, 0xfc55b9f3),
+     TOBN(0xcca4428d, 0xbbe5a1a9), TOBN(0x8187fd5f, 0x3126bd67),
+     TOBN(0x0036973a, 0x48105826), TOBN(0xa39b6663, 0xb8bd61a0),
+     TOBN(0x6d42deef, 0x2d65a808), TOBN(0x4969044f, 0x94636b19),
+     TOBN(0xf611ee47, 0xdd5d564c), TOBN(0x7b2f3a49, 0xd2873077),
+     TOBN(0x94157d45, 0x300eb294), TOBN(0x2b2a656e, 0x169c1494),
+     TOBN(0xc000dd76, 0xd3a47aa9), TOBN(0xa2864e4f, 0xa6243ea4),
+     TOBN(0x82716c47, 0xdb89842e), TOBN(0x12dfd7d7, 0x61479fb7),
+     TOBN(0x3b9a2c56, 0xe0b2f6dc), TOBN(0x46be862a, 0xd7f85d67),
+     TOBN(0x03b0d8dd, 0x0f82b214), TOBN(0x460c34f9, 0xf103cbc6),
+     TOBN(0xf32e5c03, 0x18d79e19), TOBN(0x8b8888ba, 0xa84117f8),
+     TOBN(0x8f3c37dc, 0xc0722677), TOBN(0x10d21be9, 0x1c1c0f27),
+     TOBN(0xd47c8468, 0xe0f7a0c6), TOBN(0x9bf02213, 0xadecc0e0),
+     TOBN(0x0baa7d12, 0x42b48b99), TOBN(0x1bcb665d, 0x48424096),
+     TOBN(0x8b847cd6, 0xebfb5cfb), TOBN(0x87c2ae56, 0x9ad4d10d),
+     TOBN(0xf1cbb122, 0x0de36726), TOBN(0xe7043c68, 0x3fdfbd21),
+     TOBN(0x4bd0826a, 0x4e79d460), TOBN(0x11f5e598, 0x4bd1a2cb),
+     TOBN(0x97554160, 0xb7fe7b6e), TOBN(0x7d16189a, 0x400a3fb2),
+     TOBN(0xd73e9bea, 0xe328ca1e), TOBN(0x0dd04b97, 0xe793d8cc),
+     TOBN(0xa9c83c9b, 0x506db8cc), TOBN(0x5cd47aae, 0xcf38814c),
+     TOBN(0x26fc430d, 0xb64b45e6), TOBN(0x079b5499, 0xd818ea84),
+     TOBN(0xebb01102, 0xc1c24a3b), TOBN(0xca24e568, 0x1c161c1a),
+     TOBN(0x103eea69, 0x36f00a4a), TOBN(0x9ad76ee8, 0x76176c7b),
+     TOBN(0x97451fc2, 0x538e0ff7), TOBN(0x94f89809, 0x6604b3b0),
+     TOBN(0x6311436e, 0x3249cfd7), TOBN(0x27b4a7bd, 0x41224f69),
+     TOBN(0x03b5d21a, 0xe0ac2941), TOBN(0x279b0254, 0xc2d31937),
+     TOBN(0x3307c052, 0xcac992d0), TOBN(0x6aa7cb92, 0xefa8b1f3),
+     TOBN(0x5a182580, 0x0d37c7a5), TOBN(0x13380c37, 0x342d5422),
+     TOBN(0x92ac2d66, 0xd5d2ef92), TOBN(0x035a70c9, 0x030c63c6),
+     TOBN(0xc16025dd, 0x4ce4f152), TOBN(0x1f419a71, 0xf9df7c06),
+     TOBN(0x6d5b2214, 0x91e4bb14), TOBN(0xfc43c6cc, 0x839fb4ce),
+     TOBN(0x49f06591, 0x925d6b2d), TOBN(0x4b37d9d3, 0x62186598),
+     TOBN(0x8c54a971, 0xd01b1629), TOBN(0xe1a9c29f, 0x51d50e05),
+     TOBN(0x5109b785, 0x71ba1861), TOBN(0x48b22d5c, 0xd0c8f93d),
+     TOBN(0xe8fa84a7, 0x8633bb93), TOBN(0x53fba6ba, 0x5aebbd08),
+     TOBN(0x7ff27df3, 0xe5eea7d8), TOBN(0x521c8796, 0x68ca7158),
+     TOBN(0xb9d5133b, 0xce6f1a05), TOBN(0x2d50cd53, 0xfd0ebee4),
+     TOBN(0xc82115d6, 0xc5a3ef16), TOBN(0x993eff9d, 0xba079221),
+     TOBN(0xe4da2c5e, 0x4b5da81c), TOBN(0x9a89dbdb, 0x8033fd85),
+     TOBN(0x60819ebf, 0x2b892891), TOBN(0x53902b21, 0x5d14a4d5),
+     TOBN(0x6ac35051, 0xd7fda421), TOBN(0xcc6ab885, 0x61c83284),
+     TOBN(0x14eba133, 0xf74cff17), TOBN(0x240aaa03, 0xecb813f2),
+     TOBN(0xcfbb6540, 0x6f665bee), TOBN(0x084b1fe4, 0xa425ad73),
+     TOBN(0x009d5d16, 0xd081f6a6), TOBN(0x35304fe8, 0xeef82c90),
+     TOBN(0xf20346d5, 0xaa9eaa22), TOBN(0x0ada9f07, 0xac1c91e3),
+     TOBN(0xa6e21678, 0x968a6144), TOBN(0x54c1f77c, 0x07b31a1e),
+     TOBN(0xd6bb787e, 0x5781fbe1), TOBN(0x61bd2ee0, 0xe31f1c4a),
+     TOBN(0xf25aa1e9, 0x781105fc), TOBN(0x9cf2971f, 0x7b2f8e80),
+     TOBN(0x26d15412, 0xcdff919b), TOBN(0x01db4ebe, 0x34bc896e),
+     TOBN(0x7d9b3e23, 0xb40df1cf), TOBN(0x59337373, 0x94e971b4),
+     TOBN(0xbf57bd14, 0x669cf921), TOBN(0x865daedf, 0x0c1a1064),
+     TOBN(0x3eb70bd3, 0x83279125), TOBN(0xbc3d5b9f, 0x34ecdaab),
+     TOBN(0x91e3ed7e, 0x5f755caf), TOBN(0x49699f54, 0xd41e6f02),
+     TOBN(0x185770e1, 0xd4a7a15b), TOBN(0x08f3587a, 0xeaac87e7),
+     TOBN(0x352018db, 0x473133ea), TOBN(0x674ce719, 0x04fd30fc),
+     TOBN(0x7b8d9835, 0x088b3e0e), TOBN(0x7a0356a9, 0x5d0d47a1),
+     TOBN(0x9d9e7659, 0x6474a3c4), TOBN(0x61ea48a7, 0xff66966c),
+     TOBN(0x30417758, 0x0f3e4834), TOBN(0xfdbb21c2, 0x17a9afcb),
+     TOBN(0x756fa17f, 0x2f9a67b3), TOBN(0x2a6b2421, 0xa245c1a8),
+     TOBN(0x64be2794, 0x4af02291), TOBN(0xade465c6, 0x2a5804fe),
+     TOBN(0x8dffbd39, 0xa6f08fd7), TOBN(0xc4efa84c, 0xaa14403b),
+     TOBN(0xa1b91b2a, 0x442b0f5c), TOBN(0xb748e317, 0xcf997736),
+     TOBN(0x8d1b62bf, 0xcee90e16), TOBN(0x907ae271, 0x0b2078c0),
+     TOBN(0xdf31534b, 0x0c9bcddd), TOBN(0x043fb054, 0x39adce83),
+     TOBN(0x99031043, 0xd826846a), TOBN(0x61a9c0d6, 0xb144f393),
+     TOBN(0xdab48046, 0x47718427), TOBN(0xdf17ff9b, 0x6e830f8b),
+     TOBN(0x408d7ee8, 0xe49a1347), TOBN(0x6ac71e23, 0x91c1d4ae),
+     TOBN(0xc8cbb9fd, 0x1defd73c), TOBN(0x19840657, 0xbbbbfec5),
+     TOBN(0x39db1cb5, 0x9e7ef8ea), TOBN(0x78aa8296, 0x64105f30),
+     TOBN(0xa3d9b7f0, 0xa3738c29), TOBN(0x0a2f235a, 0xbc3250a3),
+     TOBN(0x55e506f6, 0x445e4caf), TOBN(0x0974f73d, 0x33475f7a),
+     TOBN(0xd37dbba3, 0x5ba2f5a8), TOBN(0x542c6e63, 0x6af40066),
+     TOBN(0x26d99b53, 0xc5d73e2c), TOBN(0x06060d7d, 0x6c3ca33e),
+     TOBN(0xcdbef1c2, 0x065fef4a), TOBN(0x77e60f7d, 0xfd5b92e3),
+     TOBN(0xd7c549f0, 0x26708350), TOBN(0x201b3ad0, 0x34f121bf),
+     TOBN(0x5fcac2a1, 0x0334fc14), TOBN(0x8a9a9e09, 0x344552f6),
+     TOBN(0x7dd8a1d3, 0x97653082), TOBN(0x5fc0738f, 0x79d4f289),
+     TOBN(0x787d244d, 0x17d2d8c3), TOBN(0xeffc6345, 0x70830684),
+     TOBN(0x5ddb96dd, 0xe4f73ae5), TOBN(0x8efb14b1, 0x172549a5),
+     TOBN(0x6eb73eee, 0x2245ae7a), TOBN(0xbca4061e, 0xea11f13e),
+     TOBN(0xb577421d, 0x30b01f5d), TOBN(0xaa688b24, 0x782e152c),
+     TOBN(0x67608e71, 0xbd3502ba), TOBN(0x4ef41f24, 0xb4de75a0),
+     TOBN(0xb08dde5e, 0xfd6125e5), TOBN(0xde484825, 0xa409543f),
+     TOBN(0x1f198d98, 0x65cc2295), TOBN(0x428a3771, 0x6e0edfa2),
+     TOBN(0x4f9697a2, 0xadf35fc7), TOBN(0x01a43c79, 0xf7cac3c7),
+     TOBN(0xb05d7059, 0x0fd3659a), TOBN(0x8927f30c, 0xbb7f2d9a),
+     TOBN(0x4023d1ac, 0x8cf984d3), TOBN(0x32125ed3, 0x02897a45),
+     TOBN(0xfb572dad, 0x3d414205), TOBN(0x73000ef2, 0xe3fa82a9),
+     TOBN(0x4c0868e9, 0xf10a5581), TOBN(0x5b61fc67, 0x6b0b3ca5),
+     TOBN(0xc1258d5b, 0x7cae440c), TOBN(0x21c08b41, 0x402b7531),
+     TOBN(0xf61a8955, 0xde932321), TOBN(0x3568faf8, 0x2d1408af),
+     TOBN(0x71b15e99, 0x9ecf965b), TOBN(0xf14ed248, 0xe917276f),
+     TOBN(0xc6f4caa1, 0x820cf9e2), TOBN(0x681b20b2, 0x18d83c7e),
+     TOBN(0x6cde738d, 0xc6c01120), TOBN(0x71db0813, 0xae70e0db),
+     TOBN(0x95fc0644, 0x74afe18c), TOBN(0x34619053, 0x129e2be7),
+     TOBN(0x80615cea, 0xdb2a3b15), TOBN(0x0a49a19e, 0xdb4c7073),
+     TOBN(0x0e1b84c8, 0x8fd2d367), TOBN(0xd74bf462, 0x033fb8aa),
+     TOBN(0x889f6d65, 0x533ef217), TOBN(0x7158c7e4, 0xc3ca2e87),
+     TOBN(0xfb670dfb, 0xdc2b4167), TOBN(0x75910a01, 0x844c257f),
+     TOBN(0xf336bf07, 0xcf88577d), TOBN(0x22245250, 0xe45e2ace),
+     TOBN(0x2ed92e8d, 0x7ca23d85), TOBN(0x29f8be4c, 0x2b812f58),
+     TOBN(0xdd9ebaa7, 0x076fe12b), TOBN(0x3f2400cb, 0xae1537f9),
+     TOBN(0x1aa93528, 0x17bdfb46), TOBN(0xc0f98430, 0x67883b41),
+     TOBN(0x5590ede1, 0x0170911d), TOBN(0x7562f5bb, 0x34d4b17f),
+     TOBN(0xe1fa1df2, 0x1826b8d2), TOBN(0xb40b796a, 0x6bd80d59),
+     TOBN(0xd65bf197, 0x3467ba92), TOBN(0x8c9b46db, 0xf70954b0),
+     TOBN(0x97c8a0f3, 0x0e78f15d), TOBN(0xa8f3a69a, 0x85a4c961),
+     TOBN(0x4242660f, 0x61e4ce9b), TOBN(0xbf06aab3, 0x6ea6790c),
+     TOBN(0xc6706f8e, 0xec986416), TOBN(0x9e56dec1, 0x9a9fc225),
+     TOBN(0x527c46f4, 0x9a9898d9), TOBN(0xd799e77b, 0x5633cdef),
+     TOBN(0x24eacc16, 0x7d9e4297), TOBN(0xabb61cea, 0x6b1cb734),
+     TOBN(0xbee2e8a7, 0xf778443c), TOBN(0x3bb42bf1, 0x29de2fe6),
+     TOBN(0xcbed86a1, 0x3003bb6f), TOBN(0xd3918e6c, 0xd781cdf6),
+     TOBN(0x4bee3271, 0x9a5103f1), TOBN(0x5243efc6, 0xf50eac06),
+     TOBN(0xb8e122cb, 0x6adcc119), TOBN(0x1b7faa84, 0xc0b80a08),
+     TOBN(0x32c3d1bd, 0x6dfcd08c), TOBN(0x129dec4e, 0x0be427de),
+     TOBN(0x98ab679c, 0x1d263c83), TOBN(0xafc83cb7, 0xcef64eff),
+     TOBN(0x85eb6088, 0x2fa6be76), TOBN(0x892585fb, 0x1328cbfe),
+     TOBN(0xc154d3ed, 0xcf618dda), TOBN(0xc44f601b, 0x3abaf26e),
+     TOBN(0x7bf57d0b, 0x2be1fdfd), TOBN(0xa833bd2d, 0x21137fee),
+     TOBN(0x9353af36, 0x2db591a8), TOBN(0xc76f26dc, 0x5562a056),
+     TOBN(0x1d87e47d, 0x3fdf5a51), TOBN(0x7afb5f93, 0x55c9cab0),
+     TOBN(0x91bbf58f, 0x89e0586e), TOBN(0x7c72c018, 0x0d843709),
+     TOBN(0xa9a5aafb, 0x99b5c3dc), TOBN(0xa48a0f1d, 0x3844aeb0),
+     TOBN(0x7178b7dd, 0xb667e482), TOBN(0x453985e9, 0x6e23a59a),
+     TOBN(0x4a54c860, 0x01b25dd8), TOBN(0x0dd37f48, 0xfb897c8a),
+     TOBN(0x5f8aa610, 0x0ea90cd9), TOBN(0xc8892c68, 0x16d5830d),
+     TOBN(0xeb4befc0, 0xef514ca5), TOBN(0x478eb679, 0xe72c9ee6),
+     TOBN(0x9bca20da, 0xdbc40d5f), TOBN(0xf015de21, 0xdde4f64a),
+     TOBN(0xaa6a4de0, 0xeaf4b8a5), TOBN(0x68cfd9ca, 0x4bc60e32),
+     TOBN(0x668a4b01, 0x7fd15e70), TOBN(0xd9f0694a, 0xf27dc09d),
+     TOBN(0xf6c3cad5, 0xba708bcd), TOBN(0x5cd2ba69, 0x5bb95c2a),
+     TOBN(0xaa28c1d3, 0x33c0a58f), TOBN(0x23e274e3, 0xabc77870),
+     TOBN(0x44c3692d, 0xdfd20a4a), TOBN(0x091c5fd3, 0x81a66653),
+     TOBN(0x6c0bb691, 0x09a0757d), TOBN(0x9072e8b9, 0x667343ea),
+     TOBN(0x31d40eb0, 0x80848bec), TOBN(0x95bd480a, 0x79fd36cc),
+     TOBN(0x01a77c61, 0x65ed43f5), TOBN(0xafccd127, 0x2e0d40bf),
+     TOBN(0xeccfc82d, 0x1cc1884b), TOBN(0xc85ac201, 0x5d4753b4),
+     TOBN(0xc7a6caac, 0x658e099f), TOBN(0xcf46369e, 0x04b27390),
+     TOBN(0xe2e7d049, 0x506467ea), TOBN(0x481b63a2, 0x37cdeccc),
+     TOBN(0x4029abd8, 0xed80143a), TOBN(0x28bfe3c7, 0xbcb00b88),
+     TOBN(0x3bec1009, 0x0643d84a), TOBN(0x885f3668, 0xabd11041),
+     TOBN(0xdb02432c, 0xf83a34d6), TOBN(0x32f7b360, 0x719ceebe),
+     TOBN(0xf06c7837, 0xdad1fe7a), TOBN(0x60a157a9, 0x5441a0b0),
+     TOBN(0x704970e9, 0xe2d47550), TOBN(0xcd2bd553, 0x271b9020),
+     TOBN(0xff57f82f, 0x33e24a0b), TOBN(0x9cbee23f, 0xf2565079),
+     TOBN(0x16353427, 0xeb5f5825), TOBN(0x276feec4, 0xe948d662),
+     TOBN(0xd1b62bc6, 0xda10032b), TOBN(0x718351dd, 0xf0e72a53),
+     TOBN(0x93452076, 0x2420e7ba), TOBN(0x96368fff, 0x3a00118d),
+     TOBN(0x00ce2d26, 0x150a49e4), TOBN(0x0c28b636, 0x3f04706b),
+     TOBN(0xbad65a46, 0x58b196d0), TOBN(0x6c8455fc, 0xec9f8b7c),
+     TOBN(0xe90c895f, 0x2d71867e), TOBN(0x5c0be31b, 0xedf9f38c),
+     TOBN(0x2a37a15e, 0xd8f6ec04), TOBN(0x239639e7, 0x8cd85251),
+     TOBN(0xd8975315, 0x9c7c4c6b), TOBN(0x603aa3c0, 0xd7409af7),
+     TOBN(0xb8d53d0c, 0x007132fb), TOBN(0x68d12af7, 0xa6849238),
+     TOBN(0xbe0607e7, 0xbf5d9279), TOBN(0x9aa50055, 0xaada74ce),
+     TOBN(0xe81079cb, 0xba7e8ccb), TOBN(0x610c71d1, 0xa5f4ff5e),
+     TOBN(0x9e2ee1a7, 0x5aa07093), TOBN(0xca84004b, 0xa75da47c),
+     TOBN(0x074d3951, 0x3de75401), TOBN(0xf938f756, 0xbb311592),
+     TOBN(0x96197618, 0x00a43421), TOBN(0x39a25362, 0x07bc78c8),
+     TOBN(0x278f710a, 0x0a171276), TOBN(0xb28446ea, 0x8d1a8f08),
+     TOBN(0x184781bf, 0xe3b6a661), TOBN(0x7751cb1d, 0xe6d279f7),
+     TOBN(0xf8ff95d6, 0xc59eb662), TOBN(0x186d90b7, 0x58d3dea7),
+     TOBN(0x0e4bb6c1, 0xdfb4f754), TOBN(0x5c5cf56b, 0x2b2801dc),
+     TOBN(0xc561e452, 0x1f54564d), TOBN(0xb4fb8c60, 0xf0dd7f13),
+     TOBN(0xf8849630, 0x33ff98c7), TOBN(0x9619fffa, 0xcf17769c),
+     TOBN(0xf8090bf6, 0x1bfdd80a), TOBN(0x14d9a149, 0x422cfe63),
+     TOBN(0xb354c360, 0x6f6df9ea), TOBN(0xdbcf770d, 0x218f17ea),
+     TOBN(0x207db7c8, 0x79eb3480), TOBN(0x213dbda8, 0x559b6a26),
+     TOBN(0xac4c200b, 0x29fc81b3), TOBN(0xebc3e09f, 0x171d87c1),
+     TOBN(0x91799530, 0x1481aa9e), TOBN(0x051b92e1, 0x92e114fa),
+     TOBN(0xdf8f92e9, 0xecb5537f), TOBN(0x44b1b2cc, 0x290c7483),
+     TOBN(0xa711455a, 0x2adeb016), TOBN(0x964b6856, 0x81a10c2c),
+     TOBN(0x4f159d99, 0xcec03623), TOBN(0x05532225, 0xef3271ea),
+     TOBN(0xb231bea3, 0xc5ee4849), TOBN(0x57a54f50, 0x7094f103),
+     TOBN(0x3e2d421d, 0x9598b352), TOBN(0xe865a49c, 0x67412ab4),
+     TOBN(0xd2998a25, 0x1cc3a912), TOBN(0x5d092808, 0x0c74d65d),
+     TOBN(0x73f45908, 0x4088567a), TOBN(0xeb6b280e, 0x1f214a61),
+     TOBN(0x8c9adc34, 0xcaf0c13d), TOBN(0x39d12938, 0xf561fb80),
+     TOBN(0xb2dc3a5e, 0xbc6edfb4), TOBN(0x7485b1b1, 0xfe4d210e),
+     TOBN(0x062e0400, 0xe186ae72), TOBN(0x91e32d5c, 0x6eeb3b88),
+     TOBN(0x6df574d7, 0x4be59224), TOBN(0xebc88ccc, 0x716d55f3),
+     TOBN(0x26c2e6d0, 0xcad6ed33), TOBN(0xc6e21e7d, 0x0d3e8b10),
+     TOBN(0x2cc5840e, 0x5bcc36bb), TOBN(0x9292445e, 0x7da74f69),
+     TOBN(0x8be8d321, 0x4e5193a8), TOBN(0x3ec23629, 0x8df06413),
+     TOBN(0xc7e9ae85, 0xb134defa), TOBN(0x6073b1d0, 0x1bb2d475),
+     TOBN(0xb9ad615e, 0x2863c00d), TOBN(0x9e29493d, 0x525f4ac4),
+     TOBN(0xc32b1dea, 0x4e9acf4f), TOBN(0x3e1f01c8, 0xa50db88d),
+     TOBN(0xb05d70ea, 0x04da916c), TOBN(0x714b0d0a, 0xd865803e),
+     TOBN(0x4bd493fc, 0x9920cb5e), TOBN(0x5b44b1f7, 0x92c7a3ac),
+     TOBN(0xa2a77293, 0xbcec9235), TOBN(0x5ee06e87, 0xcd378553),
+     TOBN(0xceff8173, 0xda621607), TOBN(0x2bb03e4c, 0x99f5d290),
+     TOBN(0x2945106a, 0xa6f734ac), TOBN(0xb5056604, 0xd25c4732),
+     TOBN(0x5945920c, 0xe079afee), TOBN(0x686e17a0, 0x6789831f),
+     TOBN(0x5966bee8, 0xb74a5ae5), TOBN(0x38a673a2, 0x1e258d46),
+     TOBN(0xbd1cc1f2, 0x83141c95), TOBN(0x3b2ecf4f, 0x0e96e486),
+     TOBN(0xcd3aa896, 0x74e5fc78), TOBN(0x415ec10c, 0x2482fa7a),
+     TOBN(0x15234419, 0x80503380), TOBN(0x513d917a, 0xd314b392),
+     TOBN(0xb0b52f4e, 0x63caecae), TOBN(0x07bf22ad, 0x2dc7780b),
+     TOBN(0xe761e8a1, 0xe4306839), TOBN(0x1b3be962, 0x5dd7feaa),
+     TOBN(0x4fe728de, 0x74c778f1), TOBN(0xf1fa0bda, 0x5e0070f6),
+     TOBN(0x85205a31, 0x6ec3f510), TOBN(0x2c7e4a14, 0xd2980475),
+     TOBN(0xde3c19c0, 0x6f30ebfd), TOBN(0xdb1c1f38, 0xd4b7e644),
+     TOBN(0xfe291a75, 0x5dce364a), TOBN(0xb7b22a3c, 0x058f5be3),
+     TOBN(0x2cd2c302, 0x37fea38c), TOBN(0x2930967a, 0x2e17be17),
+     TOBN(0x87f009de, 0x0c061c65), TOBN(0xcb014aac, 0xedc6ed44),
+     TOBN(0x49bd1cb4, 0x3bafb1eb), TOBN(0x81bd8b5c, 0x282d3688),
+     TOBN(0x1cdab87e, 0xf01a17af), TOBN(0x21f37ac4, 0xe710063b),
+     TOBN(0x5a6c5676, 0x42fc8193), TOBN(0xf4753e70, 0x56a6015c),
+     TOBN(0x020f795e, 0xa15b0a44), TOBN(0x8f37c8d7, 0x8958a958),
+     TOBN(0x63b7e89b, 0xa4b675b5), TOBN(0xb4fb0c0c, 0x0fc31aea),
+     TOBN(0xed95e639, 0xa7ff1f2e), TOBN(0x9880f5a3, 0x619614fb),
+     TOBN(0xdeb6ff02, 0x947151ab), TOBN(0x5bc5118c, 0xa868dcdb),
+     TOBN(0xd8da2055, 0x4c20cea5), TOBN(0xcac2776e, 0x14c4d69a),
+     TOBN(0xcccb22c1, 0x622d599b), TOBN(0xa4ddb653, 0x68a9bb50),
+     TOBN(0x2c4ff151, 0x1b4941b4), TOBN(0xe1ff19b4, 0x6efba588),
+     TOBN(0x35034363, 0xc48345e0), TOBN(0x45542e3d, 0x1e29dfc4),
+     TOBN(0xf197cb91, 0x349f7aed), TOBN(0x3b2b5a00, 0x8fca8420),
+     TOBN(0x7c175ee8, 0x23aaf6d8), TOBN(0x54dcf421, 0x35af32b6),
+     TOBN(0x0ba14307, 0x27d6561e), TOBN(0x879d5ee4, 0xd175b1e2),
+     TOBN(0xc7c43673, 0x99807db5), TOBN(0x77a54455, 0x9cd55bcd),
+     TOBN(0xe6c2ff13, 0x0105c072), TOBN(0x18f7a99f, 0x8dda7da4),
+     TOBN(0x4c301820, 0x0e2d35c1), TOBN(0x06a53ca0, 0xd9cc6c82),
+     TOBN(0xaa21cc1e, 0xf1aa1d9e), TOBN(0x32414334, 0x4a75b1e8),
+     TOBN(0x2a6d1328, 0x0ebe9fdc), TOBN(0x16bd173f, 0x98a4755a),
+     TOBN(0xfbb9b245, 0x2133ffd9), TOBN(0x39a8b2f1, 0x830f1a20),
+     TOBN(0x484bc97d, 0xd5a1f52a), TOBN(0xd6aebf56, 0xa40eddf8),
+     TOBN(0x32257acb, 0x76ccdac6), TOBN(0xaf4d36ec, 0x1586ff27),
+     TOBN(0x8eaa8863, 0xf8de7dd1), TOBN(0x0045d5cf, 0x88647c16)}
+    ,
+    {TOBN(0xa6f3d574, 0xc005979d), TOBN(0xc2072b42, 0x6a40e350),
+     TOBN(0xfca5c156, 0x8de2ecf9), TOBN(0xa8c8bf5b, 0xa515344e),
+     TOBN(0x97aee555, 0x114df14a), TOBN(0xd4374a4d, 0xfdc5ec6b),
+     TOBN(0x754cc28f, 0x2ca85418), TOBN(0x71cb9e27, 0xd3c41f78),
+     TOBN(0x89105079, 0x03605c39), TOBN(0xf0843d9e, 0xa142c96c),
+     TOBN(0xf3744934, 0x16923684), TOBN(0x732caa2f, 0xfa0a2893),
+     TOBN(0xb2e8c270, 0x61160170), TOBN(0xc32788cc, 0x437fbaa3),
+     TOBN(0x39cd818e, 0xa6eda3ac), TOBN(0xe2e94239, 0x9e2b2e07),
+     TOBN(0x6967d39b, 0x0260e52a), TOBN(0xd42585cc, 0x90653325),
+     TOBN(0x0d9bd605, 0x21ca7954), TOBN(0x4fa20877, 0x81ed57b3),
+     TOBN(0x60c1eff8, 0xe34a0bbe), TOBN(0x56b0040c, 0x84f6ef64),
+     TOBN(0x28be2b24, 0xb1af8483), TOBN(0xb2278163, 0xf5531614),
+     TOBN(0x8df27545, 0x5922ac1c), TOBN(0xa7b3ef5c, 0xa52b3f63),
+     TOBN(0x8e77b214, 0x71de57c4), TOBN(0x31682c10, 0x834c008b),
+     TOBN(0xc76824f0, 0x4bd55d31), TOBN(0xb6d1c086, 0x17b61c71),
+     TOBN(0x31db0903, 0xc2a5089d), TOBN(0x9c092172, 0x184e5d3f),
+     TOBN(0xdd7ced5b, 0xc00cc638), TOBN(0x1a2015eb, 0x61278fc2),
+     TOBN(0x2e8e5288, 0x6a37f8d6), TOBN(0xc457786f, 0xe79933ad),
+     TOBN(0xb3fe4cce, 0x2c51211a), TOBN(0xad9b10b2, 0x24c20498),
+     TOBN(0x90d87a4f, 0xd28db5e5), TOBN(0x698cd105, 0x3aca2fc3),
+     TOBN(0x4f112d07, 0xe91b536d), TOBN(0xceb982f2, 0x9eba09d6),
+     TOBN(0x3c157b2c, 0x197c396f), TOBN(0xe23c2d41, 0x7b66eb24),
+     TOBN(0x480c57d9, 0x3f330d37), TOBN(0xb3a4c8a1, 0x79108deb),
+     TOBN(0x702388de, 0xcb199ce5), TOBN(0x0b019211, 0xb944a8d4),
+     TOBN(0x24f2a692, 0x840bb336), TOBN(0x7c353bdc, 0xa669fa7b),
+     TOBN(0xda20d6fc, 0xdec9c300), TOBN(0x625fbe2f, 0xa13a4f17),
+     TOBN(0xa2b1b61a, 0xdbc17328), TOBN(0x008965bf, 0xa9515621),
+     TOBN(0x49690939, 0xc620ff46), TOBN(0x182dd27d, 0x8717e91c),
+     TOBN(0x5ace5035, 0xea6c3997), TOBN(0x54259aaa, 0xc2610bef),
+     TOBN(0xef18bb3f, 0x3c80dd39), TOBN(0x6910b95b, 0x5fc3fa39),
+     TOBN(0xfce2f510, 0x43e09aee), TOBN(0xced56c9f, 0xa7675665),
+     TOBN(0x10e265ac, 0xd872db61), TOBN(0x6982812e, 0xae9fce69),
+     TOBN(0x29be11c6, 0xce800998), TOBN(0x72bb1752, 0xb90360d9),
+     TOBN(0x2c193197, 0x5a4ad590), TOBN(0x2ba2f548, 0x9fc1dbc0),
+     TOBN(0x7fe4eebb, 0xe490ebe0), TOBN(0x12a0a4cd, 0x7fae11c0),
+     TOBN(0x7197cf81, 0xe903ba37), TOBN(0xcf7d4aa8, 0xde1c6dd8),
+     TOBN(0x92af6bf4, 0x3fd5684c), TOBN(0x2b26eecf, 0x80360aa1),
+     TOBN(0xbd960f30, 0x00546a82), TOBN(0x407b3c43, 0xf59ad8fe),
+     TOBN(0x86cae5fe, 0x249c82ba), TOBN(0x9e0faec7, 0x2463744c),
+     TOBN(0x87f551e8, 0x94916272), TOBN(0x033f9344, 0x6ceb0615),
+     TOBN(0x1e5eb0d1, 0x8be82e84), TOBN(0x89967f0e, 0x7a582fef),
+     TOBN(0xbcf687d5, 0xa6e921fa), TOBN(0xdfee4cf3, 0xd37a09ba),
+     TOBN(0x94f06965, 0xb493c465), TOBN(0x638b9a1c, 0x7635c030),
+     TOBN(0x76667864, 0x66f05e9f), TOBN(0xccaf6808, 0xc04da725),
+     TOBN(0xca2eb690, 0x768fccfc), TOBN(0xf402d37d, 0xb835b362),
+     TOBN(0x0efac0d0, 0xe2fdfcce), TOBN(0xefc9cdef, 0xb638d990),
+     TOBN(0x2af12b72, 0xd1669a8b), TOBN(0x33c536bc, 0x5774ccbd),
+     TOBN(0x30b21909, 0xfb34870e), TOBN(0xc38fa2f7, 0x7df25aca),
+     TOBN(0x74c5f02b, 0xbf81f3f5), TOBN(0x0525a5ae, 0xaf7e4581),
+     TOBN(0x88d2aaba, 0x433c54ae), TOBN(0xed9775db, 0x806a56c5),
+     TOBN(0xd320738a, 0xc0edb37d), TOBN(0x25fdb6ee, 0x66cc1f51),
+     TOBN(0xac661d17, 0x10600d76), TOBN(0x931ec1f3, 0xbdd1ed76),
+     TOBN(0x65c11d62, 0x19ee43f1), TOBN(0x5cd57c3e, 0x60829d97),
+     TOBN(0xd26c91a3, 0x984be6e8), TOBN(0xf08d9309, 0x8b0c53bd),
+     TOBN(0x94bc9e5b, 0xc016e4ea), TOBN(0xd3916839, 0x11d43d2b),
+     TOBN(0x886c5ad7, 0x73701155), TOBN(0xe0377626, 0x20b00715),
+     TOBN(0x7f01c9ec, 0xaa80ba59), TOBN(0x3083411a, 0x68538e51),
+     TOBN(0x970370f1, 0xe88128af), TOBN(0x625cc3db, 0x91dec14b),
+     TOBN(0xfef9666c, 0x01ac3107), TOBN(0xb2a8d577, 0xd5057ac3),
+     TOBN(0xb0f26299, 0x92be5df7), TOBN(0xf579c8e5, 0x00353924),
+     TOBN(0xb8fa3d93, 0x1341ed7a), TOBN(0x4223272c, 0xa7b59d49),
+     TOBN(0x3dcb1947, 0x83b8c4a4), TOBN(0x4e413c01, 0xed1302e4),
+     TOBN(0x6d999127, 0xe17e44ce), TOBN(0xee86bf75, 0x33b3adfb),
+     TOBN(0xf6902fe6, 0x25aa96ca), TOBN(0xb73540e4, 0xe5aae47d),
+     TOBN(0x32801d7b, 0x1b4a158c), TOBN(0xe571c99e, 0x27e2a369),
+     TOBN(0x40cb76c0, 0x10d9f197), TOBN(0xc308c289, 0x3167c0ae),
+     TOBN(0xa6ef9dd3, 0xeb7958f2), TOBN(0xa7226dfc, 0x300879b1),
+     TOBN(0x6cd0b362, 0x7edf0636), TOBN(0x4efbce6c, 0x7bc37eed),
+     TOBN(0x75f92a05, 0x8d699021), TOBN(0x586d4c79, 0x772566e3),
+     TOBN(0x378ca5f1, 0x761ad23a), TOBN(0x650d86fc, 0x1465a8ac),
+     TOBN(0x7a4ed457, 0x842ba251), TOBN(0x6b65e3e6, 0x42234933),
+     TOBN(0xaf1543b7, 0x31aad657), TOBN(0xa4cefe98, 0xcbfec369),
+     TOBN(0xb587da90, 0x9f47befb), TOBN(0x6562e9fb, 0x41312d13),
+     TOBN(0xa691ea59, 0xeff1cefe), TOBN(0xcc30477a, 0x05fc4cf6),
+     TOBN(0xa1632461, 0x0b0ffd3d), TOBN(0xa1f16f3b, 0x5b355956),
+     TOBN(0x5b148d53, 0x4224ec24), TOBN(0xdc834e7b, 0xf977012a),
+     TOBN(0x7bfc5e75, 0xb2c69dbc), TOBN(0x3aa77a29, 0x03c3da6c),
+     TOBN(0xde0df03c, 0xca910271), TOBN(0xcbd5ca4a, 0x7806dc55),
+     TOBN(0xe1ca5807, 0x6db476cb), TOBN(0xfde15d62, 0x5f37a31e),
+     TOBN(0xf49af520, 0xf41af416), TOBN(0x96c5c5b1, 0x7d342db5),
+     TOBN(0x155c43b7, 0xeb4ceb9b), TOBN(0x2e993010, 0x4e77371a),
+     TOBN(0x1d2987da, 0x675d43af), TOBN(0xef2bc1c0, 0x8599fd72),
+     TOBN(0x96894b7b, 0x9342f6b2), TOBN(0x201eadf2, 0x7c8e71f0),
+     TOBN(0xf3479d9f, 0x4a1f3efc), TOBN(0xe0f8a742, 0x702a9704),
+     TOBN(0xeafd44b6, 0xb3eba40c), TOBN(0xf9739f29, 0xc1c1e0d0),
+     TOBN(0x0091471a, 0x619d505e), TOBN(0xc15f9c96, 0x9d7c263e),
+     TOBN(0x5be47285, 0x83afbe33), TOBN(0xa3b6d6af, 0x04f1e092),
+     TOBN(0xe76526b9, 0x751a9d11), TOBN(0x2ec5b26d, 0x9a4ae4d2),
+     TOBN(0xeb66f4d9, 0x02f6fb8d), TOBN(0x4063c561, 0x96912164),
+     TOBN(0xeb7050c1, 0x80ef3000), TOBN(0x288d1c33, 0xeaa5b3f0),
+     TOBN(0xe87c68d6, 0x07806fd8), TOBN(0xb2f7f9d5, 0x4bbbf50f),
+     TOBN(0x25972f3a, 0xac8d6627), TOBN(0xf8547774, 0x10e8c13b),
+     TOBN(0xcc50ef6c, 0x872b4a60), TOBN(0xab2a34a4, 0x4613521b),
+     TOBN(0x39c5c190, 0x983e15d1), TOBN(0x61dde5df, 0x59905512),
+     TOBN(0xe417f621, 0x9f2275f3), TOBN(0x0750c8b6, 0x451d894b),
+     TOBN(0x75b04ab9, 0x78b0bdaa), TOBN(0x3bfd9fd4, 0x458589bd),
+     TOBN(0xf1013e30, 0xee9120b6), TOBN(0x2b51af93, 0x23a4743e),
+     TOBN(0xea96ffae, 0x48d14d9e), TOBN(0x71dc0dbe, 0x698a1d32),
+     TOBN(0x914962d2, 0x0180cca4), TOBN(0x1ae60677, 0xc3568963),
+     TOBN(0x8cf227b1, 0x437bc444), TOBN(0xc650c83b, 0xc9962c7a),
+     TOBN(0x23c2c7dd, 0xfe7ccfc4), TOBN(0xf925c89d, 0x1b929d48),
+     TOBN(0x4460f74b, 0x06783c33), TOBN(0xac2c8d49, 0xa590475a),
+     TOBN(0xfb40b407, 0xb807bba0), TOBN(0x9d1e362d, 0x69ff8f3a),
+     TOBN(0xa33e9681, 0xcbef64a4), TOBN(0x67ece5fa, 0x332fb4b2),
+     TOBN(0x6900a99b, 0x739f10e3), TOBN(0xc3341ca9, 0xff525925),
+     TOBN(0xee18a626, 0xa9e2d041), TOBN(0xa5a83685, 0x29580ddd),
+     TOBN(0xf3470c81, 0x9d7de3cd), TOBN(0xedf02586, 0x2062cf9c),
+     TOBN(0xf43522fa, 0xc010edb0), TOBN(0x30314135, 0x13a4b1ae),
+     TOBN(0xc792e02a, 0xdb22b94b), TOBN(0x993d8ae9, 0xa1eaa45b),
+     TOBN(0x8aad6cd3, 0xcd1e1c63), TOBN(0x89529ca7, 0xc5ce688a),
+     TOBN(0x2ccee3aa, 0xe572a253), TOBN(0xe02b6438, 0x02a21efb),
+     TOBN(0xa7091b6e, 0xc9430358), TOBN(0x06d1b1fa, 0x9d7db504),
+     TOBN(0x58846d32, 0xc4744733), TOBN(0x40517c71, 0x379f9e34),
+     TOBN(0x2f65655f, 0x130ef6ca), TOBN(0x526e4488, 0xf1f3503f),
+     TOBN(0x8467bd17, 0x7ee4a976), TOBN(0x1d9dc913, 0x921363d1),
+     TOBN(0xd8d24c33, 0xb069e041), TOBN(0x5eb5da0a, 0x2cdf7f51),
+     TOBN(0x1c0f3cb1, 0x197b994f), TOBN(0x3c95a6c5, 0x2843eae9),
+     TOBN(0x7766ffc9, 0xa6097ea5), TOBN(0x7bea4093, 0xd723b867),
+     TOBN(0xb48e1f73, 0x4db378f9), TOBN(0x70025b00, 0xe37b77ac),
+     TOBN(0x943dc8e7, 0xaf24ad46), TOBN(0xb98a15ac, 0x16d00a85),
+     TOBN(0x3adc38ba, 0x2743b004), TOBN(0xb1c7f4f7, 0x334415ee),
+     TOBN(0xea43df8f, 0x1e62d05a), TOBN(0x32618905, 0x9d76a3b6),
+     TOBN(0x2fbd0bb5, 0xa23a0f46), TOBN(0x5bc971db, 0x6a01918c),
+     TOBN(0x7801d94a, 0xb4743f94), TOBN(0xb94df65e, 0x676ae22b),
+     TOBN(0xaafcbfab, 0xaf95894c), TOBN(0x7b9bdc07, 0x276b2241),
+     TOBN(0xeaf98362, 0x5bdda48b), TOBN(0x5977faf2, 0xa3fcb4df),
+     TOBN(0xbed042ef, 0x052c4b5b), TOBN(0x9fe87f71, 0x067591f0),
+     TOBN(0xc89c73ca, 0x22f24ec7), TOBN(0x7d37fa9e, 0xe64a9f1b),
+     TOBN(0x2710841a, 0x15562627), TOBN(0x2c01a613, 0xc243b034),
+     TOBN(0x1d135c56, 0x2bc68609), TOBN(0xc2ca1715, 0x8b03f1f6),
+     TOBN(0xc9966c2d, 0x3eb81d82), TOBN(0xc02abf4a, 0x8f6df13e),
+     TOBN(0x77b34bd7, 0x8f72b43b), TOBN(0xaff6218f, 0x360c82b0),
+     TOBN(0x0aa5726c, 0x8d55b9d2), TOBN(0xdc0adbe9, 0x99e9bffb),
+     TOBN(0x9097549c, 0xefb9e72a), TOBN(0x16755712, 0x9dfb3111),
+     TOBN(0xdd8bf984, 0xf26847f9), TOBN(0xbcb8e387, 0xdfb30cb7),
+     TOBN(0xc1fd32a7, 0x5171ef9c), TOBN(0x977f3fc7, 0x389b363f),
+     TOBN(0x116eaf2b, 0xf4babda0), TOBN(0xfeab68bd, 0xf7113c8e),
+     TOBN(0xd1e3f064, 0xb7def526), TOBN(0x1ac30885, 0xe0b3fa02),
+     TOBN(0x1c5a6e7b, 0x40142d9d), TOBN(0x839b5603, 0x30921c0b),
+     TOBN(0x48f301fa, 0x36a116a3), TOBN(0x380e1107, 0xcfd9ee6d),
+     TOBN(0x7945ead8, 0x58854be1), TOBN(0x4111c12e, 0xcbd4d49d),
+     TOBN(0xece3b1ec, 0x3a29c2ef), TOBN(0x6356d404, 0x8d3616f5),
+     TOBN(0x9f0d6a8f, 0x594d320e), TOBN(0x0989316d, 0xf651ccd2),
+     TOBN(0x6c32117a, 0x0f8fdde4), TOBN(0x9abe5cc5, 0xa26a9bbc),
+     TOBN(0xcff560fb, 0x9723f671), TOBN(0x21b2a12d, 0x7f3d593c),
+     TOBN(0xe4cb18da, 0x24ba0696), TOBN(0x186e2220, 0xc3543384),
+     TOBN(0x722f64e0, 0x88312c29), TOBN(0x94282a99, 0x17dc7752),
+     TOBN(0x62467bbf, 0x5a85ee89), TOBN(0xf435c650, 0xf10076a0),
+     TOBN(0xc9ff1539, 0x43b3a50b), TOBN(0x7132130c, 0x1a53efbc),
+     TOBN(0x31bfe063, 0xf7b0c5b7), TOBN(0xb0179a7d, 0x4ea994cc),
+     TOBN(0x12d064b3, 0xc85f455b), TOBN(0x47259328, 0x8f6e0062),
+     TOBN(0xf64e590b, 0xb875d6d9), TOBN(0x22dd6225, 0xad92bcc7),
+     TOBN(0xb658038e, 0xb9c3bd6d), TOBN(0x00cdb0d6, 0xfbba27c8),
+     TOBN(0x0c681337, 0x1062c45d), TOBN(0xd8515b8c, 0x2d33407d),
+     TOBN(0xcb8f699e, 0x8cbb5ecf), TOBN(0x8c4347f8, 0xc608d7d8),
+     TOBN(0x2c11850a, 0xbb3e00db), TOBN(0x20a8dafd, 0xecb49d19),
+     TOBN(0xbd781480, 0x45ee2f40), TOBN(0x75e354af, 0x416b60cf),
+     TOBN(0xde0b58a1, 0x8d49a8c4), TOBN(0xe40e94e2, 0xfa359536),
+     TOBN(0xbd4fa59f, 0x62accd76), TOBN(0x05cf466a, 0x8c762837),
+     TOBN(0xb5abda99, 0x448c277b), TOBN(0x5a9e01bf, 0x48b13740),
+     TOBN(0x9d457798, 0x326aad8d), TOBN(0xbdef4954, 0xc396f7e7),
+     TOBN(0x6fb274a2, 0xc253e292), TOBN(0x2800bf0a, 0x1cfe53e7),
+     TOBN(0x22426d31, 0x44438fd4), TOBN(0xef233923, 0x5e259f9a),
+     TOBN(0x4188503c, 0x03f66264), TOBN(0x9e5e7f13, 0x7f9fdfab),
+     TOBN(0x565eb76c, 0x5fcc1aba), TOBN(0xea632548, 0x59b5bff8),
+     TOBN(0x5587c087, 0xaab6d3fa), TOBN(0x92b639ea, 0x6ce39c1b),
+     TOBN(0x0706e782, 0x953b135c), TOBN(0x7308912e, 0x425268ef),
+     TOBN(0x599e92c7, 0x090e7469), TOBN(0x83b90f52, 0x9bc35e75),
+     TOBN(0x4750b3d0, 0x244975b3), TOBN(0xf3a44358, 0x11965d72),
+     TOBN(0x179c6774, 0x9c8dc751), TOBN(0xff18cdfe, 0xd23d9ff0),
+     TOBN(0xc4013833, 0x2028e247), TOBN(0x96e280e2, 0xf3bfbc79),
+     TOBN(0xf60417bd, 0xd0880a84), TOBN(0x263c9f3d, 0x2a568151),
+     TOBN(0x36be15b3, 0x2d2ce811), TOBN(0x846dc0c2, 0xf8291d21),
+     TOBN(0x5cfa0ecb, 0x789fcfdb), TOBN(0x45a0beed, 0xd7535b9a),
+     TOBN(0xec8e9f07, 0x96d69af1), TOBN(0x31a7c5b8, 0x599ab6dc),
+     TOBN(0xd36d45ef, 0xf9e2e09f), TOBN(0x3cf49ef1, 0xdcee954b),
+     TOBN(0x6be34cf3, 0x086cff9b), TOBN(0x88dbd491, 0x39a3360f),
+     TOBN(0x1e96b8cc, 0x0dbfbd1d), TOBN(0xc1e5f7bf, 0xcb7e2552),
+     TOBN(0x0547b214, 0x28819d98), TOBN(0xc770dd9c, 0x7aea9dcb),
+     TOBN(0xaef0d4c7, 0x041d68c8), TOBN(0xcc2b9818, 0x13cb9ba8),
+     TOBN(0x7fc7bc76, 0xfe86c607), TOBN(0x6b7b9337, 0x502a9a95),
+     TOBN(0x1948dc27, 0xd14dab63), TOBN(0x249dd198, 0xdae047be),
+     TOBN(0xe8356584, 0xa981a202), TOBN(0x3531dd18, 0x3a893387),
+     TOBN(0x1be11f90, 0xc85c7209), TOBN(0x93d2fe1e, 0xe2a52b5a),
+     TOBN(0x8225bfe2, 0xec6d6b97), TOBN(0x9cf6d6f4, 0xbd0aa5de),
+     TOBN(0x911459cb, 0x54779f5f), TOBN(0x5649cddb, 0x86aeb1f3),
+     TOBN(0x32133579, 0x3f26ce5a), TOBN(0xc289a102, 0x550f431e),
+     TOBN(0x559dcfda, 0x73b84c6f), TOBN(0x84973819, 0xee3ac4d7),
+     TOBN(0xb51e55e6, 0xf2606a82), TOBN(0xe25f7061, 0x90f2fb57),
+     TOBN(0xacef6c2a, 0xb1a4e37c), TOBN(0x864e359d, 0x5dcf2706),
+     TOBN(0x479e6b18, 0x7ce57316), TOBN(0x2cab2500, 0x3a96b23d),
+     TOBN(0xed489862, 0x8ef16df7), TOBN(0x2056538c, 0xef3758b5),
+     TOBN(0xa7df865e, 0xf15d3101), TOBN(0x80c5533a, 0x61b553d7),
+     TOBN(0x366e1997, 0x4ed14294), TOBN(0x6620741f, 0xb3c0bcd6),
+     TOBN(0x21d1d9c4, 0xedc45418), TOBN(0x005b859e, 0xc1cc4a9d),
+     TOBN(0xdf01f630, 0xa1c462f0), TOBN(0x15d06cf3, 0xf26820c7),
+     TOBN(0x9f7f24ee, 0x3484be47), TOBN(0x2ff33e96, 0x4a0c902f),
+     TOBN(0x00bdf457, 0x5a0bc453), TOBN(0x2378dfaf, 0x1aa238db),
+     TOBN(0x272420ec, 0x856720f2), TOBN(0x2ad9d95b, 0x96797291),
+     TOBN(0xd1242cc6, 0x768a1558), TOBN(0x2e287f8b, 0x5cc86aa8),
+     TOBN(0x796873d0, 0x990cecaa), TOBN(0xade55f81, 0x675d4080),
+     TOBN(0x2645eea3, 0x21f0cd84), TOBN(0x7a1efa0f, 0xb4e17d02),
+     TOBN(0xf6858420, 0x037cc061), TOBN(0x682e05f0, 0xd5d43e12),
+     TOBN(0x59c36994, 0x27218710), TOBN(0x85cbba4d, 0x3f7cd2fc),
+     TOBN(0x726f9729, 0x7a3cd22a), TOBN(0x9f8cd5dc, 0x4a628397),
+     TOBN(0x17b93ab9, 0xc23165ed), TOBN(0xff5f5dbf, 0x122823d4),
+     TOBN(0xc1e4e4b5, 0x654a446d), TOBN(0xd1a9496f, 0x677257ba),
+     TOBN(0x6387ba94, 0xde766a56), TOBN(0x23608bc8, 0x521ec74a),
+     TOBN(0x16a522d7, 0x6688c4d4), TOBN(0x9d6b4282, 0x07373abd),
+     TOBN(0xa62f07ac, 0xb42efaa3), TOBN(0xf73e00f7, 0xe3b90180),
+     TOBN(0x36175fec, 0x49421c3e), TOBN(0xc4e44f9b, 0x3dcf2678),
+     TOBN(0x76df436b, 0x7220f09f), TOBN(0x172755fb, 0x3aa8b6cf),
+     TOBN(0xbab89d57, 0x446139cc), TOBN(0x0a0a6e02, 0x5fe0208f),
+     TOBN(0xcdbb63e2, 0x11e5d399), TOBN(0x33ecaa12, 0xa8977f0b),
+     TOBN(0x59598b21, 0xf7c42664), TOBN(0xb3e91b32, 0xab65d08a),
+     TOBN(0x035822ee, 0xf4502526), TOBN(0x1dcf0176, 0x720a82a9),
+     TOBN(0x50f8598f, 0x3d589e02), TOBN(0xdf0478ff, 0xb1d63d2c),
+     TOBN(0x8b8068bd, 0x1571cd07), TOBN(0x30c3aa4f, 0xd79670cd),
+     TOBN(0x25e8fd4b, 0x941ade7f), TOBN(0x3d1debdc, 0x32790011),
+     TOBN(0x65b6dcbd, 0x3a3f9ff0), TOBN(0x282736a4, 0x793de69c),
+     TOBN(0xef69a0c3, 0xd41d3bd3), TOBN(0xb533b8c9, 0x07a26bde),
+     TOBN(0xe2801d97, 0xdb2edf9f), TOBN(0xdc4a8269, 0xe1877af0),
+     TOBN(0x6c1c5851, 0x3d590dbe), TOBN(0x84632f6b, 0xee4e9357),
+     TOBN(0xd36d36b7, 0x79b33374), TOBN(0xb46833e3, 0x9bbca2e6),
+     TOBN(0x37893913, 0xf7fc0586), TOBN(0x385315f7, 0x66bf4719),
+     TOBN(0x72c56293, 0xb31855dc), TOBN(0xd1416d4e, 0x849061fe),
+     TOBN(0xbeb3ab78, 0x51047213), TOBN(0x447f6e61, 0xf040c996),
+     TOBN(0xd06d310d, 0x638b1d0c), TOBN(0xe28a413f, 0xbad1522e),
+     TOBN(0x685a76cb, 0x82003f86), TOBN(0x610d07f7, 0x0bcdbca3),
+     TOBN(0x6ff66021, 0x9ca4c455), TOBN(0x7df39b87, 0xcea10eec),
+     TOBN(0xb9255f96, 0xe22db218), TOBN(0x8cc6d9eb, 0x08a34c44),
+     TOBN(0xcd4ffb86, 0x859f9276), TOBN(0x8fa15eb2, 0x50d07335),
+     TOBN(0xdf553845, 0xcf2c24b5), TOBN(0x89f66a9f, 0x52f9c3ba),
+     TOBN(0x8f22b5b9, 0xe4a7ceb3), TOBN(0xaffef809, 0x0e134686),
+     TOBN(0x3e53e1c6, 0x8eb8fac2), TOBN(0x93c1e4eb, 0x28aec98e),
+     TOBN(0xb6b91ec5, 0x32a43bcb), TOBN(0x2dbfa947, 0xb2d74a51),
+     TOBN(0xe065d190, 0xca84bad7), TOBN(0xfb13919f, 0xad58e65c),
+     TOBN(0x3c41718b, 0xf1cb6e31), TOBN(0x688969f0, 0x06d05c3f),
+     TOBN(0xd4f94ce7, 0x21264d45), TOBN(0xfdfb65e9, 0x7367532b),
+     TOBN(0x5b1be8b1, 0x0945a39d), TOBN(0x229f789c, 0x2b8baf3b),
+     TOBN(0xd8f41f3e, 0x6f49f15d), TOBN(0x678ce828, 0x907f0792),
+     TOBN(0xc69ace82, 0xfca6e867), TOBN(0x106451ae, 0xd01dcc89),
+     TOBN(0x1bb4f7f0, 0x19fc32d2), TOBN(0x64633dfc, 0xb00c52d2),
+     TOBN(0x8f13549a, 0xad9ea445), TOBN(0x99a3bf50, 0xfb323705),
+     TOBN(0x0c9625a2, 0x534d4dbc), TOBN(0x45b8f1d1, 0xc2a2fea3),
+     TOBN(0x76ec21a1, 0xa530fc1a), TOBN(0x4bac9c2a, 0x9e5bd734),
+     TOBN(0x5996d76a, 0x7b4e3587), TOBN(0x0045cdee, 0x1182d9e3),
+     TOBN(0x1aee24b9, 0x1207f13d), TOBN(0x66452e97, 0x97345a41),
+     TOBN(0x16e5b054, 0x9f950cd0), TOBN(0x9cc72fb1, 0xd7fdd075),
+     TOBN(0x6edd61e7, 0x66249663), TOBN(0xde4caa4d, 0xf043cccb),
+     TOBN(0x11b1f57a, 0x55c7ac17), TOBN(0x779cbd44, 0x1a85e24d),
+     TOBN(0x78030f86, 0xe46081e7), TOBN(0xfd4a6032, 0x8e20f643),
+     TOBN(0xcc7a6488, 0x0a750c0f), TOBN(0x39bacfe3, 0x4e548e83),
+     TOBN(0x3d418c76, 0x0c110f05), TOBN(0x3e4daa4c, 0xb1f11588),
+     TOBN(0x2733e7b5, 0x5ffc69ff), TOBN(0x46f147bc, 0x92053127),
+     TOBN(0x885b2434, 0xd722df94), TOBN(0x6a444f65, 0xe6fc6b7c)}
+    ,
+    {TOBN(0x7a1a465a, 0xc3f16ea8), TOBN(0x115a461d, 0xb2f1d11c),
+     TOBN(0x4767dd95, 0x6c68a172), TOBN(0x3392f2eb, 0xd13a4698),
+     TOBN(0xc7a99ccd, 0xe526cdc7), TOBN(0x8e537fdc, 0x22292b81),
+     TOBN(0x76d8cf69, 0xa6d39198), TOBN(0xffc5ff43, 0x2446852d),
+     TOBN(0x97b14f7e, 0xa90567e6), TOBN(0x513257b7, 0xb6ae5cb7),
+     TOBN(0x85454a3c, 0x9f10903d), TOBN(0xd8d2c9ad, 0x69bc3724),
+     TOBN(0x38da9324, 0x6b29cb44), TOBN(0xb540a21d, 0x77c8cbac),
+     TOBN(0x9bbfe435, 0x01918e42), TOBN(0xfffa707a, 0x56c3614e),
+     TOBN(0x0ce4e3f1, 0xd4e353b7), TOBN(0x062d8a14, 0xef46b0a0),
+     TOBN(0x6408d5ab, 0x574b73fd), TOBN(0xbc41d1c9, 0xd3273ffd),
+     TOBN(0x3538e1e7, 0x6be77800), TOBN(0x71fe8b37, 0xc5655031),
+     TOBN(0x1cd91621, 0x6b9b331a), TOBN(0xad825d0b, 0xbb388f73),
+     TOBN(0x56c2e05b, 0x1cb76219), TOBN(0x0ec0bf91, 0x71567e7e),
+     TOBN(0xe7076f86, 0x61c4c910), TOBN(0xd67b085b, 0xbabc04d9),
+     TOBN(0x9fb90459, 0x5e93a96a), TOBN(0x7526c1ea, 0xfbdc249a),
+     TOBN(0x0d44d367, 0xecdd0bb7), TOBN(0x95399917, 0x9dc0d695),
+     TOBN(0x61360ee9, 0x9e240d18), TOBN(0x057cdcac, 0xb4b94466),
+     TOBN(0xe7667cd1, 0x2fe5325c), TOBN(0x1fa297b5, 0x21974e3b),
+     TOBN(0xfa4081e7, 0xdb083d76), TOBN(0x31993be6, 0xf206bd15),
+     TOBN(0x8949269b, 0x14c19f8c), TOBN(0x21468d72, 0xa9d92357),
+     TOBN(0x2ccbc583, 0xa4c506ec), TOBN(0x957ed188, 0xd1acfe97),
+     TOBN(0x8baed833, 0x12f1aea2), TOBN(0xef2a6cb4, 0x8325362d),
+     TOBN(0x130dde42, 0x8e195c43), TOBN(0xc842025a, 0x0e6050c6),
+     TOBN(0x2da972a7, 0x08686a5d), TOBN(0xb52999a1, 0xe508b4a8),
+     TOBN(0xd9f090b9, 0x10a5a8bd), TOBN(0xca91d249, 0x096864da),
+     TOBN(0x8e6a93be, 0x3f67dbc1), TOBN(0xacae6fba, 0xf5f4764c),
+     TOBN(0x1563c6e0, 0xd21411a0), TOBN(0x28fa787f, 0xda0a4ad8),
+     TOBN(0xd524491c, 0x908c8030), TOBN(0x1257ba0e, 0x4c795f07),
+     TOBN(0x83f49167, 0xceca9754), TOBN(0x426d2cf6, 0x4b7939a0),
+     TOBN(0x2555e355, 0x723fd0bf), TOBN(0xa96e6d06, 0xc4f144e2),
+     TOBN(0x4768a8dd, 0x87880e61), TOBN(0x15543815, 0xe508e4d5),
+     TOBN(0x09d7e772, 0xb1b65e15), TOBN(0x63439dd6, 0xac302fa0),
+     TOBN(0xb93f802f, 0xc14e35c2), TOBN(0x71735b7c, 0x4341333c),
+     TOBN(0x03a25104, 0x16d4f362), TOBN(0x3f4d069b, 0xbf433c8e),
+     TOBN(0x0d83ae01, 0xf78f5a7c), TOBN(0x50a8ffbe, 0x7c4eed07),
+     TOBN(0xc74f8906, 0x76e10f83), TOBN(0x7d080966, 0x9ddaf8e1),
+     TOBN(0xb11df8e1, 0x698e04cc), TOBN(0x877be203, 0x169005c8),
+     TOBN(0x32749e8c, 0x4f3c6179), TOBN(0x2dbc9d0a, 0x7853fc05),
+     TOBN(0x187d4f93, 0x9454d937), TOBN(0xe682ce9d, 0xb4800e1b),
+     TOBN(0xa9129ad8, 0x165e68e8), TOBN(0x0fe29735, 0xbe7f785b),
+     TOBN(0x5303f40c, 0x5b9e02b7), TOBN(0xa37c9692, 0x35ee04e8),
+     TOBN(0x5f46cc20, 0x34d6632b), TOBN(0x55ef72b2, 0x96ac545b),
+     TOBN(0xabec5c1f, 0x7b91b062), TOBN(0x0a79e1c7, 0xbb33e821),
+     TOBN(0xbb04b428, 0x3a9f4117), TOBN(0x0de1f28f, 0xfd2a475a),
+     TOBN(0x31019ccf, 0x3a4434b4), TOBN(0xa3458111, 0x1a7954dc),
+     TOBN(0xa9dac80d, 0xe34972a7), TOBN(0xb043d054, 0x74f6b8dd),
+     TOBN(0x021c319e, 0x11137b1a), TOBN(0x00a754ce, 0xed5cc03f),
+     TOBN(0x0aa2c794, 0xcbea5ad4), TOBN(0x093e67f4, 0x70c015b6),
+     TOBN(0x72cdfee9, 0xc97e3f6b), TOBN(0xc10bcab4, 0xb6da7461),
+     TOBN(0x3b02d2fc, 0xb59806b9), TOBN(0x85185e89, 0xa1de6f47),
+     TOBN(0x39e6931f, 0x0eb6c4d4), TOBN(0x4d4440bd, 0xd4fa5b04),
+     TOBN(0x5418786e, 0x34be7eb8), TOBN(0x6380e521, 0x9d7259bc),
+     TOBN(0x20ac0351, 0xd598d710), TOBN(0x272c4166, 0xcb3a4da4),
+     TOBN(0xdb82fe1a, 0xca71de1f), TOBN(0x746e79f2, 0xd8f54b0f),
+     TOBN(0x6e7fc736, 0x4b573e9b), TOBN(0x75d03f46, 0xfd4b5040),
+     TOBN(0x5c1cc36d, 0x0b98d87b), TOBN(0x513ba3f1, 0x1f472da1),
+     TOBN(0x79d0af26, 0xabb177dd), TOBN(0xf82ab568, 0x7891d564),
+     TOBN(0x2b6768a9, 0x72232173), TOBN(0xefbb3bb0, 0x8c1f6619),
+     TOBN(0xb29c11db, 0xa6d18358), TOBN(0x519e2797, 0xb0916d3a),
+     TOBN(0xd4dc18f0, 0x9188e290), TOBN(0x648e86e3, 0x98b0ca7f),
+     TOBN(0x859d3145, 0x983c38b5), TOBN(0xb14f176c, 0x637abc8b),
+     TOBN(0x2793fb9d, 0xcaff7be6), TOBN(0xebe5a55f, 0x35a66a5a),
+     TOBN(0x7cec1dcd, 0x9f87dc59), TOBN(0x7c595cd3, 0xfbdbf560),
+     TOBN(0x5b543b22, 0x26eb3257), TOBN(0x69080646, 0xc4c935fd),
+     TOBN(0x7f2e4403, 0x81e9ede3), TOBN(0x243c3894, 0xcaf6df0a),
+     TOBN(0x7c605bb1, 0x1c073b11), TOBN(0xcd06a541, 0xba6a4a62),
+     TOBN(0x29168949, 0x49d4e2e5), TOBN(0x33649d07, 0x4af66880),
+     TOBN(0xbfc0c885, 0xe9a85035), TOBN(0xb4e52113, 0xfc410f4b),
+     TOBN(0xdca3b706, 0x78a6513b), TOBN(0x92ea4a2a, 0x9edb1943),
+     TOBN(0x02642216, 0xdb6e2dd8), TOBN(0x9b45d0b4, 0x9fd57894),
+     TOBN(0x114e70db, 0xc69d11ae), TOBN(0x1477dd19, 0x4c57595f),
+     TOBN(0xbc2208b4, 0xec77c272), TOBN(0x95c5b4d7, 0xdb68f59c),
+     TOBN(0xb8c4fc63, 0x42e532b7), TOBN(0x386ba422, 0x9ae35290),
+     TOBN(0xfb5dda42, 0xd201ecbc), TOBN(0x2353dc8b, 0xa0e38fd6),
+     TOBN(0x9a0b85ea, 0x68f7e978), TOBN(0x96ec5682, 0x2ad6d11f),
+     TOBN(0x5e279d6c, 0xe5f6886d), TOBN(0xd3fe03cd, 0x3cb1914d),
+     TOBN(0xfe541fa4, 0x7ea67c77), TOBN(0x952bd2af, 0xe3ea810c),
+     TOBN(0x791fef56, 0x8d01d374), TOBN(0xa3a1c621, 0x0f11336e),
+     TOBN(0x5ad0d5a9, 0xc7ec6d79), TOBN(0xff7038af, 0x3225c342),
+     TOBN(0x003c6689, 0xbc69601b), TOBN(0x25059bc7, 0x45e8747d),
+     TOBN(0xfa4965b2, 0xf2086fbf), TOBN(0xf6840ea6, 0x86916078),
+     TOBN(0xd7ac7620, 0x70081d6c), TOBN(0xe600da31, 0xb5328645),
+     TOBN(0x01916f63, 0x529b8a80), TOBN(0xe80e4858, 0x2d7d6f3e),
+     TOBN(0x29eb0fe8, 0xd664ca7c), TOBN(0xf017637b, 0xe7b43b0c),
+     TOBN(0x9a75c806, 0x76cb2566), TOBN(0x8f76acb1, 0xb24892d9),
+     TOBN(0x7ae7b9cc, 0x1f08fe45), TOBN(0x19ef7329, 0x6a4907d8),
+     TOBN(0x2db4ab71, 0x5f228bf0), TOBN(0xf3cdea39, 0x817032d7),
+     TOBN(0x0b1f482e, 0xdcabe3c0), TOBN(0x3baf76b4, 0xbb86325c),
+     TOBN(0xd49065e0, 0x10089465), TOBN(0x3bab5d29, 0x8e77c596),
+     TOBN(0x7636c3a6, 0x193dbd95), TOBN(0xdef5d294, 0xb246e499),
+     TOBN(0xb22c58b9, 0x286b2475), TOBN(0xa0b93939, 0xcd80862b),
+     TOBN(0x3002c83a, 0xf0992388), TOBN(0x6de01f9b, 0xeacbe14c),
+     TOBN(0x6aac688e, 0xadd70482), TOBN(0x708de92a, 0x7b4a4e8a),
+     TOBN(0x75b6dd73, 0x758a6eef), TOBN(0xea4bf352, 0x725b3c43),
+     TOBN(0x10041f2c, 0x87912868), TOBN(0xb1b1be95, 0xef09297a),
+     TOBN(0x19ae23c5, 0xa9f3860a), TOBN(0xc4f0f839, 0x515dcf4b),
+     TOBN(0x3c7ecca3, 0x97f6306a), TOBN(0x744c44ae, 0x68a3a4b0),
+     TOBN(0x69cd13a0, 0xb3a1d8a2), TOBN(0x7cad0a1e, 0x5256b578),
+     TOBN(0xea653fcd, 0x33791d9e), TOBN(0x9cc2a05d, 0x74b2e05f),
+     TOBN(0x73b391dc, 0xfd7affa2), TOBN(0xddb7091e, 0xb6b05442),
+     TOBN(0xc71e27bf, 0x8538a5c6), TOBN(0x195c63dd, 0x89abff17),
+     TOBN(0xfd315285, 0x1b71e3da), TOBN(0x9cbdfda7, 0xfa680fa0),
+     TOBN(0x9db876ca, 0x849d7eab), TOBN(0xebe2764b, 0x3c273271),
+     TOBN(0x663357e3, 0xf208dcea), TOBN(0x8c5bd833, 0x565b1b70),
+     TOBN(0xccc3b4f5, 0x9837fc0d), TOBN(0x9b641ba8, 0xa79cf00f),
+     TOBN(0x7428243d, 0xdfdf3990), TOBN(0x83a594c4, 0x020786b1),
+     TOBN(0xb712451a, 0x526c4502), TOBN(0x9d39438e, 0x6adb3f93),
+     TOBN(0xfdb261e3, 0xe9ff0ccd), TOBN(0x80344e3c, 0xe07af4c3),
+     TOBN(0x75900d7c, 0x2fa4f126), TOBN(0x08a3b865, 0x5c99a232),
+     TOBN(0x2478b6bf, 0xdb25e0c3), TOBN(0x482cc2c2, 0x71db2edf),
+     TOBN(0x37df7e64, 0x5f321bb8), TOBN(0x8a93821b, 0x9a8005b4),
+     TOBN(0x3fa2f10c, 0xcc8c1958), TOBN(0x0d332218, 0x2c269d0a),
+     TOBN(0x20ab8119, 0xe246b0e6), TOBN(0xb39781e4, 0xd349fd17),
+     TOBN(0xd293231e, 0xb31aa100), TOBN(0x4b779c97, 0xbb032168),
+     TOBN(0x4b3f19e1, 0xc8470500), TOBN(0x45b7efe9, 0x0c4c869d),
+     TOBN(0xdb84f38a, 0xa1a6bbcc), TOBN(0x3b59cb15, 0xb2fddbc1),
+     TOBN(0xba5514df, 0x3fd165e8), TOBN(0x499fd6a9, 0x061f8811),
+     TOBN(0x72cd1fe0, 0xbfef9f00), TOBN(0x120a4bb9, 0x79ad7e8a),
+     TOBN(0xf2ffd095, 0x5f4a5ac5), TOBN(0xcfd174f1, 0x95a7a2f0),
+     TOBN(0xd42301ba, 0x9d17baf1), TOBN(0xd2fa487a, 0x77f22089),
+     TOBN(0x9cb09efe, 0xb1dc77e1), TOBN(0xe9566939, 0x21c99682),
+     TOBN(0x8c546901, 0x6c6067bb), TOBN(0xfd378574, 0x61c24456),
+     TOBN(0x2b6a6cbe, 0x81796b33), TOBN(0x62d550f6, 0x58e87f8b),
+     TOBN(0x1b763e1c, 0x7f1b01b4), TOBN(0x4b93cfea, 0x1b1b5e12),
+     TOBN(0xb9345238, 0x1d531696), TOBN(0x57201c00, 0x88cdde69),
+     TOBN(0xdde92251, 0x9a86afc7), TOBN(0xe3043895, 0xbd35cea8),
+     TOBN(0x7608c1e1, 0x8555970d), TOBN(0x8267dfa9, 0x2535935e),
+     TOBN(0xd4c60a57, 0x322ea38b), TOBN(0xe0bf7977, 0x804ef8b5),
+     TOBN(0x1a0dab28, 0xc06fece4), TOBN(0xd405991e, 0x94e7b49d),
+     TOBN(0xc542b6d2, 0x706dab28), TOBN(0xcb228da3, 0xa91618fb),
+     TOBN(0x224e4164, 0x107d1cea), TOBN(0xeb9fdab3, 0xd0f5d8f1),
+     TOBN(0xc02ba386, 0x0d6e41cd), TOBN(0x6