ec/ecp_nistz256.c: improve ECDSA sign by 30-40%.

author Andy Polyakov <appro@openssl.org>

Sat, 30 Dec 2017 14:08:31 +0000 (15:08 +0100)

committer Andy Polyakov <appro@openssl.org>

Sun, 7 Jan 2018 20:31:37 +0000 (21:31 +0100)
author Andy Polyakov <appro@openssl.org>
Sat, 30 Dec 2017 14:08:31 +0000 (15:08 +0100)
committer Andy Polyakov <appro@openssl.org>
Sun, 7 Jan 2018 20:31:37 +0000 (21:31 +0100)
diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl

index 48d64645f0a445a8da31104e59865d74031bf5e0..85c4131983db15c6a7191611da6f97172da92ed7 100755 (executable)
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -1,15 +1,17 @@
  #! /usr/bin/env perl
  # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
  # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
  #
  # Licensed under the OpenSSL license (the "License").  You may not use
  # this file except in compliance with the License.  You can obtain a copy
  # in the file LICENSE in the source distribution or at
  # https://www.openssl.org/source/license.html
  #
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
  # (1) Intel Corporation, Israel Development Center, Haifa, Israel
  # (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
  #
  # Reference:
  # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
  # Further optimization by <appro@openssl.org>:
  #
  #              this/original   with/without -DECP_NISTZ256_ASM(*)
-# Opteron      +12-49%         +110-150%
-# Bulldozer    +14-45%         +175-210%
-# P4           +18-46%         n/a :-(
-# Westmere     +12-34%         +80-87%
-# Sandy Bridge +9-35%          +110-120%
-# Ivy Bridge   +9-35%          +110-125%
-# Haswell      +8-37%          +140-160%
-# Broadwell    +18-58%         +145-210%
-# Atom         +15-50%         +130-180%
-# VIA Nano     +43-160%        +300-480%
+# Opteron      +15-49%         +150-195%
+# Bulldozer    +18-45%         +175-240%
+# P4           +24-46%         +100-150%
+# Westmere     +18-34%         +87-160%
+# Sandy Bridge +14-35%         +120-185%
+# Ivy Bridge   +11-35%         +125-180%
+# Haswell      +10-37%         +160-200%
+# Broadwell    +24-58%         +210-270%
+# Atom         +20-50%         +180-240%
+# VIA Nano     +50-160%        +480-480%
  #
  # (*)  "without -DECP_NISTZ256_ASM" refers to build with
  #      "enable-ec_nistp_64_gcc_128";
  #
  # Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In "this/original" column lower coefficient is for
+# ECDSA sign, while in "with/without" - for ECDH key agreement, and
+# higher - for ECDSA sign, relatively fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.
  
  $flavour = shift;
  $output  = shift;
@@ -95,6 +99,12 @@ $code.=<<___;
  .long 3,3,3,3,3,3,3,3
  .LONE_mont:
  .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
  ___
  
  {
@@ -481,6 +491,1014 @@ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
  my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
  my ($poly1,$poly3)=($acc6,$acc7);
  
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_ord_mul_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   uint64_t b[4]);
+
+.globl ecp_nistz256_ord_mul_mont
+.type  ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+___
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+       cmp     \$0x80100, %ecx
+       je      .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     8*0($b_org), %rax
+       mov     $b_org, $b_ptr
+       lea     .Lord(%rip), %r14
+       mov     .LordK(%rip), %r15
+
+       ################################# * b[0]
+       mov     %rax, $t0
+       mulq    8*0($a_ptr)
+       mov     %rax, $acc0
+       mov     $t0, %rax
+       mov     %rdx, $acc1
+
+       mulq    8*1($a_ptr)
+       add     %rax, $acc1
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc2
+
+       mulq    8*2($a_ptr)
+       add     %rax, $acc2
+       mov     $t0, %rax
+       adc     \$0, %rdx
+
+        mov    $acc0, $acc5
+        imulq  %r15,$acc0
+
+       mov     %rdx, $acc3
+       mulq    8*3($a_ptr)
+       add     %rax, $acc3
+        mov    $acc0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc4
+
+       ################################# First reduction step
+       mulq    8*0(%r14)
+       mov     $acc0, $t1
+       add     %rax, $acc5             # guaranteed to be zero
+       mov     $acc0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t0
+
+       sub     $acc0, $acc2
+       sbb     \$0, $acc0              # can't borrow
+
+       mulq    8*1(%r14)
+       add     $t0, $acc1
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       mov     $t1, %rax
+       adc     %rdx, $acc2
+       mov     $t1, %rdx
+       adc     \$0, $acc0              # can't overflow
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc3
+        mov    8*1($b_ptr), %rax
+       sbb     %rdx, $t1               # can't borrow
+
+       add     $acc0, $acc3
+       adc     $t1, $acc4
+       adc     \$0, $acc5
+
+       ################################# * b[1]
+       mov     %rax, $t0
+       mulq    8*0($a_ptr)
+       add     %rax, $acc1
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*1($a_ptr)
+       add     $t1, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*2($a_ptr)
+       add     $t1, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $t0, %rax
+       adc     \$0, %rdx
+
+        mov    $acc1, $t0
+        imulq  %r15, $acc1
+
+       mov     %rdx, $t1
+       mulq    8*3($a_ptr)
+       add     $t1, $acc4
+       adc     \$0, %rdx
+       xor     $acc0, $acc0
+       add     %rax, $acc4
+        mov    $acc1, %rax
+       adc     %rdx, $acc5
+       adc     \$0, $acc0
+
+       ################################# Second reduction step
+       mulq    8*0(%r14)
+       mov     $acc1, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     $acc1, %rax
+       adc     %rdx, $t0
+
+       sub     $acc1, $acc3
+       sbb     \$0, $acc1              # can't borrow
+
+       mulq    8*1(%r14)
+       add     $t0, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $t1, %rax
+       adc     %rdx, $acc3
+       mov     $t1, %rdx
+       adc     \$0, $acc1              # can't overflow
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc4
+        mov    8*2($b_ptr), %rax
+       sbb     %rdx, $t1               # can't borrow
+
+       add     $acc1, $acc4
+       adc     $t1, $acc5
+       adc     \$0, $acc0
+
+       ################################## * b[2]
+       mov     %rax, $t0
+       mulq    8*0($a_ptr)
+       add     %rax, $acc2
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*1($a_ptr)
+       add     $t1, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*2($a_ptr)
+       add     $t1, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $t0, %rax
+       adc     \$0, %rdx
+
+        mov    $acc2, $t0
+        imulq  %r15, $acc2
+
+       mov     %rdx, $t1
+       mulq    8*3($a_ptr)
+       add     $t1, $acc5
+       adc     \$0, %rdx
+       xor     $acc1, $acc1
+       add     %rax, $acc5
+        mov    $acc2, %rax
+       adc     %rdx, $acc0
+       adc     \$0, $acc1
+
+       ################################# Third reduction step
+       mulq    8*0(%r14)
+       mov     $acc2, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     $acc2, %rax
+       adc     %rdx, $t0
+
+       sub     $acc2, $acc4
+       sbb     \$0, $acc2              # can't borrow
+
+       mulq    8*1(%r14)
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $t1, %rax
+       adc     %rdx, $acc4
+       mov     $t1, %rdx
+       adc     \$0, $acc2              # can't overflow
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc5
+        mov    8*3($b_ptr), %rax
+       sbb     %rdx, $t1               # can't borrow
+
+       add     $acc2, $acc5
+       adc     $t1, $acc0
+       adc     \$0, $acc1
+
+       ################################# * b[3]
+       mov     %rax, $t0
+       mulq    8*0($a_ptr)
+       add     %rax, $acc3
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*1($a_ptr)
+       add     $t1, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $t0, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mulq    8*2($a_ptr)
+       add     $t1, $acc5
+       adc     \$0, %rdx
+       add     %rax, $acc5
+       mov     $t0, %rax
+       adc     \$0, %rdx
+
+        mov    $acc3, $t0
+        imulq  %r15, $acc3
+
+       mov     %rdx, $t1
+       mulq    8*3($a_ptr)
+       add     $t1, $acc0
+       adc     \$0, %rdx
+       xor     $acc2, $acc2
+       add     %rax, $acc0
+        mov    $acc3, %rax
+       adc     %rdx, $acc1
+       adc     \$0, $acc2
+
+       ################################# Last reduction step
+       mulq    8*0(%r14)
+       mov     $acc3, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     $acc3, %rax
+       adc     %rdx, $t0
+
+       sub     $acc3, $acc5
+       sbb     \$0, $acc3              # can't borrow
+
+       mulq    8*1(%r14)
+       add     $t0, $acc4
+       adc     \$0, %rdx
+       add     %rax, $acc4
+       mov     $t1, %rax
+       adc     %rdx, $acc5
+       mov     $t1, %rdx
+       adc     \$0, $acc3              # can't overflow
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc0
+       sbb     %rdx, $t1               # can't borrow
+
+       add     $acc3, $acc0
+       adc     $t1, $acc1
+       adc     \$0, $acc2
+
+       ################################# Subtract ord
+        mov    $acc4, $a_ptr
+       sub     8*0(%r14), $acc4
+        mov    $acc5, $acc3
+       sbb     8*1(%r14), $acc5
+        mov    $acc0, $t0
+       sbb     8*2(%r14), $acc0
+        mov    $acc1, $t1
+       sbb     8*3(%r14), $acc1
+       sbb     \$0, $acc2
+
+       cmovc   $a_ptr, $acc4
+       cmovc   $acc3, $acc5
+       cmovc   $t0, $acc0
+       cmovc   $t1, $acc1
+
+       mov     $acc4, 8*0($r_ptr)
+       mov     $acc5, 8*1($r_ptr)
+       mov     $acc0, 8*2($r_ptr)
+       mov     $acc1, 8*3($r_ptr)
+
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   int rep);
+
+.globl ecp_nistz256_ord_sqr_mont
+.type  ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+___
+$code.=<<___   if ($addx);
+       mov     \$0x80100, %ecx
+       and     OPENSSL_ia32cap_P+8(%rip), %ecx
+       cmp     \$0x80100, %ecx
+       je      .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     8*0($a_ptr), $acc0
+       mov     8*1($a_ptr), %rax
+       mov     8*2($a_ptr), $acc6
+       mov     8*3($a_ptr), $acc7
+       lea     .Lord(%rip), $a_ptr     # pointer to modulus
+       mov     $b_org, $b_ptr
+       jmp     .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+       ################################# a[1:] * a[0]
+       mov     %rax, $t1               # put aside a[1]
+       mul     $acc0                   # a[1] * a[0]
+       mov     %rax, $acc1
+       movq    $t1, %xmm1              # offload a[1]
+       mov     $acc6, %rax
+       mov     %rdx, $acc2
+
+       mul     $acc0                   # a[2] * a[0]
+       add     %rax, $acc2
+       mov     $acc7, %rax
+       movq    $acc6, %xmm2            # offload a[2]
+       adc     \$0, %rdx
+       mov     %rdx, $acc3
+
+       mul     $acc0                   # a[3] * a[0]
+       add     %rax, $acc3
+       mov     $acc7, %rax
+       movq    $acc7, %xmm3            # offload a[3]
+       adc     \$0, %rdx
+       mov     %rdx, $acc4
+
+       ################################# a[3] * a[2]
+       mul     $acc6                   # a[3] * a[2]
+       mov     %rax, $acc5
+       mov     $acc6, %rax
+       mov     %rdx, $acc6
+
+       ################################# a[2:] * a[1]
+       mul     $t1                     # a[2] * a[1]
+       add     %rax, $acc3
+       mov     $acc7, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $acc7
+
+       mul     $t1                     # a[3] * a[1]
+       add     %rax, $acc4
+       adc     \$0, %rdx
+
+       add     $acc7, $acc4
+       adc     %rdx, $acc5
+       adc     \$0, $acc6              # can't overflow
+
+       ################################# *2
+       xor     $acc7, $acc7
+       mov     $acc0, %rax
+       add     $acc1, $acc1
+       adc     $acc2, $acc2
+       adc     $acc3, $acc3
+       adc     $acc4, $acc4
+       adc     $acc5, $acc5
+       adc     $acc6, $acc6
+       adc     \$0, $acc7
+
+       ################################# Missing products
+       mul     %rax                    # a[0] * a[0]
+       mov     %rax, $acc0
+       movq    %xmm1, %rax
+       mov     %rdx, $t1
+
+       mul     %rax                    # a[1] * a[1]
+       add     $t1, $acc1
+       adc     %rax, $acc2
+       movq    %xmm2, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+       mul     %rax                    # a[2] * a[2]
+       add     $t1, $acc3
+       adc     %rax, $acc4
+       movq    %xmm3, %rax
+       adc     \$0, %rdx
+       mov     %rdx, $t1
+
+        mov    $acc0, $t0
+        imulq  8*4($a_ptr), $acc0      # *= .LordK
+
+       mul     %rax                    # a[3] * a[3]
+       add     $t1, $acc5
+       adc     %rax, $acc6
+        mov    8*0($a_ptr), %rax       # modulus[0]
+       adc     %rdx, $acc7             # can't overflow
+
+       ################################# First reduction step
+       mul     $acc0
+       mov     $acc0, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     8*1($a_ptr), %rax       # modulus[1]
+       adc     %rdx, $t0
+
+       sub     $acc0, $acc2
+       sbb     \$0, $t1                # can't borrow
+
+       mul     $acc0
+       add     $t0, $acc1
+       adc     \$0, %rdx
+       add     %rax, $acc1
+       mov     $acc0, %rax
+       adc     %rdx, $acc2
+       mov     $acc0, %rdx
+       adc     \$0, $t1                # can't overflow
+
+        mov    $acc1, $t0
+        imulq  8*4($a_ptr), $acc1      # *= .LordK
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc3
+        mov    8*0($a_ptr), %rax
+       sbb     %rdx, $acc0             # can't borrow
+
+       add     $t1, $acc3
+       adc     \$0, $acc0              # can't overflow
+
+       ################################# Second reduction step
+       mul     $acc1
+       mov     $acc1, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     8*1($a_ptr), %rax
+       adc     %rdx, $t0
+
+       sub     $acc1, $acc3
+       sbb     \$0, $t1                # can't borrow
+
+       mul     $acc1
+       add     $t0, $acc2
+       adc     \$0, %rdx
+       add     %rax, $acc2
+       mov     $acc1, %rax
+       adc     %rdx, $acc3
+       mov     $acc1, %rdx
+       adc     \$0, $t1                # can't overflow
+
+        mov    $acc2, $t0
+        imulq  8*4($a_ptr), $acc2      # *= .LordK
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc0
+        mov    8*0($a_ptr), %rax
+       sbb     %rdx, $acc1             # can't borrow
+
+       add     $t1, $acc0
+       adc     \$0, $acc1              # can't overflow
+
+       ################################# Third reduction step
+       mul     $acc2
+       mov     $acc2, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     8*1($a_ptr), %rax
+       adc     %rdx, $t0
+
+       sub     $acc2, $acc0
+       sbb     \$0, $t1                # can't borrow
+
+       mul     $acc2
+       add     $t0, $acc3
+       adc     \$0, %rdx
+       add     %rax, $acc3
+       mov     $acc2, %rax
+       adc     %rdx, $acc0
+       mov     $acc2, %rdx
+       adc     \$0, $t1                # can't overflow
+
+        mov    $acc3, $t0
+        imulq  8*4($a_ptr), $acc3      # *= .LordK
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc1
+        mov    8*0($a_ptr), %rax
+       sbb     %rdx, $acc2             # can't borrow
+
+       add     $t1, $acc1
+       adc     \$0, $acc2              # can't overflow
+
+       ################################# Last reduction step
+       mul     $acc3
+       mov     $acc3, $t1
+       add     %rax, $t0               # guaranteed to be zero
+       mov     8*1($a_ptr), %rax
+       adc     %rdx, $t0
+
+       sub     $acc3, $acc1
+       sbb     \$0, $t1                # can't borrow
+
+       mul     $acc3
+       add     $t0, $acc0
+       adc     \$0, %rdx
+       add     %rax, $acc0
+       mov     $acc3, %rax
+       adc     %rdx, $acc1
+       mov     $acc3, %rdx
+       adc     \$0, $t1                # can't overflow
+
+       shl     \$32, %rax
+       shr     \$32, %rdx
+       sub     %rax, $acc2
+       sbb     %rdx, $acc3             # can't borrow
+
+       add     $t1, $acc2
+       adc     \$0, $acc3              # can't overflow
+
+       ################################# Add bits [511:256] of the sqr result
+       xor     %rdx, %rdx
+       add     $acc4, $acc0
+       adc     $acc5, $acc1
+        mov    $acc0, $acc4
+       adc     $acc6, $acc2
+       adc     $acc7, $acc3
+        mov    $acc1, %rax
+       adc     \$0, %rdx
+
+       ################################# Compare to modulus
+       sub     8*0($a_ptr), $acc0
+        mov    $acc2, $acc6
+       sbb     8*1($a_ptr), $acc1
+       sbb     8*2($a_ptr), $acc2
+        mov    $acc3, $acc7
+       sbb     8*3($a_ptr), $acc3
+       sbb     \$0, %rdx
+
+       cmovc   $acc4, $acc0
+       cmovnc  $acc1, %rax
+       cmovnc  $acc2, $acc6
+       cmovnc  $acc3, $acc7
+
+       dec     $b_ptr
+       jnz     .Loop_ord_sqr
+
+       mov     $acc0, 8*0($r_ptr)
+       mov     %rax,  8*1($r_ptr)
+       pxor    %xmm1, %xmm1
+       mov     $acc6, 8*2($r_ptr)
+       pxor    %xmm2, %xmm2
+       mov     $acc7, 8*3($r_ptr)
+       pxor    %xmm3, %xmm3
+
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
+$code.=<<___   if ($addx);
+################################################################################
+.type  ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.Lecp_nistz256_ord_mul_montx:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     $b_org, $b_ptr
+       mov     8*0($b_org), %rdx
+       mov     8*0($a_ptr), $acc1
+       mov     8*1($a_ptr), $acc2
+       mov     8*2($a_ptr), $acc3
+       mov     8*3($a_ptr), $acc4
+       lea     -128($a_ptr), $a_ptr    # control u-op density
+       lea     .Lord-128(%rip), %r14
+       mov     .LordK(%rip), %r15
+
+       ################################# Multiply by b[0]
+       mulx    $acc1, $acc0, $acc1
+       mulx    $acc2, $t0, $acc2
+       mulx    $acc3, $t1, $acc3
+       add     $t0, $acc1
+       mulx    $acc4, $t0, $acc4
+        mov    $acc0, %rdx
+        mulx   %r15, %rdx, %rax
+       adc     $t1, $acc2
+       adc     $t0, $acc3
+       adc     \$0, $acc4
+
+       ################################# reduction
+       xor     $acc5, $acc5            # $acc5=0, cf=0, of=0
+       mulx    8*0+128(%r14), $t0, $t1
+       adcx    $t0, $acc0              # guaranteed to be zero
+       adox    $t1, $acc1
+
+       mulx    8*1+128(%r14), $t0, $t1
+       adcx    $t0, $acc1
+       adox    $t1, $acc2
+
+       mulx    8*2+128(%r14), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*3+128(%r14), $t0, $t1
+        mov    8*1($b_ptr), %rdx
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+       adcx    $acc0, $acc4
+       adox    $acc0, $acc5
+       adc     \$0, $acc5              # cf=0, of=0
+
+       ################################# Multiply by b[1]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc1
+       adox    $t1, $acc2
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc1, %rdx
+        mulx   %r15, %rdx, %rax
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       adcx    $acc0, $acc5
+       adox    $acc0, $acc0
+       adc     \$0, $acc0              # cf=0, of=0
+
+       ################################# reduction
+       mulx    8*0+128(%r14), $t0, $t1
+       adcx    $t0, $acc1              # guaranteed to be zero
+       adox    $t1, $acc2
+
+       mulx    8*1+128(%r14), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*2+128(%r14), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*3+128(%r14), $t0, $t1
+        mov    8*2($b_ptr), %rdx
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+       adcx    $acc1, $acc5
+       adox    $acc1, $acc0
+       adc     \$0, $acc0              # cf=0, of=0
+
+       ################################# Multiply by b[2]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc2, %rdx
+        mulx   %r15, %rdx, %rax
+       adcx    $t0, $acc5
+       adox    $t1, $acc0
+
+       adcx    $acc1, $acc0
+       adox    $acc1, $acc1
+       adc     \$0, $acc1              # cf=0, of=0
+
+       ################################# reduction
+       mulx    8*0+128(%r14), $t0, $t1
+       adcx    $t0, $acc2              # guaranteed to be zero
+       adox    $t1, $acc3
+
+       mulx    8*1+128(%r14), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*2+128(%r14), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*3+128(%r14), $t0, $t1
+        mov    8*3($b_ptr), %rdx
+       adcx    $t0, $acc5
+       adox    $t1, $acc0
+       adcx    $acc2, $acc0
+       adox    $acc2, $acc1
+       adc     \$0, $acc1              # cf=0, of=0
+
+       ################################# Multiply by b[3]
+       mulx    8*0+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    8*1+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*2+128($a_ptr), $t0, $t1
+       adcx    $t0, $acc5
+       adox    $t1, $acc0
+
+       mulx    8*3+128($a_ptr), $t0, $t1
+        mov    $acc3, %rdx
+        mulx   %r15, %rdx, %rax
+       adcx    $t0, $acc0
+       adox    $t1, $acc1
+
+       adcx    $acc2, $acc1
+       adox    $acc2, $acc2
+       adc     \$0, $acc2              # cf=0, of=0
+
+       ################################# reduction
+       mulx    8*0+128(%r14), $t0, $t1
+       adcx    $t0, $acc3              # guranteed to be zero
+       adox    $t1, $acc4
+
+       mulx    8*1+128(%r14), $t0, $t1
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+
+       mulx    8*2+128(%r14), $t0, $t1
+       adcx    $t0, $acc5
+       adox    $t1, $acc0
+
+       mulx    8*3+128(%r14), $t0, $t1
+       lea     128(%r14),%r14
+        mov    $acc4, $t2
+       adcx    $t0, $acc0
+       adox    $t1, $acc1
+        mov    $acc5, $t3
+       adcx    $acc3, $acc1
+       adox    $acc3, $acc2
+       adc     \$0, $acc2
+
+       #################################
+       # Branch-less conditional subtraction of P
+        mov    $acc0, $t0
+       sub     8*0(%r14), $acc4
+       sbb     8*1(%r14), $acc5
+       sbb     8*2(%r14), $acc0
+        mov    $acc1, $t1
+       sbb     8*3(%r14), $acc1
+       sbb     \$0, $acc2
+
+       cmovc   $t2, $acc4
+       cmovc   $t3, $acc5
+       cmovc   $t0, $acc0
+       cmovc   $t1, $acc1
+
+       mov     $acc4, 8*0($r_ptr)
+       mov     $acc5, 8*1($r_ptr)
+       mov     $acc0, 8*2($r_ptr)
+       mov     $acc1, 8*3($r_ptr)
+
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+.size  ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type  ecp_nistz256_ord_sqr_montx,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.Lecp_nistz256_ord_sqr_montx:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     $b_org, $b_ptr
+       mov     8*0($a_ptr), %rdx
+       mov     8*1($a_ptr), $acc6
+       mov     8*2($a_ptr), $acc7
+       mov     8*3($a_ptr), $acc0
+       lea     .Lord(%rip), $a_ptr
+       jmp     .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+       mulx    $acc6, $acc1, $acc2     # a[0]*a[1]
+       mulx    $acc7, $t0, $acc3       # a[0]*a[2]
+        mov    %rdx, %rax              # offload a[0]
+        movq   $acc6, %xmm1            # offload a[1]
+       mulx    $acc0, $t1, $acc4       # a[0]*a[3]
+        mov    $acc6, %rdx
+       add     $t0, $acc2
+        movq   $acc7, %xmm2            # offload a[2]
+       adc     $t1, $acc3
+       adc     \$0, $acc4
+       xor     $acc5, $acc5            # $acc5=0,cf=0,of=0
+       #################################
+       mulx    $acc7, $t0, $t1         # a[1]*a[2]
+       adcx    $t0, $acc3
+       adox    $t1, $acc4
+
+       mulx    $acc0, $t0, $t1         # a[1]*a[3]
+        mov    $acc7, %rdx
+       adcx    $t0, $acc4
+       adox    $t1, $acc5
+       adc     \$0, $acc5
+       #################################
+       mulx    $acc0, $t0, $acc6       # a[2]*a[3]
+       mov     %rax, %rdx
+        movq   $acc0, %xmm3            # offload a[3]
+       xor     $acc7, $acc7            # $acc7=0,cf=0,of=0
+        adcx   $acc1, $acc1            # acc1:6<<1
+       adox    $t0, $acc5
+        adcx   $acc2, $acc2
+       adox    $acc7, $acc6            # of=0
+
+       ################################# a[i]*a[i]
+       mulx    %rdx, $acc0, $t1
+       movq    %xmm1, %rdx
+        adcx   $acc3, $acc3
+       adox    $t1, $acc1
+        adcx   $acc4, $acc4
+       mulx    %rdx, $t0, $t4
+       movq    %xmm2, %rdx
+        adcx   $acc5, $acc5
+       adox    $t0, $acc2
+        adcx   $acc6, $acc6
+       mulx    %rdx, $t0, $t1
+       .byte   0x67
+       movq    %xmm3, %rdx
+       adox    $t4, $acc3
+        adcx   $acc7, $acc7
+       adox    $t0, $acc4
+       adox    $t1, $acc5
+       mulx    %rdx, $t0, $t4
+       adox    $t0, $acc6
+       adox    $t4, $acc7
+
+       ################################# reduction
+       mov     $acc0, %rdx
+       mulx    8*4($a_ptr), %rdx, $t0
+
+       xor     %rax, %rax              # cf=0, of=0
+       mulx    8*0($a_ptr), $t0, $t1
+       adcx    $t0, $acc0              # guaranteed to be zero
+       adox    $t1, $acc1
+       mulx    8*1($a_ptr), $t0, $t1
+       adcx    $t0, $acc1
+       adox    $t1, $acc2
+       mulx    8*2($a_ptr), $t0, $t1
+       adcx    $t0, $acc2
+       adox    $t1, $acc3
+       mulx    8*3($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc0              # of=0
+       adcx    %rax, $acc0             # cf=0
+
+       #################################
+       mov     $acc1, %rdx
+       mulx    8*4($a_ptr), %rdx, $t0
+
+       mulx    8*0($a_ptr), $t0, $t1
+       adox    $t0, $acc1              # guaranteed to be zero
+       adcx    $t1, $acc2
+       mulx    8*1($a_ptr), $t0, $t1
+       adox    $t0, $acc2
+       adcx    $t1, $acc3
+       mulx    8*2($a_ptr), $t0, $t1
+       adox    $t0, $acc3
+       adcx    $t1, $acc0
+       mulx    8*3($a_ptr), $t0, $t1
+       adox    $t0, $acc0
+       adcx    $t1, $acc1              # cf=0
+       adox    %rax, $acc1             # of=0
+
+       #################################
+       mov     $acc2, %rdx
+       mulx    8*4($a_ptr), %rdx, $t0
+
+       mulx    8*0($a_ptr), $t0, $t1
+       adcx    $t0, $acc2              # guaranteed to be zero
+       adox    $t1, $acc3
+       mulx    8*1($a_ptr), $t0, $t1
+       adcx    $t0, $acc3
+       adox    $t1, $acc0
+       mulx    8*2($a_ptr), $t0, $t1
+       adcx    $t0, $acc0
+       adox    $t1, $acc1
+       mulx    8*3($a_ptr), $t0, $t1
+       adcx    $t0, $acc1
+       adox    $t1, $acc2              # of=0
+       adcx    %rax, $acc2             # cf=0
+
+       #################################
+       mov     $acc3, %rdx
+       mulx    8*4($a_ptr), %rdx, $t0
+
+       mulx    8*0($a_ptr), $t0, $t1
+       adox    $t0, $acc3              # guaranteed to be zero
+       adcx    $t1, $acc0
+       mulx    8*1($a_ptr), $t0, $t1
+       adox    $t0, $acc0
+       adcx    $t1, $acc1
+       mulx    8*2($a_ptr), $t0, $t1
+       adox    $t0, $acc1
+       adcx    $t1, $acc2
+       mulx    8*3($a_ptr), $t0, $t1
+       adox    $t0, $acc2
+       adcx    $t1, $acc3
+       adox    %rax, $acc3
+
+       ################################# accumulate upper half
+       add     $acc0, $acc4            # add   $acc4, $acc0
+       adc     $acc5, $acc1
+        mov    $acc4, %rdx
+       adc     $acc6, $acc2
+       adc     $acc7, $acc3
+        mov    $acc1, $acc6
+       adc     \$0, %rax
+
+       ################################# compare to modulus
+       sub     8*0($a_ptr), $acc4
+        mov    $acc2, $acc7
+       sbb     8*1($a_ptr), $acc1
+       sbb     8*2($a_ptr), $acc2
+        mov    $acc3, $acc0
+       sbb     8*3($a_ptr), $acc3
+       sbb     \$0, %rax
+
+       cmovnc  $acc4, %rdx
+       cmovnc  $acc1, $acc6
+       cmovnc  $acc2, $acc7
+       cmovnc  $acc3, $acc0
+
+       dec     $b_ptr
+       jnz     .Loop_ord_sqrx
+
+       mov     %rdx, 8*0($r_ptr)
+       mov     $acc6, 8*1($r_ptr)
+       pxor    %xmm1, %xmm1
+       mov     $acc7, 8*2($r_ptr)
+       pxor    %xmm2, %xmm2
+       mov     $acc0, 8*3($r_ptr)
+       pxor    %xmm3, %xmm3
+
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbx
+       pop     %rbp
+       ret
+
+.size  ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+
  $code.=<<___;
  ################################################################################
  # void ecp_nistz256_to_mont(
diff --git a/crypto/ec/ec_err.c b/crypto/ec/ec_err.c

index 9f82b4ef7e8b509bcae29372f211698a7540f8bf..efec5a7112b160502ea9b2a010c00082739cdb5a 100644 (file)
--- a/crypto/ec/ec_err.c
+++ b/crypto/ec/ec_err.c
@@ -48,6 +48,8 @@ static const ERR_STRING_DATA EC_str_functs[] = {
       "ECPKParameters_print_fp"},
      {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_GET_AFFINE, 0),
       "ecp_nistz256_get_affine"},
+    {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_INV_MOD_ORD, 0),
+     "ecp_nistz256_inv_mod_ord"},
      {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 0),
       "ecp_nistz256_mult_precompute"},
      {ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_POINTS_MUL, 0),
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h

index 6cc0190aa7f8dba950280a187a151cc428cadef1..540aa534e7ea7e891a4df0fbf8c0292ecf30a016 100644 (file)
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -155,6 +155,9 @@ struct ec_method_st {
      /* custom ECDH operation */
      int (*ecdh_compute_key)(unsigned char **pout, size_t *poutlen,
                              const EC_POINT *pub_key, const EC_KEY *ecdh);
+    /* Inverse modulo order */
+    int (*field_inverse_mod_ord)(const EC_GROUP *, BIGNUM *r, BIGNUM *x,
+                                 BN_CTX *ctx);
  };
  
  /*
@@ -520,7 +523,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
  void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
                                       unsigned char *digit, unsigned char in);
  #endif
-int ec_precompute_mont_data(EC_GROUP *);
  int ec_group_simple_order_bits(const EC_GROUP *group);
  
  #ifdef ECP_NISTZ256_ASM
@@ -604,3 +606,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
             const uint8_t peer_public_value[32]);
  void X25519_public_from_private(uint8_t out_public_value[32],
                                  const uint8_t private_key[32]);
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+                            BIGNUM *x, BN_CTX *ctx);
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c

index 7ae48cf29179fd1df75ab706440c28380d532a3f..8d508ddb6a92d0acfe22662daa71d980c0a0aec2 100644 (file)
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -261,6 +261,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
      return meth->field_type;
  }
  
+static int ec_precompute_mont_data(EC_GROUP *);
+
  int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
                             const BIGNUM *order, const BIGNUM *cofactor)
  {
@@ -961,7 +963,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
   * ec_precompute_mont_data sets |group->mont_data| from |group->order| and
   * returns one on success. On error it returns zero.
   */
-int ec_precompute_mont_data(EC_GROUP *group)
+static int ec_precompute_mont_data(EC_GROUP *group)
  {
      BN_CTX *ctx = BN_CTX_new();
      int ret = 0;
@@ -1006,3 +1008,12 @@ int ec_group_simple_order_bits(const EC_GROUP *group)
          return 0;
      return BN_num_bits(group->order);
  }
+
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
+                            BIGNUM *x, BN_CTX *ctx)
+{
+    if (group->meth->field_inverse_mod_ord != NULL)
+        return group->meth->field_inverse_mod_ord(group, res, x, ctx);
+    else
+        return 0;
+}
diff --git a/crypto/ec/ecdsa_ossl.c b/crypto/ec/ecdsa_ossl.c

index 30458f1402addc2cc856e7f4a3310c4003aae0b7..a405d38b69eaea81aae080c234f7bedc1a46f70b 100644 (file)
--- a/crypto/ec/ecdsa_ossl.c
+++ b/crypto/ec/ecdsa_ossl.c
@@ -153,30 +153,33 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
      }
      while (BN_is_zero(r));
  
-    /* compute the inverse of k */
-    if (EC_GROUP_get_mont_data(group) != NULL) {
-        /*
-         * We want inverse in constant time, therefore we utilize the fact
-         * order must be prime and use Fermat's Little Theorem instead.
-         */
-        if (!BN_set_word(X, 2)) {
-            ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-        if (!BN_mod_sub(X, order, X, order, ctx)) {
-            ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-        BN_set_flags(X, BN_FLG_CONSTTIME);
-        if (!BN_mod_exp_mont_consttime
-            (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
-            ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-    } else {
-        if (!BN_mod_inverse(k, k, order, ctx)) {
-            ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
+    /* Check if optimized inverse is implemented */
+    if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
+        /* compute the inverse of k */
+        if (group->mont_data != NULL) {
+            /*
+             * We want inverse in constant time, therefore we utilize the fact
+             * order must be prime and use Fermats Little Theorem instead.
+             */
+            if (!BN_set_word(X, 2)) {
+                ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+            if (!BN_mod_sub(X, order, X, order, ctx)) {
+                ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+            BN_set_flags(X, BN_FLG_CONSTTIME);
+            if (!BN_mod_exp_mont_consttime(k, k, X, order, ctx,
+                                           group->mont_data)) {
+                ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+        } else {
+            if (!BN_mod_inverse(k, k, order, ctx)) {
+                ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
          }
      }
  
@@ -407,9 +410,12 @@ int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
          goto err;
      }
      /* calculate tmp1 = inv(S) mod order */
-    if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
-        ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
-        goto err;
+    /* Check if optimized inverse is implemented */
+    if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
+        if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
+            ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
+            goto err;
+        }
      }
      /* digest -> m */
      i = BN_num_bits(order);
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c

index 3863a61b02027c56d67b1e49d50b6839b63f79ed..6c85884fee4210cde39dfd1af8220fd9d10e882e 100644 (file)
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -1,15 +1,17 @@
  /*
   * Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
   * Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+ * Copyright (c) 2015, CloudFlare, Inc.
   *
   * Licensed under the OpenSSL license (the "License").  You may not use
   * this file except in compliance with the License.  You can obtain a copy
   * in the file LICENSE in the source distribution or at
   * https://www.openssl.org/source/license.html
   *
- * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
   * (1) Intel Corporation, Israel Development Center, Haifa, Israel
   * (2) University of Haifa, Israel
+ * (3) CloudFlare, Inc.
   *
   * Reference:
   * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -908,7 +910,7 @@ __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
   */
  #if defined(ECP_NISTZ256_AVX2)
  # if !(defined(__x86_64) || defined(__x86_64__) || \
-       defined(_M_AMD64) || defined(_MX64)) || \
+       defined(_M_AMD64) || defined(_M_X64)) || \
       !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
  #  undef ECP_NISTZ256_AVX2
  # else
@@ -1495,6 +1497,117 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
      return HAVEPRECOMP(group, nistz256);
  }
  
+#if defined(__x86_64) || defined(__x86_64__) || \
+    defined(_M_AMD64) || defined(_M_X64) || \
+    defined(__powerpc64__) || defined(_ARCH_PP64)
+/*
+ * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
+ */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS],
+                               const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS],
+                               int rep);
+
+static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
+                                    BIGNUM *x, BN_CTX *ctx)
+{
+    /* RR = 2^512 mod ord(p256) */
+    static const BN_ULONG RR[P256_LIMBS]  = { TOBN(0x83244c95,0xbe79eea2),
+                                              TOBN(0x4699799c,0x49bd6fa6),
+                                              TOBN(0x2845b239,0x2b6bec59),
+                                              TOBN(0x66e12d94,0xf3d95620) };
+    /* The constant 1 (unlike ONE that is one in Montgomery representation) */
+    static const BN_ULONG one[P256_LIMBS] = { TOBN(0,1),TOBN(0,0),
+                                              TOBN(0,0),TOBN(0,0) };
+    /* expLo - the low 128bit of the exponent we use (ord(p256) - 2),
+     * split into 4bit windows */
+    static const unsigned char expLo[32]  = { 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,
+                                              0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
+                                              0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,
+                                              0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf };
+    /*
+     * We don't use entry 0 in the table, so we omit it and address
+     * with -1 offset.
+     */
+    BN_ULONG table[15][P256_LIMBS];
+    BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
+    int i, ret = 0;
+
+    /*
+     * Catch allocation failure early.
+     */
+    if (bn_wexpand(r, P256_LIMBS) == NULL) {
+        ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+        goto err;
+    }
+
+    if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
+        BIGNUM *tmp;
+
+        if ((tmp = BN_CTX_get(ctx)) == NULL
+            || !BN_nnmod(tmp, x, group->order, ctx)) {
+            ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+            goto err;
+        }
+        x = tmp;
+    }
+
+    if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
+        ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
+        goto err;
+    }
+
+    ecp_nistz256_ord_mul_mont(table[0], t, RR);
+    for (i = 2; i < 16; i += 2) {
+        ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
+        ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
+    }
+
+    /*
+     * The top 128bit of the exponent are highly redudndant, so we
+     * perform an optimized flow
+     */
+    ecp_nistz256_ord_sqr_mont(t, table[15-1], 4);   /* f0 */
+    ecp_nistz256_ord_mul_mont(t, t, table[15-1]);   /* ff */
+
+    ecp_nistz256_ord_sqr_mont(out, t, 8);           /* ff00 */
+    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffff */
+
+    ecp_nistz256_ord_sqr_mont(t, out, 16);          /* ffff0000 */
+    ecp_nistz256_ord_mul_mont(t, t, out);           /* ffffffff */
+
+    ecp_nistz256_ord_sqr_mont(out, t, 64);          /* ffffffff0000000000000000 */
+    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffffffff00000000ffffffff */
+
+    ecp_nistz256_ord_sqr_mont(out, out, 32);        /* ffffffff00000000ffffffff00000000 */
+    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffffffff00000000ffffffffffffffff */
+
+    /*
+     * The bottom 128 bit of the exponent are easier done with a table
+     */
+    for(i = 0; i < 32; i++) {
+        ecp_nistz256_ord_sqr_mont(out, out, 4);
+        /* The exponent is public, no need in constant-time access */
+        ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
+    }
+    ecp_nistz256_ord_mul_mont(out, out, one);
+
+    /*
+     * Can't fail, but check return code to be consistent anyway.
+     */
+    if (!bn_set_words(r, out, P256_LIMBS))
+        goto err;
+
+    ret = 1;
+err:
+    return ret;
+}
+#else
+# define ecp_nistz256_inv_mod_ord NULL
+#endif
+
  const EC_METHOD *EC_GFp_nistz256_method(void)
  {
      static const EC_METHOD ret = {
@@ -1544,7 +1657,8 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
          ec_key_simple_generate_public_key,
          0, /* keycopy */
          0, /* keyfinish */
-        ecdh_simple_compute_key
+        ecdh_simple_compute_key,
+        ecp_nistz256_inv_mod_ord                    /* can be #defined-ed NULL */
      };
  
      return &ret;
diff --git a/crypto/err/openssl.txt b/crypto/err/openssl.txt

index 9ec000964a01b5d26fed325956b94fec2ce6f62b..64496627bbdfbe6b1f6377d21b28b12fac947173 100644 (file)
--- a/crypto/err/openssl.txt
+++ b/crypto/err/openssl.txt
@@ -458,6 +458,7 @@ EC_F_ECPARAMETERS_PRINT_FP:148:ECParameters_print_fp
  EC_F_ECPKPARAMETERS_PRINT:149:ECPKParameters_print
  EC_F_ECPKPARAMETERS_PRINT_FP:150:ECPKParameters_print_fp
  EC_F_ECP_NISTZ256_GET_AFFINE:240:ecp_nistz256_get_affine
+EC_F_ECP_NISTZ256_INV_MOD_ORD:275:ecp_nistz256_inv_mod_ord
  EC_F_ECP_NISTZ256_MULT_PRECOMPUTE:243:ecp_nistz256_mult_precompute
  EC_F_ECP_NISTZ256_POINTS_MUL:241:ecp_nistz256_points_mul
  EC_F_ECP_NISTZ256_PRE_COMP_NEW:244:ecp_nistz256_pre_comp_new
diff --git a/include/openssl/ecerr.h b/include/openssl/ecerr.h

index bd09cb7f1b99bf83c3d8d1538452e9bf97cdd38f..a1b9ea180e0d50fc069e70f2c266eeaaaf08f989 100644 (file)
--- a/include/openssl/ecerr.h
+++ b/include/openssl/ecerr.h
@@ -50,6 +50,7 @@ int ERR_load_EC_strings(void);
  # define EC_F_ECPKPARAMETERS_PRINT                        149
  # define EC_F_ECPKPARAMETERS_PRINT_FP                     150
  # define EC_F_ECP_NISTZ256_GET_AFFINE                     240
+# define EC_F_ECP_NISTZ256_INV_MOD_ORD                    275
  # define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE                243
  # define EC_F_ECP_NISTZ256_POINTS_MUL                     241
  # define EC_F_ECP_NISTZ256_PRE_COMP_NEW                   244
author	Andy Polyakov <appro@openssl.org>
	Sat, 30 Dec 2017 14:08:31 +0000 (15:08 +0100)
committer	Andy Polyakov <appro@openssl.org>
	Sun, 7 Jan 2018 20:31:37 +0000 (21:31 +0100)
crypto/ec/asm/ecp_nistz256-x86_64.pl		patch \| blob \| history
crypto/ec/ec_err.c		patch \| blob \| history
crypto/ec/ec_lcl.h		patch \| blob \| history
crypto/ec/ec_lib.c		patch \| blob \| history
crypto/ec/ecdsa_ossl.c		patch \| blob \| history
crypto/ec/ecp_nistz256.c		patch \| blob \| history
crypto/err/openssl.txt		patch \| blob \| history
include/openssl/ecerr.h		patch \| blob \| history