ec/ecp_nistz256: harmonize is_infinity with ec_GFp_simple_is_at_infinity.
[openssl.git] / crypto / ec / asm / ecp_nistz256-x86.pl
index b0daf151a5bcfcf9a012bb8688243b4218b81aac..1d9e00616b58d0a17f0f6c26c79605aae5cf5278 100755 (executable)
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -35,6 +42,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
+$output=pop;
+open STDOUT,">$output";
+
 &asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386");
 
 $sse2=0;
@@ -274,18 +284,41 @@ for(1..37) {
        &mov    (&DWP(16,"edi"),"eax");
        &adc    ("ecx",&DWP(24,"ebp"));
        &mov    (&DWP(20,"edi"),"ebx");
+       &mov    ("esi",0);
        &adc    ("edx",&DWP(28,"ebp"));
        &mov    (&DWP(24,"edi"),"ecx");
-       &sbb    ("esi","esi");                  # broadcast carry bit
+       &adc    ("esi",0);
        &mov    (&DWP(28,"edi"),"edx");
 
-       # if a+b carries, subtract modulus.
+       # if a+b >= modulus, subtract modulus.
        #
+       # But since comparison implies subtraction, we subtract modulus
+       # to see if it borrows, and then subtract it for real if
+       # subtraction didn't borrow.
+
+       &mov    ("eax",&DWP(0,"edi"));
+       &mov    ("ebx",&DWP(4,"edi"));
+       &mov    ("ecx",&DWP(8,"edi"));
+       &sub    ("eax",-1);
+       &mov    ("edx",&DWP(12,"edi"));
+       &sbb    ("ebx",-1);
+       &mov    ("eax",&DWP(16,"edi"));
+       &sbb    ("ecx",-1);
+       &mov    ("ebx",&DWP(20,"edi"));
+       &sbb    ("edx",0);
+       &mov    ("ecx",&DWP(24,"edi"));
+       &sbb    ("eax",0);
+       &mov    ("edx",&DWP(28,"edi"));
+       &sbb    ("ebx",0);
+       &sbb    ("ecx",1);
+       &sbb    ("edx",-1);
+       &sbb    ("esi",0);
+
        # Note that because mod has special form, i.e. consists of
        # 0xffffffff, 1 and 0s, we can conditionally synthesize it by
-       # assigning carry bit to one register, %ebp, and its negative
-       # to another, %esi. But we started by calculating %esi...
+       # using borrow.
 
+       &not    ("esi");
        &mov    ("eax",&DWP(0,"edi"));
        &mov    ("ebp","esi");
        &mov    ("ebx",&DWP(4,"edi"));
@@ -1197,6 +1230,7 @@ for ($i=0;$i<7;$i++) {
 ########################################################################
 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
 #
+&static_label("point_double_shortcut");
 &function_begin("ecp_nistz256_point_double");
 {   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
 
@@ -1204,7 +1238,7 @@ for ($i=0;$i<7;$i++) {
 
        # above map() describes stack layout with 5 temporary
        # 256-bit vectors on top, then we take extra word for
-       # OPENSS_ia32cap_P copy.
+       # OPENSSL_ia32cap_P copy.
        &stack_push(8*5+1);
                                                if ($sse2) {
        &call   ("_picup_eax");
@@ -1212,6 +1246,7 @@ for ($i=0;$i<7;$i++) {
        &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
        &mov    ("ebp",&DWP(0,"edx"));          }
 
+&set_label("point_double_shortcut");
        &mov    ("eax",&DWP(0,"esi"));          # copy in_x
        &mov    ("ebx",&DWP(4,"esi"));
        &mov    ("ecx",&DWP(8,"esi"));
@@ -1354,7 +1389,7 @@ for ($i=0;$i<7;$i++) {
        # above map() describes stack layout with 18 temporary
        # 256-bit vectors on top, then we take extra words for
        # !in1infty, !in2infty, result of check for zero and
-       # OPENSS_ia32cap_P copy. [one unused word for padding]
+       # OPENSSL_ia32cap_P copy. [one unused word for padding]
        &stack_push(8*18+5);
                                                if ($sse2) {
        &call   ("_picup_eax");
@@ -1370,14 +1405,14 @@ for ($i=0;$i<7;$i++) {
        &mov    ("edx",&DWP($i+12,"esi"));
        &mov    (&DWP($i+0,"edi"),"eax");
        &mov    (&DWP(32*18+12,"esp"),"ebp")    if ($i==0);
-       &mov    ("ebp","eax")                   if ($i==0);
-       &or     ("ebp","eax")                   if ($i!=0 && $i<64);
+       &mov    ("ebp","eax")                   if ($i==64);
+       &or     ("ebp","eax")                   if ($i>64);
        &mov    (&DWP($i+4,"edi"),"ebx");
-       &or     ("ebp","ebx")                   if ($i<64);
+       &or     ("ebp","ebx")                   if ($i>=64);
        &mov    (&DWP($i+8,"edi"),"ecx");
-       &or     ("ebp","ecx")                   if ($i<64);
+       &or     ("ebp","ecx")                   if ($i>=64);
        &mov    (&DWP($i+12,"edi"),"edx");
-       &or     ("ebp","edx")                   if ($i<64);
+       &or     ("ebp","edx")                   if ($i>=64);
     }
        &xor    ("eax","eax");
        &mov    ("esi",&wparam(1));
@@ -1393,14 +1428,14 @@ for ($i=0;$i<7;$i++) {
        &mov    ("ecx",&DWP($i+8,"esi"));
        &mov    ("edx",&DWP($i+12,"esi"));
        &mov    (&DWP($i+0,"edi"),"eax");
-       &mov    ("ebp","eax")                   if ($i==0);
-       &or     ("ebp","eax")                   if ($i!=0 && $i<64);
+       &mov    ("ebp","eax")                   if ($i==64);
+       &or     ("ebp","eax")                   if ($i>64);
        &mov    (&DWP($i+4,"edi"),"ebx");
-       &or     ("ebp","ebx")                   if ($i<64);
+       &or     ("ebp","ebx")                   if ($i>=64);
        &mov    (&DWP($i+8,"edi"),"ecx");
-       &or     ("ebp","ecx")                   if ($i<64);
+       &or     ("ebp","ecx")                   if ($i>=64);
        &mov    (&DWP($i+12,"edi"),"edx");
-       &or     ("ebp","edx")                   if ($i<64);
+       &or     ("ebp","edx")                   if ($i>=64);
     }
        &xor    ("eax","eax");
        &sub    ("eax","ebp");
@@ -1491,7 +1526,7 @@ for ($i=0;$i<7;$i++) {
        &mov    ("ebx",&DWP(32*18+8,"esp"));
        &jz     (&label("add_proceed"));        # (in1infty || in2infty)?
        &test   ("ebx","ebx");
-       &jz     (&label("add_proceed"));        # is_equal(S1,S2)?
+       &jz     (&label("add_double"));         # is_equal(S1,S2)?
 
        &mov    ("edi",&wparam(0));
        &xor    ("eax","eax");
@@ -1499,6 +1534,12 @@ for ($i=0;$i<7;$i++) {
        &data_byte(0xfc,0xf3,0xab);             # cld; stosd
        &jmp    (&label("add_done"));
 
+&set_label("add_double",16);
+       &mov    ("esi",&wparam(1));
+       &mov    ("ebp",&DWP(32*18+12,"esp"));   # OPENSSL_ia32cap_P copy
+       &add    ("esp",4*((8*18+5)-(8*5+1)));   # difference in frame sizes
+       &jmp    (&label("point_double_shortcut"));
+
 &set_label("add_proceed",16);
        &mov    ("eax",&DWP(32*18+12,"esp"));   # OPENSSL_ia32cap_P copy
        &lea    ("esi",&DWP($R,"esp"));
@@ -1627,7 +1668,7 @@ for ($i=0;$i<7;$i++) {
 
        # above map() describes stack layout with 15 temporary
        # 256-bit vectors on top, then we take extra words for
-       # !in1infty, !in2infty, and OPENSS_ia32cap_P copy.
+       # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy.
        &stack_push(8*15+3);
                                                if ($sse2) {
        &call   ("_picup_eax");
@@ -1643,14 +1684,14 @@ for ($i=0;$i<7;$i++) {
        &mov    ("edx",&DWP($i+12,"esi"));
        &mov    (&DWP($i+0,"edi"),"eax");
        &mov    (&DWP(32*15+8,"esp"),"ebp")     if ($i==0);
-       &mov    ("ebp","eax")                   if ($i==0);
-       &or     ("ebp","eax")                   if ($i!=0 && $i<64);
+       &mov    ("ebp","eax")                   if ($i==64);
+       &or     ("ebp","eax")                   if ($i>64);
        &mov    (&DWP($i+4,"edi"),"ebx");
-       &or     ("ebp","ebx")                   if ($i<64);
+       &or     ("ebp","ebx")                   if ($i>=64);
        &mov    (&DWP($i+8,"edi"),"ecx");
-       &or     ("ebp","ecx")                   if ($i<64);
+       &or     ("ebp","ecx")                   if ($i>=64);
        &mov    (&DWP($i+12,"edi"),"edx");
-       &or     ("ebp","edx")                   if ($i<64);
+       &or     ("ebp","edx")                   if ($i>=64);
     }
        &xor    ("eax","eax");
        &mov    ("esi",&wparam(2));
@@ -1821,3 +1862,5 @@ for ($i=0;$i<7;$i++) {
 } &function_end("ecp_nistz256_point_add_affine");
 
 &asm_finish();
+
+close STDOUT;