-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
+# The last command-line argument names the output file: take it off @ARGV
+# and re-open STDOUT onto it, so everything printed to STDOUT from here on
+# lands in that file.  A failed open would leave STDOUT closed and the
+# output silently lost, so abort with the OS error instead.
+$output=pop;
+open STDOUT,">$output" or die "can't open $output: $!";
+
&asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386");
$sse2=0;
&mov (&DWP(16,"edi"),"eax");
&adc ("ecx",&DWP(24,"ebp"));
&mov (&DWP(20,"edi"),"ebx");
+ &mov ("esi",0);
&adc ("edx",&DWP(28,"ebp"));
&mov (&DWP(24,"edi"),"ecx");
- &sbb ("esi","esi"); # broadcast carry bit
+ &adc ("esi",0);
&mov (&DWP(28,"edi"),"edx");
- # if a+b carries, subtract modulus.
+ # if a+b >= modulus, subtract modulus.
#
+ # But since comparison implies subtraction, we subtract modulus
+ # to see if it borrows, and then subtract it for real if
+ # subtraction didn't borrow.
+
+ &mov ("eax",&DWP(0,"edi"));
+ &mov ("ebx",&DWP(4,"edi"));
+ &mov ("ecx",&DWP(8,"edi"));
+ &sub ("eax",-1);
+ &mov ("edx",&DWP(12,"edi"));
+ &sbb ("ebx",-1);
+ &mov ("eax",&DWP(16,"edi"));
+ &sbb ("ecx",-1);
+ &mov ("ebx",&DWP(20,"edi"));
+ &sbb ("edx",0);
+ &mov ("ecx",&DWP(24,"edi"));
+ &sbb ("eax",0);
+ &mov ("edx",&DWP(28,"edi"));
+ &sbb ("ebx",0);
+ &sbb ("ecx",1);
+ &sbb ("edx",-1);
+ &sbb ("esi",0);
+
# Note that because mod has special form, i.e. consists of
# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
- # assigning carry bit to one register, %ebp, and its negative
- # to another, %esi. But we started by calculating %esi...
+ # using borrow.
+ &not ("esi");
&mov ("eax",&DWP(0,"edi"));
&mov ("ebp","esi");
&mov ("ebx",&DWP(4,"edi"));
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
+&static_label("point_double_shortcut");
&function_begin("ecp_nistz256_point_double");
{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top, then we take extra word for
- # OPENSS_ia32cap_P copy.
+ # OPENSSL_ia32cap_P copy.
&stack_push(8*5+1);
if ($sse2) {
&call ("_picup_eax");
&picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic"));
&mov ("ebp",&DWP(0,"edx")); }
+&set_label("point_double_shortcut");
&mov ("eax",&DWP(0,"esi")); # copy in_x
&mov ("ebx",&DWP(4,"esi"));
&mov ("ecx",&DWP(8,"esi"));
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top, then we take extra words for
# !in1infty, !in2infty, result of check for zero and
- # OPENSS_ia32cap_P copy. [one unused word for padding]
+ # OPENSSL_ia32cap_P copy. [one unused word for padding]
&stack_push(8*18+5);
if ($sse2) {
&call ("_picup_eax");
&mov ("edx",&DWP($i+12,"esi"));
&mov (&DWP($i+0,"edi"),"eax");
&mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0);
- &mov ("ebp","eax") if ($i==0);
- &or ("ebp","eax") if ($i!=0 && $i<64);
+ &mov ("ebp","eax") if ($i==64);
+ &or ("ebp","eax") if ($i>64);
&mov (&DWP($i+4,"edi"),"ebx");
- &or ("ebp","ebx") if ($i<64);
+ &or ("ebp","ebx") if ($i>=64);
&mov (&DWP($i+8,"edi"),"ecx");
- &or ("ebp","ecx") if ($i<64);
+ &or ("ebp","ecx") if ($i>=64);
&mov (&DWP($i+12,"edi"),"edx");
- &or ("ebp","edx") if ($i<64);
+ &or ("ebp","edx") if ($i>=64);
}
&xor ("eax","eax");
&mov ("esi",&wparam(1));
&mov ("ecx",&DWP($i+8,"esi"));
&mov ("edx",&DWP($i+12,"esi"));
&mov (&DWP($i+0,"edi"),"eax");
- &mov ("ebp","eax") if ($i==0);
- &or ("ebp","eax") if ($i!=0 && $i<64);
+ &mov ("ebp","eax") if ($i==64);
+ &or ("ebp","eax") if ($i>64);
&mov (&DWP($i+4,"edi"),"ebx");
- &or ("ebp","ebx") if ($i<64);
+ &or ("ebp","ebx") if ($i>=64);
&mov (&DWP($i+8,"edi"),"ecx");
- &or ("ebp","ecx") if ($i<64);
+ &or ("ebp","ecx") if ($i>=64);
&mov (&DWP($i+12,"edi"),"edx");
- &or ("ebp","edx") if ($i<64);
+ &or ("ebp","edx") if ($i>=64);
}
&xor ("eax","eax");
&sub ("eax","ebp");
&mov ("ebx",&DWP(32*18+8,"esp"));
&jz (&label("add_proceed")); # (in1infty || in2infty)?
&test ("ebx","ebx");
- &jz (&label("add_proceed")); # is_equal(S1,S2)?
+ &jz (&label("add_double")); # is_equal(S1,S2)?
&mov ("edi",&wparam(0));
&xor ("eax","eax");
&data_byte(0xfc,0xf3,0xab); # cld; stosd
&jmp (&label("add_done"));
+&set_label("add_double",16);
+ &mov ("esi",&wparam(1));
+ &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
+ &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes
+ &jmp (&label("point_double_shortcut"));
+
&set_label("add_proceed",16);
&mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy
&lea ("esi",&DWP($R,"esp"));
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top, then we take extra words for
- # !in1infty, !in2infty, and OPENSS_ia32cap_P copy.
+ # !in1infty, !in2infty, and OPENSSL_ia32cap_P copy.
&stack_push(8*15+3);
if ($sse2) {
&call ("_picup_eax");
&mov ("edx",&DWP($i+12,"esi"));
&mov (&DWP($i+0,"edi"),"eax");
&mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0);
- &mov ("ebp","eax") if ($i==0);
- &or ("ebp","eax") if ($i!=0 && $i<64);
+ &mov ("ebp","eax") if ($i==64);
+ &or ("ebp","eax") if ($i>64);
&mov (&DWP($i+4,"edi"),"ebx");
- &or ("ebp","ebx") if ($i<64);
+ &or ("ebp","ebx") if ($i>=64);
&mov (&DWP($i+8,"edi"),"ecx");
- &or ("ebp","ecx") if ($i<64);
+ &or ("ebp","ecx") if ($i>=64);
&mov (&DWP($i+12,"edi"),"edx");
- &or ("ebp","edx") if ($i<64);
+ &or ("ebp","edx") if ($i>=64);
}
&xor ("eax","eax");
&mov ("esi",&wparam(2));
} &function_end("ecp_nistz256_point_add_affine");
&asm_finish();
+
+# Buffered write errors (e.g. a full disk) only surface when the handle is
+# closed, so check the result rather than leaving a truncated output file.
+close STDOUT or die "error closing STDOUT: $!";