.explicit
.text
.ident "ia64.S, Version 2.1"
-.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+.ident "IA-64 ISA artwork by Andy Polyakov <appro@openssl.org>"
-// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright 2001-2018 The OpenSSL Project Authors. All Rights Reserved.
//
-// Licensed under the OpenSSL license (the "License"). You may not use
+// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
-// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
-// granted according to the OpenSSL license. Warranty of any kind is
-// disclaimed.
+// granted according to the License. Warranty of any kind is disclaimed.
// ====================================================================
//
-// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is
+// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
// different from Itanium to this module viewpoint. Most notably, is it
// "wider" than Itanium? Can you experience loop scalability as
// discussed in commentary sections? Not really:-( Itanium2 has 6
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
// for worst latency for every instruction aiming for best *all-round*
-// performance.
+// performance.
// Q. How much faster does it get?
// A. Here is the output from 'openssl speed rsa dsa' for vanilla
// User Mask I want to excuse the kernel from preserving upper
// (f32-f128) FP register bank over process context switch, thus
// minimizing bus bandwidth consumption during the switch (i.e.
-// after PKI opration completes and the program is off doing
+// after PKI operation completes and the program is off doing
// something else like bulk symmetric encryption). Having said
// this, I also want to point out that it might be good idea
// to compile the whole toolkit (as well as majority of the
#else
#define ADDP add
#endif
+#ifdef __VMS
+.alias abort, "decc$abort"
+#endif
#if 1
//
// bn_[add|sub]_words routines.
//
-// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the
+// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
.global bn_sqr_words#
.proc bn_sqr_words#
.align 64
-.skip 32 // makes the loop body aligned at 64-byte boundary
+.skip 32 // makes the loop body aligned at 64-byte boundary
bn_sqr_words:
.prologue
.save ar.pfs,r2
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
-// benchmark program is profiled. I.e. if perfomance gain on Itanium
+// benchmark program is profiled. I.e. if performance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitly split and the epilogue compressed.
.L_bn_sqr_words_ctop:
xma.hu f118=f39,f127,f117 }
{ .mfi; xma.lu f117=f39,f127,f117 };;//
//-------------------------------------------------//
-// Leaving muliplier's heaven... Quite a ride, huh?
+// Leaving multiplier's heaven... Quite a ride, huh?
{ .mii; getf.sig r31=f47
add r25=r25,r24
mov ar.ec=0 // don't rotate at exit
mov pr.rot=0 }
{ .mii; mov L=r33 // save l
+ mov r25=r0 // needed if abort is called on VMS
mov r36=r0 };;
.L_divw_shift: // -vv- note signed comparison
// output: f8 = (int)(a/b)
// clobbered: f8,f9,f10,f11,pred
pred=p15
-// One can argue that this snippet is copyrighted to Intel
-// Corporation, as it's essentially identical to one of those
-// found in "Divide, Square Root and Remainder" section at
+// This snippet is based on text found in the "Divide, Square
+// Root and Remainder" section at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
// Yes, I admit that the referred code was used as template,
// but after I realized that there hardly is any other instruction