ARMv8 assembly pack: add Qualcomm Kryo results.

[openssl.git] / crypto / poly1305 / asm / poly1305-armv8.pl
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl

index 2e1dae3df238d157a23215ec7d88885cf105c11b..ac06457b65301a4dd8b95e8aa30f12f5b02bd389 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,7 +26,9 @@
  # Cortex-A53   2.69/+58%       1.47
  # Cortex-A57   2.70/+7%        1.14
  # Denver       1.64/+50%       1.18(*)
-# X-Gene       2.13/+68%       2.19
+# X-Gene       2.13/+68%       2.27
+# Mongoose     1.77/+75%       1.12
+# Kryo         2.70/+55%       1.13
  #
  # (*)  estimate based on resources availability is less than 1.0,
  #      i.e. measured result is worse than expected, presumably binary
@@ -92,7 +101,11 @@ poly1305_init:
         csel    $d0,$d0,$r0,eq
         csel    $d1,$d1,$r1,eq
  
+#ifdef __ILP32__
+       stp     w12,w13,[$len]
+#else
         stp     $d0,$d1,[$len]
+#endif
  
         mov     x0,#1
  .Lno_key:
@@ -507,9 +520,11 @@ poly1305_blocks_neon:
         fmov    $IN01_1,x6
         add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
         add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       movi    $MASK.2d,#-1
         fmov    $IN01_2,x8
         fmov    $IN01_3,x10
         fmov    $IN01_4,x12
+       ushr    $MASK.2d,$MASK.2d,#38
  
         b.ls    .Lskip_loop
  
@@ -660,41 +675,43 @@ poly1305_blocks_neon:
          fmov   $IN01_2,x8
         umlal   $ACC2,$IN01_4,${S3}[0]
          fmov   $IN01_3,x10
+        fmov   $IN01_4,x12
  
         /////////////////////////////////////////////////////////////////
         // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-        // and P. Schwabe
+       // and P. Schwabe
+       //
+       // [see discussion in poly1305-armv4 module]
  
         ushr    $T0.2d,$ACC3,#26
-        fmov   $IN01_4,x12
         xtn     $H3,$ACC3
          ushr   $T1.2d,$ACC0,#26
-        xtn    $H0,$ACC0
+        and    $ACC0,$ACC0,$MASK.2d
         add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
         bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
          add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
-        bic    $H0,#0xfc,lsl#24
  
-       shrn    $T0.2s,$ACC4,#26
+       ushr    $T0.2d,$ACC4,#26
         xtn     $H4,$ACC4
          ushr   $T1.2d,$ACC1,#26
          xtn    $H1,$ACC1
-        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
         bic     $H4,#0xfc,lsl#24
-        bic    $H1,#0xfc,lsl#24
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
  
-       add     $H0,$H0,$T0.2s
-       shl     $T0.2s,$T0.2s,#2
+       add     $ACC0,$ACC0,$T0.2d
+       shl     $T0.2d,$T0.2d,#2
          shrn   $T1.2s,$ACC2,#26
          xtn    $H2,$ACC2
-       add     $H0,$H0,$T0.2s          // h4 -> h0
+       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
+        bic    $H1,#0xfc,lsl#24
          add    $H3,$H3,$T1.2s          // h2 -> h3
          bic    $H2,#0xfc,lsl#24
  
-       ushr    $T0.2s,$H0,#26
-       bic     $H0,#0xfc,lsl#24
+       shrn    $T0.2s,$ACC0,#26
+       xtn     $H0,$ACC0
          ushr   $T1.2s,$H3,#26
          bic    $H3,#0xfc,lsl#24
+        bic    $H0,#0xfc,lsl#24
         add     $H1,$H1,$T0.2s          // h0 -> h1
          add    $H4,$H4,$T1.2s          // h3 -> h4
  
@@ -702,9 +719,7 @@ poly1305_blocks_neon:
  
  .Lskip_loop:
         dup     $IN23_2,${IN23_2}[0]
-       movi    $MASK.2d,#-1
         add     $IN01_2,$IN01_2,$H2
-       ushr    $MASK.2d,$MASK.2d,#38
  
         ////////////////////////////////////////////////////////////////
         // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1