poly1305/asm/poly1305-x86_64.pl: make it work with linux-x32.
[openssl.git] / crypto / poly1305 / asm / poly1305-x86_64.pl
index d991365fbbd380c1b6cdd3b548ca314e5527df1a..8977d563a25166b5c3bfac9bb952703c40962cfd 100755 (executable)
@@ -129,7 +129,9 @@ $code.=<<___;
 .extern        OPENSSL_ia32cap_P
 
 .globl poly1305_init
-.type  poly1305_init,\@function,2
+.globl poly1305_blocks
+.globl poly1305_emit
+.type  poly1305_init,\@function,3
 .align 32
 poly1305_init:
        xor     %rax,%rax
@@ -163,19 +165,25 @@ $code.=<<___;
        and     8($inp),%rcx
        mov     %rax,24($ctx)
        mov     %rcx,32($ctx)
-
+___
+$code.=<<___   if ($flavour !~ /elf32/);
        mov     %r10,0(%rdx)
        mov     %r11,8(%rdx)
-
+___
+$code.=<<___   if ($flavour =~ /elf32/);	# elf32 counterpart of the 64-bit stores: 4-byte slots
+       mov     %r10d,0(%rdx)           # 32-bit store: low half of %r10 at offset 0
+       mov     %r11d,4(%rdx)           # 32-bit store: low half of %r11 at offset 4 (not 8)
+___
+$code.=<<___;
        mov     \$1,%eax
 .Lno_key:
        ret
 .size  poly1305_init,.-poly1305_init
 
-.globl poly1305_blocks
 .type  poly1305_blocks,\@function,4
 .align 32
 poly1305_blocks:
+.Lblocks:
        sub     \$16,$len               # too short?
        jc      .Lno_data
 
@@ -231,10 +239,10 @@ $code.=<<___;
        ret
 .size  poly1305_blocks,.-poly1305_blocks
 
-.globl poly1305_emit
 .type  poly1305_emit,\@function,3
 .align 32
 poly1305_emit:
+.Lemit:
        mov     0($ctx),%r8     # load hash value
        mov     8($ctx),%r9
        mov     16($ctx),%r10
@@ -453,7 +461,7 @@ poly1305_blocks_avx:
        cmp     \$128,$len
        jae     .Lblocks_avx
        test    %r8d,%r8d
-       jz      poly1305_blocks
+       jz      .Lblocks
 
 .Lblocks_avx:
        and     \$-16,$len
@@ -1195,6 +1203,20 @@ $code.=<<___;
        vpaddq          $T3,$D0,$D0             # d0 += h1*s4
 
 .Lshort_tail_avx:
+       ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$D4,$T4
+       vpsrldq         \$8,$D3,$T3
+       vpsrldq         \$8,$D1,$T1
+       vpsrldq         \$8,$D0,$T0
+       vpsrldq         \$8,$D2,$T2
+       vpaddq          $T3,$D3,$D3
+       vpaddq          $T4,$D4,$D4
+       vpaddq          $T0,$D0,$D0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$D2,$D2
+
        ################################################################
        # lazy reduction
 
@@ -1229,25 +1251,11 @@ $code.=<<___;
        vpand           $MASK,$D3,$D3
        vpaddq          $H3,$D4,$D4             # h3 -> h4
 
-       ################################################################
-       # horizontal addition
-
-       vpsrldq         \$8,$D2,$T2
-       vpsrldq         \$8,$D0,$T0
-       vpsrldq         \$8,$D1,$T1
-       vpsrldq         \$8,$D3,$T3
-       vpsrldq         \$8,$D4,$T4
-       vpaddq          $T2,$D2,$H2
-       vpaddq          $T0,$D0,$H0
-       vpaddq          $T1,$D1,$H1
-       vpaddq          $T3,$D3,$H3
-       vpaddq          $T4,$D4,$H4
-
-       vmovd           $H0,`4*0-48-64`($ctx)   # save partially reduced
-       vmovd           $H1,`4*1-48-64`($ctx)
-       vmovd           $H2,`4*2-48-64`($ctx)
-       vmovd           $H3,`4*3-48-64`($ctx)
-       vmovd           $H4,`4*4-48-64`($ctx)
+       vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
+       vmovd           $D1,`4*1-48-64`($ctx)
+       vmovd           $D2,`4*2-48-64`($ctx)
+       vmovd           $D3,`4*3-48-64`($ctx)
+       vmovd           $D4,`4*4-48-64`($ctx)
 ___
 $code.=<<___   if ($win64);
        vmovdqa         0x50(%r11),%xmm6
@@ -1275,7 +1283,7 @@ $code.=<<___;
 .align 32
 poly1305_emit_avx:
        cmpl    \$0,20($ctx)    # is_base2_26?
-       je      poly1305_emit
+       je      .Lemit
 
        mov     0($ctx),%eax    # load hash value base 2^26
        mov     4($ctx),%ecx
@@ -1339,7 +1347,7 @@ poly1305_blocks_avx2:
        cmp     \$128,$len
        jae     .Lblocks_avx2
        test    %r8d,%r8d
-       jz      poly1305_blocks
+       jz      .Lblocks
 
 .Lblocks_avx2:
        and     \$-16,$len
@@ -1885,6 +1893,31 @@ $code.=<<___;
        vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
 
+       ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$D1,$T1
+       vpsrldq         \$8,$H2,$T2
+       vpsrldq         \$8,$H3,$T3
+       vpsrldq         \$8,$H4,$T4
+       vpsrldq         \$8,$H0,$T0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$H2,$H2
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+       vpaddq          $T0,$H0,$H0
+
+       vpermq          \$0x2,$H3,$T3
+       vpermq          \$0x2,$H4,$T4
+       vpermq          \$0x2,$H0,$T0
+       vpermq          \$0x2,$D1,$T1
+       vpermq          \$0x2,$H2,$T2
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$H2,$H2
+
        ################################################################
        # lazy reduction
 
@@ -1919,31 +1952,6 @@ $code.=<<___;
        vpand           $MASK,$H3,$H3
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
-       ################################################################
-       # horizontal addition
-
-       vpsrldq         \$8,$H2,$T2
-       vpsrldq         \$8,$H0,$T0
-       vpsrldq         \$8,$H1,$T1
-       vpsrldq         \$8,$H3,$T3
-       vpsrldq         \$8,$H4,$T4
-       vpaddq          $T2,$H2,$H2
-       vpaddq          $T0,$H0,$H0
-       vpaddq          $T1,$H1,$H1
-       vpaddq          $T3,$H3,$H3
-       vpaddq          $T4,$H4,$H4
-
-       vpermq          \$0x2,$H2,$T2
-       vpermq          \$0x2,$H0,$T0
-       vpermq          \$0x2,$H1,$T1
-       vpermq          \$0x2,$H3,$T3
-       vpermq          \$0x2,$H4,$T4
-       vpaddq          $T2,$H2,$H2
-       vpaddq          $T0,$H0,$H0
-       vpaddq          $T1,$H1,$H1
-       vpaddq          $T3,$H3,$H3
-       vpaddq          $T4,$H4,$H4
-
        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
        vmovd           %x#$H1,`4*1-48-64`($ctx)
        vmovd           %x#$H2,`4*2-48-64`($ctx)