IA64 assembly pack: add {chacha|poly1305}-ia64 modules.
authorAndy Polyakov <appro@openssl.org>
Sat, 16 Mar 2019 20:19:32 +0000 (21:19 +0100)
committerRichard Levitte <levitte@openssl.org>
Fri, 29 Mar 2019 06:33:15 +0000 (07:33 +0100)
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8540)

crypto/chacha/asm/chacha-ia64.pl [new file with mode: 0644]
crypto/chacha/build.info
crypto/poly1305/asm/poly1305-ia64.S [new file with mode: 0644]

diff --git a/crypto/chacha/asm/chacha-ia64.pl b/crypto/chacha/asm/chacha-ia64.pl
new file mode 100644 (file)
index 0000000..dd09060
--- /dev/null
@@ -0,0 +1,292 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
+# ====================================================================
+#
+# ChaCha20 for Itanium.
+#
+# March 2019
+#
+# Itanium 9xxx, which has pair of shifters, manages to process one byte
+# in 9.3 cycles. This aligns perfectly with theoretical estimate.
+# On the other hand, pre-9000 CPU has single shifter and each extr/dep
+# pairs below takes additional cycle. Then final input->xor->output
+# pass runs slower than expected... Overall result is 15.6 cpb, two
+# cycles more than theoretical estimate.
+
+$output = pop;
+open STDOUT, ">$output" if $output;
+
+my @k = map("r$_",(16..31));
+my @x = map("r$_",(38..53));
+my @y = map("r$_",(8..11));
+my @z = map("r$_",(15,35..37));
+my ($out,$inp,$len,$key,$counter) = map("r$_",(32..36));
+
+$code.=<<___;
+#if defined(_HPUX_SOURCE)
+# if !defined(_LP64)
+#  define ADDP  addp4
+# else
+#  define ADDP  add
+# endif
+#else
+# define ADDP   add
+#endif
+
+.text
+
+.global        ChaCha20_ctr32#
+.proc  ChaCha20_ctr32#
+.align 32
+ChaCha20_ctr32:
+       .prologue
+       .save           ar.pfs,r2
+{ .mmi;        alloc           r2=ar.pfs,5,17,0,0
+       ADDP            @k[11]=4,$key
+       .save           ar.lc,r3
+       mov             r3=ar.lc                }
+{ .mmi;        ADDP            $key=0,$key
+       ADDP            $counter=0,$counter
+       .save           pr,r14
+       mov             r14=pr                  };;
+
+       .body
+{ .mlx;        ld4             @k[4]=[$key],8
+       movl            @k[0]=0x61707865        }
+{ .mlx;        ld4             @k[5]=[@k[11]],8
+       movl            @k[1]=0x3320646e        };;
+{ .mlx;        ld4             @k[6]=[$key],8
+       movl            @k[2]=0x79622d32        }
+{ .mlx;        ld4             @k[7]=[@k[11]],8
+       movl            @k[3]=0x6b206574        };;
+{ .mmi;        ld4             @k[8]=[$key],8
+       ld4             @k[9]=[@k[11]],8
+       add             @k[15]=4,$counter       };;
+{ .mmi;        ld4             @k[10]=[$key]
+       ld4             @k[11]=[@k[11]]
+       mov             @x[0]=@k[0]             };;
+{ .mmi;        ld4             @k[12]=[$counter],8
+       ld4             @k[13]=[@k[15]],8
+       mov             @x[1]=@k[1]             };;
+{ .mmi;        ld4             @k[14]=[$counter]
+       ld4             @k[15]=[@k[15]]
+       mov             @x[2]=@k[2]             }
+{ .mmi;        mov             @x[3]=@k[3]
+       mov             @x[4]=@k[4]
+       mov             @x[5]=@k[5]             };;
+{ .mmi;        mov             @x[6]=@k[6]
+       mov             @x[7]=@k[7]
+       mov             @x[8]=@k[8]             }
+{ .mmi;        mov             @x[9]=@k[9]
+       mov             @x[10]=@k[10]
+       mov             @x[11]=@k[11]           }
+{ .mmi;        mov             @x[12]=@k[12]
+       mov             @x[13]=@k[13]
+       mov             @x[14]=@k[14]           };;
+
+.Loop_outer:
+{ .mii;        mov             @x[15]=@k[15]
+       mov             ar.lc=9
+       mov             ar.ec=1                 }
+{ .mmb;        cmp.geu         p6,p0=64,$len
+       sub             @z[1]=64,$len
+       brp.loop.imp    .Loop_top,.Loop_end-16  };;
+
+.Loop_top:
+___
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+$code.=<<___;
+{ .mmi;        add             @x[$a0]=@x[$a0],@x[$b0]
+       add             @x[$a1]=@x[$a1],@x[$b1]
+       add             @x[$a2]=@x[$a2],@x[$b2]         };;
+{ .mmi;        add             @x[$a3]=@x[$a3],@x[$b3]
+       xor             @x[$d0]=@x[$d0],@x[$a0]
+       xor             @x[$d1]=@x[$d1],@x[$a1]         };;
+{ .mmi;        xor             @x[$d2]=@x[$d2],@x[$a2]
+       xor             @x[$d3]=@x[$d3],@x[$a3]
+       extr.u          @y[0]=@x[$d0],16,16             };;
+{ .mii;        extr.u          @y[1]=@x[$d1],16,16
+       dep             @x[$d0]=@x[$d0],@y[0],16,16     };;
+{ .mii;         add            @x[$c0]=@x[$c0],@x[$d0]
+       extr.u          @y[2]=@x[$d2],16,16
+       dep             @x[$d1]=@x[$d1],@y[1],16,16     };;
+{ .mii;         add            @x[$c1]=@x[$c1],@x[$d1]
+        xor            @x[$b0]=@x[$b0],@x[$c0]
+       extr.u          @y[3]=@x[$d3],16,16             };;
+{ .mii;         xor            @x[$b1]=@x[$b1],@x[$c1]
+       dep             @x[$d2]=@x[$d2],@y[2],16,16
+       dep             @x[$d3]=@x[$d3],@y[3],16,16     };;
+{ .mmi;         add            @x[$c2]=@x[$c2],@x[$d2]
+        add            @x[$c3]=@x[$c3],@x[$d3]
+        extr.u         @y[0]=@x[$b0],20,12             };;
+{ .mmi;         xor            @x[$b2]=@x[$b2],@x[$c2]
+        xor            @x[$b3]=@x[$b3],@x[$c3]
+        dep.z          @x[$b0]=@x[$b0],12,20           };;
+{ .mii;         or             @x[$b0]=@x[$b0],@y[0]
+        extr.u         @y[1]=@x[$b1],20,12
+        dep.z          @x[$b1]=@x[$b1],12,20           };;
+{ .mii;        add             @x[$a0]=@x[$a0],@x[$b0]
+        extr.u         @y[2]=@x[$b2],20,12
+        extr.u         @y[3]=@x[$b3],20,12             }
+{ .mii;         or             @x[$b1]=@x[$b1],@y[1]
+        dep.z          @x[$b2]=@x[$b2],12,20
+        dep.z          @x[$b3]=@x[$b3],12,20           };;
+{ .mmi;         or             @x[$b2]=@x[$b2],@y[2]
+        or             @x[$b3]=@x[$b3],@y[3]
+       add             @x[$a1]=@x[$a1],@x[$b1]         };;
+{ .mmi;        add             @x[$a2]=@x[$a2],@x[$b2]
+       add             @x[$a3]=@x[$a3],@x[$b3]
+       xor             @x[$d0]=@x[$d0],@x[$a0]         };;
+{ .mii;        xor             @x[$d1]=@x[$d1],@x[$a1]
+       extr.u          @y[0]=@x[$d0],24,8
+       dep.z           @x[$d0]=@x[$d0],8,24            };;
+{ .mii;        or              @x[$d0]=@x[$d0],@y[0]
+       extr.u          @y[1]=@x[$d1],24,8
+       dep.z           @x[$d1]=@x[$d1],8,24            };;
+{ .mmi;        or              @x[$d1]=@x[$d1],@y[1]
+       xor             @x[$d2]=@x[$d2],@x[$a2]
+       xor             @x[$d3]=@x[$d3],@x[$a3]         };;
+{ .mii;         add            @x[$c0]=@x[$c0],@x[$d0]
+       extr.u          @y[2]=@x[$d2],24,8
+       dep.z           @x[$d2]=@x[$d2],8,24            };;
+{ .mii;         xor            @x[$b0]=@x[$b0],@x[$c0]
+       extr.u          @y[3]=@x[$d3],24,8
+       dep.z           @x[$d3]=@x[$d3],8,24            };;
+{ .mmi;        or              @x[$d2]=@x[$d2],@y[2]
+       or              @x[$d3]=@x[$d3],@y[3]
+        extr.u         @y[0]=@x[$b0],25,7              };;
+{ .mmi;         add            @x[$c1]=@x[$c1],@x[$d1]
+        add            @x[$c2]=@x[$c2],@x[$d2]
+        dep.z          @x[$b0]=@x[$b0],7,25            };;
+{ .mmi;         xor            @x[$b1]=@x[$b1],@x[$c1]
+        xor            @x[$b2]=@x[$b2],@x[$c2]
+        add            @x[$c3]=@x[$c3],@x[$d3]         };;
+{ .mii;         xor            @x[$b3]=@x[$b3],@x[$c3]
+        extr.u         @y[1]=@x[$b1],25,7
+        dep.z          @x[$b1]=@x[$b1],7,25            };;
+{ .mii;         or             @x[$b0]=@x[$b0],@y[0]
+        extr.u         @y[2]=@x[$b2],25,7
+        dep.z          @x[$b2]=@x[$b2],7,25            };;
+{ .mii;         or             @x[$b1]=@x[$b1],@y[1]
+        extr.u         @y[3]=@x[$b3],25,7
+        dep.z          @x[$b3]=@x[$b3],7,25            };;
+___
+$code.=<<___           if ($d0 == 12);
+{ .mmi;         or             @x[$b2]=@x[$b2],@y[2]
+        or             @x[$b3]=@x[$b3],@y[3]
+       mov             @z[0]=-1                        };;
+___
+$code.=<<___           if ($d0 == 15);
+{ .mmb;         or             @x[$b2]=@x[$b2],@y[2]
+        or             @x[$b3]=@x[$b3],@y[3]
+       br.ctop.sptk    .Loop_top                       };;
+___
+}
+       &ROUND(0, 4, 8, 12);
+       &ROUND(0, 5, 10, 15);
+$code.=<<___;
+.Loop_end:
+
+{ .mmi;        add             @x[0]=@x[0],@k[0]
+       add             @x[1]=@x[1],@k[1]
+(p6)   shr.u           @z[0]=@z[0],@z[1]               }
+{ .mmb;        add             @x[2]=@x[2],@k[2]
+       add             @x[3]=@x[3],@k[3]
+       clrrrb.pr                                       };;
+{ .mmi;        add             @x[4]=@x[4],@k[4]
+       add             @x[5]=@x[5],@k[5]
+       add             @x[6]=@x[6],@k[6]               }
+{ .mmi;        add             @x[7]=@x[7],@k[7]
+       add             @x[8]=@x[8],@k[8]
+       add             @x[9]=@x[9],@k[9]               }
+{ .mmi;        add             @x[10]=@x[10],@k[10]
+       add             @x[11]=@x[11],@k[11]
+       add             @x[12]=@x[12],@k[12]            }
+{ .mmi;        add             @x[13]=@x[13],@k[13]
+       add             @x[14]=@x[14],@k[14]
+       add             @x[15]=@x[15],@k[15]            }
+{ .mmi;        add             @k[12]=1,@k[12]                 // next counter
+       mov             pr=@z[0],0x1ffff                };;
+
+//////////////////////////////////////////////////////////////////
+// Each predicate bit corresponds to byte to be processed. Note
+// that p0 is wired to 1, but it works out, because there always
+// is at least one byte to process...
+{ .mmi;        (p0)    ld1             @z[0]=[$inp],1
+               shr.u           @y[1]=@x[0],8           };;
+{ .mmi;        (p1)    ld1             @z[1]=[$inp],1
+       (p2)    shr.u           @y[2]=@x[0],16          };;
+{ .mmi;        (p2)    ld1             @z[2]=[$inp],1
+       (p0)    xor             @z[0]=@z[0],@x[0]
+       (p3)    shr.u           @y[3]=@x[0],24          };;
+___
+for(my $i0=0; $i0<60; $i0+=4) {
+my ($i1, $i2, $i3, $i4, $i5, $i6, $i7) = map($i0+$_,(1..7));
+my $k = $i0/4+1;
+
+$code.=<<___;
+{ .mmi;        (p$i3)  ld1             @z[3]=[$inp],1
+       (p$i0)  st1             [$out]=@z[0],1
+       (p$i1)  xor             @z[1]=@z[1],@y[1]       };;
+{ .mmi;        (p$i4)  ld1             @z[0]=[$inp],1
+       (p$i5)  shr.u           @y[1]=@x[$k],8          }
+{ .mmi;        (p$i1)  st1             [$out]=@z[1],1
+       (p$i2)  xor             @z[2]=@z[2],@y[2]
+       (p1)    mov             @x[$k-1]=@k[$k-1]       };;
+{ .mfi;        (p$i5)  ld1             @z[1]=[$inp],1
+       (p$i6)  shr.u           @y[2]=@x[$k],16         }
+{ .mfi;        (p$i2)  st1             [$out]=@z[2],1
+       (p$i3)  xor             @z[3]=@z[3],@y[3]       };;
+{ .mfi;        (p$i6)  ld1             @z[2]=[$inp],1
+       (p$i7)  shr.u           @y[3]=@x[$k],24         }
+___
+$code.=<<___   if ($i0==0);    # p1,p2 are available for reuse in first round
+{ .mmi;        (p$i3)  st1             [$out]=@z[3],1
+       (p$i4)  xor             @z[0]=@z[0],@x[$k]
+               cmp.ltu         p1,p2=64,$len           };;
+___
+$code.=<<___   if ($i0>0);
+{ .mfi;        (p$i3)  st1             [$out]=@z[3],1
+       (p$i4)  xor             @z[0]=@z[0],@x[$k]      };;
+___
+}
+$code.=<<___;
+{ .mmi;        (p63)   ld1             @z[3]=[$inp],1
+       (p60)   st1             [$out]=@z[0],1
+       (p61)   xor             @z[1]=@z[1],@y[1]       };;
+{ .mmi;        (p61)   st1             [$out]=@z[1],1
+       (p62)   xor             @z[2]=@z[2],@y[2]       };;
+{ .mmi;        (p62)   st1             [$out]=@z[2],1
+       (p63)   xor             @z[3]=@z[3],@y[3]
+       (p2)    mov             ar.lc=r3                };;
+{ .mib;        (p63)   st1             [$out]=@z[3],1
+       (p1)    add             $len=-64,$len
+(p1)   br.dptk.many            .Loop_outer             };;
+
+{ .mmi;        mov                     @k[4]=0                 // wipe key material
+       mov                     @k[5]=0
+       mov                     @k[6]=0                 }
+{ .mmi;        mov                     @k[7]=0
+       mov                     @k[8]=0
+       mov                     @k[9]=0                 }
+{ .mmi;        mov                     @k[10]=0
+       mov                     @k[11]=0
+       mov                     @k[12]=0                }
+{ .mmi;        mov                     @k[13]=0
+       mov                     @k[14]=0
+       mov                     @k[15]=0                }
+{ .mib;        mov                     pr=r14,0x1ffff
+       br.ret.sptk.many        b0                      };;
+.endp  ChaCha20_ctr32#
+stringz "ChaCha20 for IA64, CRYPTOGAMS by \@dot-asm"
+___
+
+print $code;
+close STDOUT;
index bbd9ca8b333f092841bdb1d9be2ed574df7bccb7..a7418265a226135da9f9ac8af1709a7b4af6b47f 100644 (file)
@@ -12,3 +12,4 @@ INCLUDE[chacha-armv8.o]=..
 INCLUDE[chacha-s390x.o]=..
 GENERATE[chacha-c64xplus.S]=asm/chacha-c64xplus.pl $(PERLASM_SCHEME)
 GENERATE[chacha-s390x.S]=asm/chacha-s390x.pl $(PERLASM_SCHEME)
+GENERATE[chacha-ia64.S]=asm/chacha-ia64.pl $(PERLASM_SCHEME)
diff --git a/crypto/poly1305/asm/poly1305-ia64.S b/crypto/poly1305/asm/poly1305-ia64.S
new file mode 100644 (file)
index 0000000..54d6454
--- /dev/null
@@ -0,0 +1,365 @@
+// ====================================================================
+// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
+// project.
+// ====================================================================
+//
+// Poly1305 for Itanium.
+//
+// January 2019
+//
+// Performance was reported to be ~2.1 cycles per byte on Itanium 2.
+// With exception for processors in 95xx family, which have higher
+// floating-point instructions' latencies and deliver ~2.6 cpb.
+// Comparison to compiler-generated code is not exactly fair, because
+// of different radixes. But just for reference, it was observed to be
+// >3x faster. Originally it was argued that floating-point base 2^32
+// implementation would be optimal. Upon closer look estimate for below
+// integer base 2^64 implementation turned to be approximately same on
+// Itanium 2. But floating-point code would be larger, and have higher
+// overhead, which would negatively affect small-block performance...
+
+#if defined(_HPUX_SOURCE)
+# if !defined(_LP64)
+#  define ADDP  addp4
+# else
+#  define ADDP  add
+# endif
+# define RUM    rum
+# define SUM    sum
+#else
+# define ADDP   add
+# define RUM    nop
+# define SUM    nop
+#endif
+
+.text
+.explicit
+
+.global        poly1305_init#
+.proc  poly1305_init#
+.align 64
+poly1305_init:
+       .prologue
+       .save           ar.pfs,r2
+{ .mmi;        alloc           r2=ar.pfs,2,0,0,0
+       cmp.eq          p6,p7=0,r33             }       // key == NULL?
+{ .mmi;        ADDP            r9=8,r32
+       ADDP            r10=16,r32
+       ADDP            r32=0,r32               };;
+       .body
+{ .mmi;        st8             [r32]=r0,24                     // ctx->h0 = 0
+       st8             [r9]=r0                         // ctx->h1 = 0
+(p7)   ADDP            r8=0,r33                }
+{ .mib;        st8             [r10]=r0                        // ctx->h2 = 0
+(p6)   mov             r8=0
+(p6)   br.ret.spnt     b0                      };;
+
+{ .mmi;        ADDP            r9=1,r33
+       ADDP            r10=2,r33
+       ADDP            r11=3,r33               };;
+{ .mmi;        ld1             r16=[r8],4                      // load key, little-endian
+       ld1             r17=[r9],4              }
+{ .mmi;        ld1             r18=[r10],4
+       ld1             r19=[r11],4             };;
+{ .mmi;        ld1             r20=[r8],4
+       ld1             r21=[r9],4              }
+{ .mmi;        ld1             r22=[r10],4
+       ld1             r23=[r11],4
+       and             r19=15,r19              };;
+{ .mmi;        ld1             r24=[r8],4
+       ld1             r25=[r9],4
+       and             r20=-4,r20              }
+{ .mmi;        ld1             r26=[r10],4
+       ld1             r27=[r11],4
+       and             r23=15,r23              };;
+{ .mmi;        ld1             r28=[r8],4
+       ld1             r29=[r9],4
+       and             r24=-4,r24              }
+{ .mmi;        ld1             r30=[r10],4
+       ld1             r31=[r11],4
+       and             r27=15,r27              };;
+
+{ .mii;        and             r28=-4,r28
+       dep             r16=r17,r16,8,8
+       dep             r18=r19,r18,8,8         };;
+{ .mii;        and             r31=15,r31
+       dep             r16=r18,r16,16,16
+       dep             r20=r21,r20,8,8         };;
+{ .mii;        dep             r16=r20,r16,32,16
+       dep             r22=r23,r22,8,8         };;
+{ .mii;        dep             r16=r22,r16,48,16
+       dep             r24=r25,r24,8,8         };;
+{ .mii;        dep             r26=r27,r26,8,8
+       dep             r28=r29,r28,8,8         };;
+{ .mii;        dep             r24=r26,r24,16,16
+       dep             r30=r31,r30,8,8         };;
+{ .mii;        st8             [r32]=r16,8                     // ctx->r0
+       dep             r24=r28,r24,32,16;;
+       dep             r24=r30,r24,48,16       };;
+{ .mii;        st8             [r32]=r24,8                     // ctx->r1
+       shr.u           r25=r24,2;;
+       add             r25=r25,r24             };;
+{ .mib; st8            [r32]=r25                       // ctx->s1
+       mov             r8=0
+       br.ret.sptk     b0                      };;
+.endp  poly1305_init#
+
+h0=r17;  h1=r18;  h2=r19;
+i0=r20;  i1=r21;
+HF0=f8;  HF1=f9;  HF2=f10;
+RF0=f11; RF1=f12; SF1=f13;
+
+.global        poly1305_blocks#
+.proc  poly1305_blocks#
+.align 64
+poly1305_blocks:
+       .prologue
+       .save           ar.pfs,r2
+{ .mii;        alloc           r2=ar.pfs,4,1,0,0
+       .save           ar.lc,r3
+       mov             r3=ar.lc
+       .save           pr,r36
+       mov             r36=pr                  }
+
+       .body
+{ .mmi;        ADDP            r8=0,r32
+       ADDP            r9=8,r32
+       and             r29=7,r33               };;
+{ .mmi;        ld8             h0=[r8],16
+       ld8             h1=[r9],16
+       and             r33=-8,r33              };;
+{ .mmi;        ld8             h2=[r8],16
+       ldf8            RF0=[r9],16
+       shr.u           r34=r34,4               };;
+{ .mmi;        ldf8            RF1=[r8],-32
+       ldf8            SF1=[r9],-32
+       cmp.ltu         p16,p17=1,r34           };;
+{ .mmi;
+(p16)  add             r34=-2,r34
+(p17)  mov             r34=0
+       ADDP            r10=0,r33               }
+{ .mii;        ADDP            r11=8,r33
+(p16)  mov             ar.ec=2
+(p17)  mov             ar.ec=1                 };;
+{ .mib;        RUM             1<<1                            // go little-endian
+       mov             ar.lc=r34
+       brp.loop.imp    .Loop,.Lcend-16         }
+
+{ .mmi;        cmp.eq          p8,p7=0,r29
+       cmp.eq          p9,p0=1,r29
+       cmp.eq          p10,p0=2,r29            }
+{ .mmi;        cmp.eq          p11,p0=3,r29
+       cmp.eq          p12,p0=4,r29
+       cmp.eq          p13,p0=5,r29            }
+{ .mmi;        cmp.eq          p14,p0=6,r29
+       cmp.eq          p15,p0=7,r29
+       add             r16=16,r10              };;
+
+{ .mmb;
+(p8)   ld8             i0=[r10],16                     // aligned input
+(p8)   ld8             i1=[r11],16
+(p8)   br.cond.sptk    .Loop                   };;
+
+       // align first block
+       .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
+{ .mmi;        (p7)    ld8             r14=[r10],24
+       (p7)    ld8             r15=[r11],24            }
+
+{ .mii;        (p7)    ld8             r16=[r16]
+               nop.i           0;;
+       (p15)   shrp            i0=r15,r14,56           }
+{ .mii;        (p15)   shrp            i1=r16,r15,56
+       (p14)   shrp            i0=r15,r14,48           }
+{ .mii;        (p14)   shrp            i1=r16,r15,48
+       (p13)   shrp            i0=r15,r14,40           }
+{ .mii;        (p13)   shrp            i1=r16,r15,40
+       (p12)   shrp            i0=r15,r14,32           }
+{ .mii;        (p12)   shrp            i1=r16,r15,32
+       (p11)   shrp            i0=r15,r14,24           }
+{ .mii;        (p11)   shrp            i1=r16,r15,24
+       (p10)   shrp            i0=r15,r14,16           }
+{ .mii;        (p10)   shrp            i1=r16,r15,16
+       (p9)    shrp            i0=r15,r14,8            }
+{ .mii;        (p9)    shrp            i1=r16,r15,8
+               mov             r14=r16                 };;
+
+.Loop:
+               .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
+{ .mmi;                add             h0=h0,i0
+               add             h1=h1,i1
+               add             h2=h2,r35               };;
+{ .mmi;                setf.sig        HF0=h0
+               cmp.ltu         p6,p0=h0,i0
+               cmp.ltu         p7,p0=h1,i1             };;
+{ .mmi;        (p6)    add             h1=1,h1;;
+               setf.sig        HF1=h1
+       (p6)    cmp.eq.or       p7,p0=0,h1              };;
+{ .mmi;        (p7)    add             h2=1,h2;;
+               setf.sig        HF2=h2                  };;
+
+{ .mfi;        (p16)   ld8             r15=[r10],16
+               xmpy.lu         f32=HF0,RF0             }
+{ .mfi;        (p16)   ld8             r16=[r11],16
+               xmpy.hu         f33=HF0,RF0             }
+{ .mfi;                xmpy.lu         f36=HF0,RF1             }
+{ .mfi;                xmpy.hu         f37=HF0,RF1             };;
+{ .mfi;                xmpy.lu         f34=HF1,SF1
+       (p15)   shrp            i0=r15,r14,56           }
+{ .mfi;                xmpy.hu         f35=HF1,SF1             }
+{ .mfi;                xmpy.lu         f38=HF1,RF0
+       (p15)   shrp            i1=r16,r15,56           }
+{ .mfi;                xmpy.hu         f39=HF1,RF0             }
+{ .mfi;                xmpy.lu         f40=HF2,SF1
+       (p14)   shrp            i0=r15,r14,48           }
+{ .mfi;                xmpy.lu         f41=HF2,RF0             };;
+
+{ .mmi;                getf.sig        r22=f32
+               getf.sig        r23=f33
+       (p14)   shrp            i1=r16,r15,48           }
+{ .mmi;                getf.sig        r24=f34
+               getf.sig        r25=f35
+       (p13)   shrp            i0=r15,r14,40           }
+{ .mmi;                getf.sig        r26=f36
+               getf.sig        r27=f37
+       (p13)   shrp            i1=r16,r15,40           }
+{ .mmi;                getf.sig        r28=f38
+               getf.sig        r29=f39
+       (p12)   shrp            i0=r15,r14,32           }
+{ .mmi;                getf.sig        r30=f40
+               getf.sig        r31=f41                 };;
+
+{ .mmi;                add             h0=r22,r24
+               add             r23=r23,r25
+       (p12)   shrp            i1=r16,r15,32           }
+{ .mmi;                add             h1=r26,r28
+               add             r27=r27,r29
+       (p11)   shrp            i0=r15,r14,24           };;
+{ .mmi;                cmp.ltu         p6,p0=h0,r24
+               cmp.ltu         p7,p0=h1,r28
+               add             r23=r23,r30             };;
+{ .mmi;        (p6)    add             r23=1,r23
+       (p7)    add             r27=1,r27
+       (p11)   shrp            i1=r16,r15,24           };;
+{ .mmi;                add             h1=h1,r23;;
+               cmp.ltu         p6,p7=h1,r23
+       (p10)   shrp            i0=r15,r14,16           };;
+{ .mmi;        (p6)    add             h2=r31,r27,1
+       (p7)    add             h2=r31,r27
+       (p10)   shrp            i1=r16,r15,16           };;
+
+{ .mmi;        (p8)    mov             i0=r15
+               and             r22=-4,h2
+               shr.u           r23=h2,2                };;
+{ .mmi;                add             r22=r22,r23
+               and             h2=3,h2
+       (p9)    shrp            i0=r15,r14,8            };;
+
+{ .mmi;                add             h0=h0,r22;;
+               cmp.ltu         p6,p0=h0,r22
+       (p9)    shrp            i1=r16,r15,8            };;
+{ .mmi;        (p8)    mov             i1=r16
+       (p6)    cmp.eq.unc      p7,p0=-1,h1
+       (p6)    add             h1=1,h1                 };;
+{ .mmb;        (p7)    add             h2=1,h2
+               mov             r14=r16
+               br.ctop.sptk    .Loop                   };;
+.Lcend:
+
+{ .mii;        SUM             1<<1                            // back to big-endian
+       mov             ar.lc=r3                };;
+
+{ .mmi;        st8             [r8]=h0,16
+       st8             [r9]=h1
+       mov             pr=r36,0x1ffff          };;
+{ .mmb;        st8             [r8]=h2
+       rum             1<<5
+       br.ret.sptk     b0                      };;
+.endp  poly1305_blocks#
+
+.global        poly1305_emit#
+.proc  poly1305_emit#
+.align 64
+poly1305_emit:
+       .prologue
+       .save           ar.pfs,r2
+{ .mmi;        alloc           r2=ar.pfs,3,0,0,0
+       ADDP            r8=0,r32
+       ADDP            r9=8,r32                };;
+
+       .body
+{ .mmi;        ld8             r16=[r8],16                     // load hash
+       ld8             r17=[r9]
+       ADDP            r10=0,r34               };;
+{ .mmi;        ld8             r18=[r8]
+       ld4             r24=[r10],8                     // load nonce
+       ADDP            r11=4,r34               };;
+
+{ .mmi;        ld4             r25=[r11],8
+       ld4             r26=[r10]
+       add             r20=5,r16               };;
+
+{ .mmi;        ld4             r27=[r11]
+       cmp.ltu         p6,p7=r20,r16
+       shl             r25=r25,32              };;
+{ .mmi;
+(p6)   add             r21=1,r17
+(p7)   add             r21=0,r17
+(p6)   cmp.eq.or.andcm p6,p7=-1,r17            };;
+{ .mmi;
+(p6)   add             r22=1,r18
+(p7)   add             r22=0,r18
+       shl             r27=r27,32              };;
+{ .mmi;        or              r24=r24,r25
+       or              r26=r26,r27
+       cmp.leu         p6,p7=4,r22             };;
+{ .mmi;
+(p6)   add             r16=r20,r24
+(p7)   add             r16=r16,r24
+(p6)   add             r17=r21,r26             };;
+{ .mii;
+(p7)   add             r17=r17,r26
+       cmp.ltu         p6,p7=r16,r24;;
+(p6)   add             r17=1,r17               };;
+
+{ .mmi;        ADDP            r8=0,r33
+       ADDP            r9=4,r33
+       shr.u           r20=r16,32              }
+{ .mmi;        ADDP            r10=8,r33
+       ADDP            r11=12,r33
+       shr.u           r21=r17,32              };;
+
+{ .mmi;        st1             [r8]=r16,1                      // write mac, little-endian
+       st1             [r9]=r20,1
+       shr.u           r16=r16,8               }
+{ .mii;        st1             [r10]=r17,1
+       shr.u           r20=r20,8
+       shr.u           r17=r17,8               }
+{ .mmi;        st1             [r11]=r21,1
+       shr.u           r21=r21,8               };;
+
+{ .mmi;        st1             [r8]=r16,1
+       st1             [r9]=r20,1
+       shr.u           r16=r16,8               }
+{ .mii;        st1             [r10]=r17,1
+       shr.u           r20=r20,8
+       shr.u           r17=r17,8               }
+{ .mmi;        st1             [r11]=r21,1
+       shr.u           r21=r21,8               };;
+
+{ .mmi;        st1             [r8]=r16,1
+       st1             [r9]=r20,1
+       shr.u           r16=r16,8               }
+{ .mii;        st1             [r10]=r17,1
+       shr.u           r20=r20,8
+       shr.u           r17=r17,8               }
+{ .mmi;        st1             [r11]=r21,1
+       shr.u           r21=r21,8               };;
+
+{ .mmi;        st1             [r8]=r16
+       st1             [r9]=r20                }
+{ .mmb;        st1             [r10]=r17
+       st1             [r11]=r21
+       br.ret.sptk     b0                      };;
+.endp  poly1305_emit#
+
+stringz        "Poly1305 for IA64, CRYPTOGAMS by \@dot-asm"