chacha/asm/chacha-*.pl: fix typos in tail processing.
author Andy Polyakov <appro@openssl.org>
Sun, 21 Feb 2016 20:16:36 +0000 (21:16 +0100)
committer Andy Polyakov <appro@openssl.org>
Sat, 27 Feb 2016 20:09:02 +0000 (21:09 +0100)
RT#4323

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/chacha/asm/chacha-armv4.pl
crypto/chacha/asm/chacha-s390x.pl
crypto/chacha/asm/chacha-x86_64.pl

index 55ebc9e586475a35e313b74483eb4b8d5b6f2b03..6c207557a5066ad094adec796458c3bf6202004d 100755 (executable)
@@ -440,9 +440,9 @@ $code.=<<___;
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
 # ifdef        __thumb2__
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
 # ifdef        __thumb2__
-       it      hi
+        it     ne
 # endif
 # endif
-        ldrhi  @t[0],[sp,#4*(32+2)]    @ re-load len
+        ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
 # ifdef        __thumb2__
        itt     hs
 # endif
 # ifdef        __thumb2__
        itt     hs
 # endif
@@ -584,9 +584,9 @@ ___
 }
 $code.=<<___;
 # ifdef        __thumb2__
 }
 $code.=<<___;
 # ifdef        __thumb2__
-       it      hi
+       it      ne
 # endif
 # endif
-       ldrhi   @t[0],[sp,#4*(32+2)]            @ re-load len
+       ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
 # ifdef        __thumb2__
        it      hs
 # endif
 # ifdef        __thumb2__
        it      hs
 # endif
@@ -598,15 +598,15 @@ $code.=<<___;
 
 .Ltail:
        ldr     r12,[sp,#4*(32+1)]      @ load inp
 
 .Ltail:
        ldr     r12,[sp,#4*(32+1)]      @ load inp
-       add     @t[2],sp,#4*(0)
+       add     @t[1],sp,#4*(0)
        ldr     r14,[sp,#4*(32+0)]      @ load out
 
 .Loop_tail:
        ldr     r14,[sp,#4*(32+0)]      @ load out
 
 .Loop_tail:
-       ldrb    @t[0],[@t[2]],#1        @ read buffer on stack
-       ldrb    @t[1],[r12],#1          @ read input
-       subs    @t[3],@t[3],#1
-       eor     @t[0],@t[0],@t[1]
-       strb    @t[0],[r14],#1          @ store output
+       ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
+       ldrb    @t[3],[r12],#1          @ read input
+       subs    @t[0],@t[0],#1
+       eor     @t[3],@t[3],@t[2]
+       strb    @t[3],[r14],#1          @ store output
        bne     .Loop_tail
 
 .Ldone:
        bne     .Loop_tail
 
 .Ldone:
@@ -1120,7 +1120,7 @@ $code.=<<___;
 # endif
        stmia           @t[0],{@x[0]-@x[7]}
         add            @t[2],sp,#4*(0)
 # endif
        stmia           @t[0],{@x[0]-@x[7]}
         add            @t[2],sp,#4*(0)
-        sub            @t[3],@t[0],#64*3       @ len-=64*3
+        sub            @t[3],@t[3],#64*3       @ len-=64*3
 
 .Loop_tail_neon:
        ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
 
 .Loop_tail_neon:
        ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
index e637dc2f4fbd2ddde844ac91a9550703a0b9ea92..00e4a146b250b60d42991e4f146163fe05e756f8 100755 (executable)
@@ -257,11 +257,11 @@ $code.=<<___;
         st     @x[1],4*9(@t[0])
         x      @x[3],4*11(%r14)
         st     @x[2],4*10(@t[0])
         st     @x[1],4*9(@t[0])
         x      @x[3],4*11(%r14)
         st     @x[2],4*10(@t[0])
-       la      %r14,64(%r14)
         st     @x[3],4*11(@t[0])
 
        cl${g}r %r14,@t[1]                      # done yet?
         st     @x[3],4*11(@t[0])
 
        cl${g}r %r14,@t[1]                      # done yet?
-       jle     .Loop_outer
+       la      %r14,64(%r14)
+       jl      .Loop_outer
 
 .Ldone:
        xgr     %r0,%r0
 
 .Ldone:
        xgr     %r0,%r0
@@ -291,7 +291,7 @@ $code.=<<___;
        lrvr    @x[1],@x[1]
        lrvr    @x[2],@x[2]
        lrvr    @x[3],@x[3]
        lrvr    @x[1],@x[1]
        lrvr    @x[2],@x[2]
        lrvr    @x[3],@x[3]
-       stm     @x[0],@x[3],$stdframe+4*8+4*8($sp)
+       stm     @x[0],@x[3],$stdframe+4*8($sp)
 
 .Loop_tail:
        llgc    @x[4],0(@x[6],%r14)
 
 .Loop_tail:
        llgc    @x[4],0(@x[6],%r14)
index 107fc70819d559b2d9f764fec7b8f14bdd2ff7e4..4b36b5825c4508ef20364d1052f376b6c4842152 100755 (executable)
@@ -1544,13 +1544,13 @@ $code.=<<___;
        je              .Ldone4xop
 
        lea             0x40($inp),$inp         # inp+=64*3
        je              .Ldone4xop
 
        lea             0x40($inp),$inp         # inp+=64*3
-       vmovdqa         $xa2,0x00(%rsp)
+       vmovdqa         $xa3,0x00(%rsp)
        xor             %r10,%r10
        xor             %r10,%r10
-       vmovdqa         $xb2,0x10(%rsp)
+       vmovdqa         $xb3,0x10(%rsp)
        lea             0x40($out),$out         # out+=64*3
        lea             0x40($out),$out         # out+=64*3
-       vmovdqa         $xc2,0x20(%rsp)
+       vmovdqa         $xc3,0x20(%rsp)
        sub             \$192,$len              # len-=64*3
        sub             \$192,$len              # len-=64*3
-       vmovdqa         $xd2,0x30(%rsp)
+       vmovdqa         $xd3,0x30(%rsp)
 
 .Loop_tail4xop:
        movzb           ($inp,%r10),%eax
 
 .Loop_tail4xop:
        movzb           ($inp,%r10),%eax