{arm64|x86_64}cpuid.pl: add special 16-byte case to CRYPTO_memcmp.
author Andy Polyakov <appro@openssl.org>
Sun, 20 May 2018 10:13:16 +0000 (12:13 +0200)
committer Andy Polyakov <appro@openssl.org>
Sun, 3 Jun 2018 19:15:18 +0000 (21:15 +0200)
CRYPTO_memcmp is a must in GCM decrypt, and its general-purpose byte loop
takes quite a portion of execution time for short inputs; according to the
profiler, it takes more than GHASH for few-byte inputs. A special 16-byte
case takes it off the top-five list in the profiler output.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6312)

crypto/arm64cpuid.pl
crypto/x86_64cpuid.pl

index daa2b17..06c8add 100755 (executable)
@@ -115,6 +115,19 @@ OPENSSL_cleanse:
 CRYPTO_memcmp:
        eor     w3,w3,w3
        cbz     x2,.Lno_data    // len==0?
+       cmp     x2,#16
+       b.ne    .Loop_cmp
+       ldp     x8,x9,[x0]
+       ldp     x10,x11,[x1]
+       eor     x8,x8,x10
+       eor     x9,x9,x11
+       orr     x8,x8,x9
+       mov     x0,#1
+       cmp     x8,#0
+       csel    x0,xzr,x0,eq
+       ret
+
+.align 4
 .Loop_cmp:
        ldrb    w4,[x0],#1
        ldrb    w5,[x1],#1
index 513d005..6423e80 100644 (file)
@@ -271,6 +271,18 @@ CRYPTO_memcmp:
        xor     %r10,%r10
        cmp     \$0,$arg3
        je      .Lno_data
+       cmp     \$16,$arg3
+       jne     .Loop_cmp
+       mov     ($arg1),%r10
+       mov     8($arg1),%r11
+       mov     \$1,$arg3
+       xor     ($arg2),%r10
+       xor     8($arg2),%r11
+       or      %r11,%r10
+       cmovnz  $arg3,%rax
+       ret
+
+.align 16
 .Loop_cmp:
        mov     ($arg1),%r10b
        lea     1($arg1),$arg1