x86_64 assembler pack to comply with updated styling x86_64-xlate.pl rules.
author: Andy Polyakov <appro@openssl.org>
Wed, 12 Nov 2008 08:15:52 +0000 (08:15 +0000)
committer: Andy Polyakov <appro@openssl.org>
Wed, 12 Nov 2008 08:15:52 +0000 (08:15 +0000)
crypto/aes/asm/aes-x86_64.pl
crypto/bn/asm/x86_64-gcc.c
crypto/rc4/asm/rc4-x86_64.pl
crypto/sha/asm/sha512-x86_64.pl
crypto/whrlpool/asm/wp-x86_64.pl
crypto/x86_64cpuid.pl

index da425388911a774ff7d80c4088f9e41cc22c1200..d041507343b6357af1a627aeb914c604fdbd0a9f 100755 (executable)
@@ -617,8 +617,7 @@ AES_encrypt:
        push    $key
 
        # pick Te4 copy which can't "overlap" with stack frame or key schedule
-       .picmeup        $sbox
-       lea     AES_Te+2048-.($sbox),$sbox
+       lea     .LAES_Te+2048(%rip),$sbox
        lea     768(%rsp),%rbp
        sub     $sbox,%rbp
        and     \$0x300,%rbp
@@ -1210,8 +1209,7 @@ AES_decrypt:
        push    $key
 
        # pick Td4 copy which can't "overlap" with stack frame or key schedule
-       .picmeup        $sbox
-       lea     AES_Td+2048-.($sbox),$sbox
+       lea     .LAES_Td+2048(%rip),$sbox
        lea     768(%rsp),%rbp
        sub     $sbox,%rbp
        and     \$0x300,%rbp
@@ -1292,8 +1290,7 @@ _x86_64_AES_set_encrypt_key:
        test    \$-1,%rdi
        jz      .Lbadpointer
 
-       .picmeup %rbp
-       lea     AES_Te-.(%rbp),%rbp
+       lea     .LAES_Te(%rip),%rbp
        lea     2048+128(%rbp),%rbp
 
        # prefetch Te4
@@ -1564,8 +1561,7 @@ AES_set_decrypt_key:
                cmp     %rsi,%rdi
        jne     .Linvert
 
-       .picmeup %rax
-       lea     AES_Te+2048+1024-.(%rax),%rax   # rcon
+       lea     .LAES_Te+2048+1024(%rip),%rax   # rcon
 
        mov     40(%rax),$mask80
        mov     48(%rax),$maskfe
@@ -1636,11 +1632,10 @@ AES_cbc_encrypt:
        cld
        mov     %r9d,%r9d       # clear upper half of enc
 
-       .picmeup $sbox
-       lea     AES_Te-.($sbox),$sbox
+       lea     .LAES_Te(%rip),$sbox
        cmp     \$0,%r9
        jne     .Lcbc_picked_te
-       lea     AES_Td-AES_Te($sbox),$sbox
+       lea     .LAES_Td(%rip),$sbox
 .Lcbc_picked_te:
 
        mov     OPENSSL_ia32cap_P(%rip),%eax
@@ -2066,9 +2061,8 @@ ___
 }
 
 $code.=<<___;
-.globl AES_Te
 .align 64
-AES_Te:
+.LAES_Te:
 ___
        &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
        &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
@@ -2275,9 +2269,8 @@ $code.=<<___;
        .long   0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
 ___
 $code.=<<___;
-.globl AES_Td
 .align 64
-AES_Td:
+.LAES_Td:
 ___
        &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
        &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
index f13f52dd853e2f8bdb795b5226d4137f105652ab..c4d941d0b4c7f1b36d59935abcd432f44d49f1d8 100644 (file)
@@ -182,7 +182,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 
        asm (
        "       subq    %2,%2           \n"
-       ".align 16                      \n"
+       ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       adcq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
@@ -205,7 +205,7 @@ BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 
        asm (
        "       subq    %2,%2           \n"
-       ".align 16                      \n"
+       ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       sbbq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
index c2af3109a069eae24fa3a13fa57e9992bbc99f44..959a67a868ad6a851ab4e1021f7fb84694694467 100755 (executable)
@@ -336,8 +336,7 @@ RC4_set_key:
 .type  RC4_options,\@function,0
 .align 16
 RC4_options:
-       .picmeup %rax
-       lea     .Lopts-.(%rax),%rax
+       lea     .Lopts(%rip),%rax
        mov     OPENSSL_ia32cap_P(%rip),%edx
        bt      \$20,%edx
        jnc     .Ldone
index b6252d31eca20192099be564471f91fb06eea6d3..10fd2abb65e3f07ec03363b5cd14af44218548ef 100755 (executable)
 # sha256_block:-( This is presumably because 64-bit shifts/rotates
 # apparently are not atomic instructions, but implemented in microcode.
 
-$output=shift;
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";
 
 if ($output =~ /512/) {
        $func="sha512_block_data_order";
@@ -196,8 +198,7 @@ $func:
        mov     %rdx,$_end              # save end pointer, "3rd" arg
        mov     %rbp,$_rsp              # save copy of %rsp
 
-       .picmeup $Tbl
-       lea     $TABLE-.($Tbl),$Tbl
+       lea     $TABLE(%rip),$Tbl
 
        mov     $SZ*0($ctx),$A
        mov     $SZ*1($ctx),$B
index 41bf3b2025bb2e6f9c8ba13e5bc69c67ab9b65b6..aaed353419c6693baecf4921df11975e982e0b4b 100644 (file)
@@ -71,8 +71,7 @@ $func:
        mov     %rdx,16(%rbx)
        mov     %rax,32(%rbx)           # saved stack pointer
 
-       .picmeup %rbp
-       lea     $table-.(%rbp),%rbp
+       lea     $table(%rip),%rbp
 
        xor     %rcx,%rcx
        xor     %rdx,%rdx
index e19ecdbbfc8fc6da68d47a9d5b1004b3ecb9c6cc..c54b9e3681c6cebba14a9509a43c13865079dccc 100644 (file)
 #!/usr/bin/env perl
 
-$output=shift;
-$masm=1 if ($output =~ /\.asm/);
-open STDOUT,">$output" || die "can't open $output: $!";
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 
-print<<___ if(defined($masm));
-_TEXT  SEGMENT
-PUBLIC OPENSSL_rdtsc
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
-PUBLIC OPENSSL_atomic_add
-ALIGN  16
-OPENSSL_atomic_add     PROC
-       mov     eax,DWORD PTR[rcx]
-\$Lspin:       lea     r8,DWORD PTR[rdx+rax]
-lock   cmpxchg DWORD PTR[rcx],r8d
-       jne     \$Lspin
-       mov     eax,r8d
-       cdqe    
-       ret
-OPENSSL_atomic_add     ENDP
-
-PUBLIC OPENSSL_wipe_cpu
-ALIGN  16
-OPENSSL_wipe_cpu       PROC
-       pxor    xmm0,xmm0
-       pxor    xmm1,xmm1
-       pxor    xmm2,xmm2
-       pxor    xmm3,xmm3
-       pxor    xmm4,xmm4
-       pxor    xmm5,xmm5
-       xor     rcx,rcx
-       xor     rdx,rdx
-       xor     r8,r8
-       xor     r9,r9
-       xor     r10,r10
-       xor     r11,r11
-       lea     rax,QWORD PTR[rsp+8]
-       ret
-OPENSSL_wipe_cpu       ENDP
-_TEXT  ENDS
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
 
-CRT\$XIU       SEGMENT
-EXTRN  OPENSSL_cpuid_setup:PROC
-DQ     OPENSSL_cpuid_setup
-CRT\$XIU       ENDS
+if ($win64)    { $arg1="%rcx"; $arg2="%rdx"; }
+else           { $arg1="%rdi"; $arg2="%rsi"; }
+print<<___;
+.extern                OPENSSL_cpuid_setup
+.section       .init
+       call    OPENSSL_cpuid_setup
 
-___
-print<<___ if(!defined($masm));
 .text
 
 .globl OPENSSL_atomic_add
-.type  OPENSSL_atomic_add,\@function
+.type  OPENSSL_atomic_add,\@abi-omnipotent
 .align 16
 OPENSSL_atomic_add:
-       movl    (%rdi),%eax
-.Lspin:        leaq    (%rsi,%rax),%r8
-lock;  cmpxchgl        %r8d,(%rdi)
+       movl    ($arg1),%eax
+.Lspin:        leaq    ($arg2,%rax),%r8
+       .byte   0xf0            # lock
+       cmpxchgl        %r8d,($arg1)
        jne     .Lspin
        movl    %r8d,%eax
-       .byte   0x48,0x98
+       .byte   0x48,0x98       # cltq/cdqe
        ret
 .size  OPENSSL_atomic_add,.-OPENSSL_atomic_add
 
-.globl OPENSSL_wipe_cpu
-.type  OPENSSL_wipe_cpu,\@function
-.align 16
-OPENSSL_wipe_cpu:
-       pxor    %xmm0,%xmm0
-       pxor    %xmm1,%xmm1
-       pxor    %xmm2,%xmm2
-       pxor    %xmm3,%xmm3
-       pxor    %xmm4,%xmm4
-       pxor    %xmm5,%xmm5
-       pxor    %xmm6,%xmm6
-       pxor    %xmm7,%xmm7
-       pxor    %xmm8,%xmm8
-       pxor    %xmm9,%xmm9
-       pxor    %xmm10,%xmm10
-       pxor    %xmm11,%xmm11
-       pxor    %xmm12,%xmm12
-       pxor    %xmm13,%xmm13
-       pxor    %xmm14,%xmm14
-       pxor    %xmm15,%xmm15
-       xorq    %rcx,%rcx
-       xorq    %rdx,%rdx
-       xorq    %rsi,%rsi
-       xorq    %rdi,%rdi
-       xorq    %r8,%r8
-       xorq    %r9,%r9
-       xorq    %r10,%r10
-       xorq    %r11,%r11
-       leaq    8(%rsp),%rax
-       ret
-.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
-
-.section       .init
-       call    OPENSSL_cpuid_setup
-
-___
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $output";
-
-print<<___;
-.text
-
 .globl OPENSSL_rdtsc
 .type  OPENSSL_rdtsc,\@abi-omnipotent
 .align 16
@@ -159,35 +86,91 @@ OPENSSL_ia32_cpuid:
 .size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 
 .globl  OPENSSL_cleanse
-.type   OPENSSL_cleanse,\@function,2
+.type   OPENSSL_cleanse,\@abi-omnipotent
 .align  16
 OPENSSL_cleanse:
        xor     %rax,%rax
-       cmp     \$15,%rsi
+       cmp     \$15,$arg2
        jae     .Lot
 .Little:
-       mov     %al,(%rdi)
-       sub     \$1,%rsi
-       lea     1(%rdi),%rdi
+       mov     %al,($arg1)
+       sub     \$1,$arg2
+       lea     1($arg1),$arg1
        jnz     .Little
        ret
 .align 16
 .Lot:
-       test    \$7,%rdi
+       test    \$7,$arg1
        jz      .Laligned
-       mov     %al,(%rdi)
-       lea     -1(%rsi),%rsi
-       lea     1(%rdi),%rdi
+       mov     %al,($arg1)
+       lea     -1($arg2),$arg2
+       lea     1($arg1),$arg1
        jmp     .Lot
 .Laligned:
-       mov     %rax,(%rdi)
-       lea     -8(%rsi),%rsi
-       test    \$-8,%rsi
-       lea     8(%rdi),%rdi
+       mov     %rax,($arg1)
+       lea     -8($arg2),$arg2
+       test    \$-8,$arg2
+       lea     8($arg1),$arg1
        jnz     .Laligned
-       cmp     \$0,%rsi
+       cmp     \$0,$arg2
        jne     .Little
        ret
 .size  OPENSSL_cleanse,.-OPENSSL_cleanse
 ___
+
+print<<___ if (!$win64);
+.globl OPENSSL_wipe_cpu
+.type  OPENSSL_wipe_cpu,\@abi-omnipotent
+.align 16
+OPENSSL_wipe_cpu:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       xorq    %rcx,%rcx
+       xorq    %rdx,%rdx
+       xorq    %rsi,%rsi
+       xorq    %rdi,%rdi
+       xorq    %r8,%r8
+       xorq    %r9,%r9
+       xorq    %r10,%r10
+       xorq    %r11,%r11
+       leaq    8(%rsp),%rax
+       ret
+.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+___
+print<<___ if ($win64);
+.globl OPENSSL_wipe_cpu
+.type  OPENSSL_wipe_cpu,\@abi-omnipotent
+.align 16
+OPENSSL_wipe_cpu:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       xorq    %rcx,%rcx
+       xorq    %rdx,%rdx
+       xorq    %r8,%r8
+       xorq    %r9,%r9
+       xorq    %r10,%r10
+       xorq    %r11,%r11
+       leaq    8(%rsp),%rax
+       ret
+.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+___
+
 close STDOUT;  # flush