ARM assembler pack: reschedule instructions for dual-issue pipeline.
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256 block procedure for ARMv4. May 2007.
11
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
15
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
20
# Pick the output file: consume command-line arguments until one looks like
# a plain "name.ext" file name; arguments in front of it are discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# With no usable argument the generated code keeps going to the inherited
# STDOUT.  When a file was named, failing to open it is fatal: carrying on
# would silently send the assembly somewhere other than the requested file.
if (defined($output) && $output ne "") {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}
23
# Register allocation for the generated code.
#
# r0..r2 carry the three arguments (ctx, inp, len).  r0 and r2 are reused
# as the scratch registers $t0 and $t1 once the generated prologue has
# pushed the arguments on the stack; r3 accumulates T1 and r12 is a third
# scratch register.
($ctx,$inp,$len,$T1) = map { "r$_" } 0..3;
($t0,$t1) = ($ctx,$len);

# The eight working variables a..h live in r4..r11.  @V is the list that
# gets rotated after every generated round.
($A,$B,$C,$D,$E,$F,$G,$H) = map { "r$_" } 4..11;
@V = ($A,$B,$C,$D,$E,$F,$G,$H);

($t2,$Ktbl) = ("r12","r14");	# scratch / pointer into the K256 table

# Rotate and shift amounts.  The generated code applies all three Sigma
# amounts as rotations; for sigma the first two are rotations and the
# third is a logical shift right.
@Sigma0 = (2, 13, 22);
@Sigma1 = (6, 11, 25);
@sigma0 = (7, 18, 3);
@sigma1 = (17, 19, 10);
44
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one SHA-256 round to the global $code.
#
#   $i       round number; it selects the X[] slot ($i%16) and decides
#            whether the message-loading prologue is emitted ($i<16).
#   $a..$h   names of the registers holding the working variables a..h in
#            this round.  The caller rotates its register list after every
#            round, so no register-to-register moves are generated.
#
# On entry to the common part $T1 must hold the message word X[i]: for
# $i<16 it is loaded here, for later rounds BODY_16_XX computes it first.
# The instruction order below is the deliberate dual-issue scheduling this
# file exists for; do not reorder it casually.
45 sub BODY_00_15 {
46 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
47
# Rounds 0..15 only: read the next four input bytes one at a time and
# combine them into $T1 with the byte at the lowest address in bits 31..24
# (big-endian word); $inp is post-incremented by 4.  The backtick span is
# Perl that is evaluated by the s///e pass at the end of the script: it
# emits a store of the advanced $inp to [sp,#17*4] in round 15 only and
# nothing otherwise.  The save is needed because BODY_16_XX reuses the
# $inp register as a load target.
48 $code.=<<___ if ($i<16);
49         ldrb    $T1,[$inp,#3]                   @ $i
50         ldrb    $t2,[$inp,#2]
51         ldrb    $t1,[$inp,#1]
52         ldrb    $t0,[$inp],#4
53         orr     $T1,$T1,$t2,lsl#8
54         orr     $T1,$T1,$t1,lsl#16
55         orr     $T1,$T1,$t0,lsl#24
56         `"str   $inp,[sp,#17*4]"        if ($i==15)`
57 ___
# Common part of every round:
#   $t2  = next K256 word ($Ktbl is post-incremented)
#   X[i] = $T1 is stored into slot $i%16 of the 16-word area at sp
#   $T1 += Sigma1(e) + Ch(e,f,g) + h + K,  Ch computed as ((f^g)&e)^g
#   $h   = Sigma0(a) + Maj(a,b,c) + $T1,   Maj computed as ((a|b)&c)|(a&b)
#   $d  += $T1
# The register named by $h is overwritten as soon as its old value has been
# added to $T1.  $t2 still holds the K256 word when the round ends; the
# loop-exit test emitted after the last generated round reads it.
58 $code.=<<___;
59         ldr     $t2,[$Ktbl],#4                  @ *K256++
60         mov     $t0,$e,ror#$Sigma1[0]
61         str     $T1,[sp,#`$i%16`*4]
62         eor     $t0,$t0,$e,ror#$Sigma1[1]
63         eor     $t1,$f,$g
64         eor     $t0,$t0,$e,ror#$Sigma1[2]       @ Sigma1(e)
65         and     $t1,$t1,$e
66         add     $T1,$T1,$t0
67         eor     $t1,$t1,$g                      @ Ch(e,f,g)
68         add     $T1,$T1,$h
69         mov     $h,$a,ror#$Sigma0[0]
70         add     $T1,$T1,$t1
71         eor     $h,$h,$a,ror#$Sigma0[1]
72         add     $T1,$T1,$t2
73         eor     $h,$h,$a,ror#$Sigma0[2]         @ Sigma0(a)
74         orr     $t0,$a,$b
75         and     $t1,$a,$b
76         and     $t0,$t0,$c
77         add     $h,$h,$T1
78         orr     $t0,$t0,$t1                     @ Maj(a,b,c)
79         add     $d,$d,$T1
80         add     $h,$h,$t0
81 ___
82 }
83
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one round with $i>=16 to the global $code.  The
# arguments are the same as for BODY_00_15: the round number followed by the
# register names of the working variables a..h.
#
# The 16-word area at sp is used as a circular message schedule.  The code
# emitted here computes
#     $T1 = X[$i%16] + sigma0(X[($i+1)%16]) + X[($i+9)%16]
#                    + sigma1(X[($i+14)%16])
# where each sigma is two rotations and one logical shift right, XORed
# together.  BODY_00_15 is then called with the same arguments; because
# $i>=16 it skips its input-loading part, stores $T1 back into slot $i%16
# and emits the round proper.
#
# The $inp register is used here as a plain load target, so the input
# pointer is not live in a register during these rounds.
84 sub BODY_16_XX {
85 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
86
87 $code.=<<___;
88         ldr     $t1,[sp,#`($i+1)%16`*4]         @ $i
89         ldr     $t2,[sp,#`($i+14)%16`*4]
90         ldr     $T1,[sp,#`($i+0)%16`*4]
91         mov     $t0,$t1,ror#$sigma0[0]
92         ldr     $inp,[sp,#`($i+9)%16`*4]
93         eor     $t0,$t0,$t1,ror#$sigma0[1]
94         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
95         mov     $t1,$t2,ror#$sigma1[0]
96         add     $T1,$T1,$t0
97         eor     $t1,$t1,$t2,ror#$sigma1[1]
98         add     $T1,$T1,$inp
99         eor     $t1,$t1,$t2,lsr#$sigma1[2]      @ sigma1(X[i+14])
100         add     $T1,$T1,$t1
101 ___
# Emit the round itself; $T1 now holds the freshly computed schedule word.
102         &BODY_00_15(@_);
103 }
104
# Start of the generated assembly: the K256 constant table followed by the
# entry sequence of sha256_block_data_order.
#
# K256 holds 64 words (256 bytes) and is emitted immediately in front of the
# function.  The function finds it PC-relatively: r3 = pc-8 is the address
# of the entry label (see the "@ sha256_block_data_order" remark), and
# $Ktbl = r3-256 is K256.  Nothing may therefore be inserted between the
# table and the entry label.
#
# Entry sequence, in order:
#   - $len (a count of 64-byte blocks, hence lsl#6) is turned into a
#     pointer to the end of the input;
#   - ctx, inp, the end pointer, r4-r12 and lr are pushed;
#   - the eight state words are loaded from ctx into $A..$H;
#   - 16 words are reserved on the stack for the message schedule X[16].
# Resulting frame, in words from sp: [0..15] X[], [16] ctx, [17] inp,
# [18] end of input, followed by the saved r4-r12 and lr.
#
# .Loop is the top of the per-block loop.
# NOTE(review): the loop emitted after this prologue tests inp against the
# end pointer only after a block has been processed, so a block count of 0
# is not handled -- presumably callers never pass 0; confirm.
105 $code=<<___;
106 .text
107 .code   32
108
109 .type   K256,%object
110 .align  5
111 K256:
112 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
113 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
114 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
115 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
116 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
117 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
118 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
119 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
120 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
121 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
122 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
123 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
124 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
125 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
126 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
127 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
128 .size   K256,.-K256
129
130 .global sha256_block_data_order
131 .type   sha256_block_data_order,%function
132 sha256_block_data_order:
133         sub     r3,pc,#8                @ sha256_block_data_order
134         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
135         stmdb   sp!,{$ctx,$inp,$len,r4-r12,lr}
136         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
137         sub     $Ktbl,r3,#256           @ K256
138         sub     sp,sp,#16*4             @ alloca(X[16])
139 .Loop:
140 ___
# Rounds 0..15 are generated once per block; these are the rounds that read
# the message words from the input.  After each generated round the register
# list is rotated right by one position, so the next round sees the roles of
# a..h shifted without any register-to-register moves in the output.
$i = 0;
while ($i < 16) {
	BODY_00_15($i, @V);
	@V = ($V[-1], @V[0 .. $#V-1]);
	$i++;
}

# Sixteen further rounds form the body of the .Lrounds_16_xx loop; the
# generated code branches back to this label until the last K256 word has
# been used.
$code .= ".Lrounds_16_xx:\n";
while ($i < 32) {
	BODY_16_XX($i, @V);
	@V = ($V[-1], @V[0 .. $#V-1]);
	$i++;
}
# Tail of the generated function, appended after the 32 generated rounds.
#
# Loop control for .Lrounds_16_xx: $t2 still holds the K256 word used by the
# round just finished.  Its low byte is compared with 0xf2, the low byte of
# the final table entry 0xc67178f2; the words that end the earlier passes
# (0x14292967 and 0x106aa070) have different low bytes, so the branch is
# taken until all 64 table words have been consumed.
#
# Per-block finish: ctx is reloaded from [sp,#16*4], the eight state words
# stored there are added to $A..$H and the sums are written back.  inp and
# the end-of-input pointer are reloaded from the frame, $Ktbl is moved back
# by the 256 bytes it advanced, and .Loop is re-entered while inp differs
# from the end pointer.
#
# Return: the stack adjustment drops X[16] plus the three saved argument
# words, then r4-r12 and lr are restored.  When bit 0 of lr is clear the
# function returns with "moveq pc,lr", otherwise with "bx lr".  The "bx lr"
# text is replaced by its .word encoding in the substitution pass at the
# end of this script so that the output assembles with -march=armv4.
144 $code.=<<___;
145         and     $t2,$t2,#0xff
146         cmp     $t2,#0xf2
147         bne     .Lrounds_16_xx
148
149         ldr     $T1,[sp,#16*4]          @ pull ctx
150         ldr     $t0,[$T1,#0]
151         ldr     $t1,[$T1,#4]
152         ldr     $t2,[$T1,#8]
153         add     $A,$A,$t0
154         ldr     $t0,[$T1,#12]
155         add     $B,$B,$t1
156         ldr     $t1,[$T1,#16]
157         add     $C,$C,$t2
158         ldr     $t2,[$T1,#20]
159         add     $D,$D,$t0
160         ldr     $t0,[$T1,#24]
161         add     $E,$E,$t1
162         ldr     $t1,[$T1,#28]
163         add     $F,$F,$t2
164         ldr     $inp,[sp,#17*4]         @ pull inp
165         ldr     $t2,[sp,#18*4]          @ pull inp+len
166         add     $G,$G,$t0
167         add     $H,$H,$t1
168         stmia   $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
169         cmp     $inp,$t2
170         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
171         bne     .Loop
172
173         add     sp,sp,#`16+3`*4 @ destroy frame
174         ldmia   sp!,{r4-r12,lr}
175         tst     lr,#1
176         moveq   pc,lr                   @ be binary compatible with V4, yet
177         bx      lr                      @ interoperable with Thumb ISA:-)
178 .size   sha256_block_data_order,.-sha256_block_data_order
179 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
180 .align  2
181 ___
182
# Post-process the generated text and write it out.
#
# 1. Every `...` span is Perl: evaluate it and splice in the result (an
#    undefined or false result becomes empty text).  An expression that
#    fails to evaluate is fatal; substituting nothing for it would silently
#    drop part of an instruction from the output.
$code =~ s{\`([^\`]*)\`}{
	my $expr  = $1;
	my $value = eval $expr;
	die "cannot evaluate `$expr`: $@" if $@;
	$value;
}gem;

# 2. Replace "bx lr" by its instruction encoding.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4

# 3. Emit.  Buffered write errors (full disk, closed pipe) may only surface
#    when the handle is closed, so both the print and the close are checked.
print $code or die "can't write generated code: $!";
close STDOUT or die "error closing STDOUT: $!";	# enforce flush