Add assembly support to ios64-cross.
[openssl.git] / crypto / sha / asm / sha1-armv8.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # SHA1 for ARMv8.
11 #
12 # Performance in cycles per processed byte and improvement coefficient
13 # over code generated with "default" compiler:
14 #
15 #               hardware-assisted       software(*)
16 # Apple A7      2.31                    4.13 (+14%)
17 # Cortex-A53    2.19                    8.73 (+108%)
18 # Cortex-A57    2.35                    7.88 (+74%)
19 #
20 # (*)   Software results are presented mostly for reference purposes.
21
# Command line: <flavour> <output>.  Both values are passed on unchanged
# to arm-xlate.pl, which receives everything this script prints.
$flavour = shift;
$output  = shift;

# Look for arm-xlate.pl next to this script first, then in
# ../../perlasm/ relative to the script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Redirect STDOUT into the translator.  Stop right away if the pipe cannot
# be created instead of printing into a handle that was never opened.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used by the integer-only code path:
#   $ctx/$inp/$num  the three function arguments (x0..x2)
#   @Xw/@Xx         16 registers (3..17 and 19) under their 32-bit and
#                   64-bit names, indexed modulo 16 by the round emitters
#   @V              the five working variables A..E (w20..w24)
#   $t0..$t2,$K     temporaries and the current round constant (w25..w28)
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
38
39
# Append the assembly for round $i of the first group (rounds 0..19) to
# $code.  $a..$e are the names of the registers that currently hold the
# five working variables.  Rounds 0..13 also fetch and byte-swap input
# words; from round 14 on the emitted code updates slot ($i+2)&15 of @Xw
# instead.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
my $odd=$i&1;

if ($i<14) {
    if ($odd) {
        # Odd rounds fix up the 64-bit register that was loaded during
        # the preceding even round.
        $code.=<<___;
#ifdef  __ARMEB__
        ror     @Xx[$i+1],@Xx[$i+1],#32
#else
        rev32   @Xx[$i+1],@Xx[$i+1]
#endif
___
    } else {
        # Even rounds split the current 64-bit register and load the
        # next pair of input words.
        $code.=<<___;
        lsr     @Xx[$i+1],@Xx[$i],#32
        ldr     @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
    }
    $code.=<<___;
        bic     $t0,$d,$b
        and     $t1,$c,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
        orr     $t0,$t0,$t1
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
} else {
    if ($i==14) {
        # Last 64-bit register still has to be split; nothing is left
        # to load.
        $code.=<<___;
        lsr     @Xx[$i+1],@Xx[$i],#32
___
    }
    if ($i==19) {
        # Switch $K to the constant used by the following group.
        $code.=<<___;
        movz    $K,#0xeba1
        movk    $K,#0x6ed9,lsl#16
___
    }
    $code.=<<___;
         eor    @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
        bic     $t0,$d,$b
        and     $t1,$c,$b
        ror     $t2,$a,#27
         eor    @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
        add     $d,$d,$K                // future e+=K
        orr     $t0,$t0,$t1
        add     $e,$e,$t2               // e+=rot(a,5)
         eor    @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
        ror     $b,$b,#2
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    @Xw[$j],@Xw[$j],#31
___
}
}
88
# Append the assembly for round $i of the third group (rounds 40..59) to
# $code.  $a..$e are the names of the registers that currently hold the
# five working variables.  Every round also updates slot ($i+2)&15 of
# @Xw from the slots 2, 8 and 13 positions further on (modulo 16).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;

# Resolve all @Xw operands up front so the template below reads linearly.
my $Xj=$Xw[$j];
my ($X2,$X8,$X13)=map($Xw[($j+$_)&15],(2,8,13));
my $Xi=$Xw[($i+1)&15];

if ($i==59) {
    # Switch $K to the constant used by the following group.
    $code.=<<___;
        movz    $K,#0xc1d6
        movk    $K,#0xca62,lsl#16
___
}

$code.=<<___;
        orr     $t0,$b,$c
        and     $t1,$b,$c
         eor    $Xj,$Xj,$X2
        ror     $t2,$a,#27
        and     $t0,$t0,$d
        add     $d,$d,$K                // future e+=K
         eor    $Xj,$Xj,$X8
        add     $e,$e,$t2               // e+=rot(a,5)
        orr     $t0,$t0,$t1
        ror     $b,$b,#2
         eor    $Xj,$Xj,$X13
        add     $d,$d,$Xi    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    $Xj,$Xj,#31
___
}
114
# Append the assembly for one round of the second or fourth group (rounds
# 20..39 and 60..79) to $code.  $a..$e are the names of the registers
# that currently hold the five working variables.  Rounds below 78 also
# update slot ($i+2)&15 of @Xw; rounds 78 and 79 instead start reloading
# the five state words from $ctx into @Xw[1..5].
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
my $Xi=$Xw[($i+1)&15];

if ($i==39) {
    # Switch $K to the constant used by the following group.
    $code.=<<___;
        movz    $K,#0xbcdc
        movk    $K,#0x8f1b,lsl#16
___
}

if ($i<78) {
    my $Xj=$Xw[$j];
    my ($X2,$X8,$X13)=map($Xw[($j+$_)&15],(2,8,13));

    $code.=<<___;
         eor    $Xj,$Xj,$X2
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
         eor    $Xj,$Xj,$X8
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
         eor    $Xj,$Xj,$X13
        add     $d,$d,$Xi    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    $Xj,$Xj,#31
___
}
elsif ($i==78) {
    $code.=<<___;
        ldp     @Xw[1],@Xw[2],[$ctx]
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        add     $d,$d,$Xi    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
}
elsif ($i==79) {
    $code.=<<___;
        ldp     @Xw[3],@Xw[4],[$ctx,#8]
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        ldr     @Xw[5],[$ctx,#16]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
}
}
159
# Integer-only entry point: test OPENSSL_armcap_P for ARMV8_SHA1 and
# branch to .Lv8_entry when it is set; otherwise save x19..x28, load the
# five state words and start the per-block loop.
$code.=<<___;
#include "arm_arch.h"

.text

.extern OPENSSL_armcap_P
.globl  sha1_block_data_order
.type   sha1_block_data_order,%function
.align  6
sha1_block_data_order:
        ldr     x16,.LOPENSSL_armcap_P
        adr     x17,.LOPENSSL_armcap_P
        add     x16,x16,x17
        ldr     w16,[x16]
        tst     w16,#ARMV8_SHA1
        b.ne    .Lv8_entry

        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]

        ldp     $A,$B,[$ctx]
        ldp     $C,$D,[$ctx,#8]
        ldr     $E,[$ctx,#16]

.Loop:
        ldr     @Xx[0],[$inp],#64
        movz    $K,#0x7999
        sub     $num,$num,#1
        movk    $K,#0x5a82,lsl#16
#ifdef  __ARMEB__
        ror     $Xx[0],@Xx[0],#32
#else
        rev32   @Xx[0],@Xx[0]
#endif
        add     $E,$E,$K                // warm it up
        add     $E,$E,@Xw[0]
___

# Unroll all 80 rounds.  Each group of 20 rounds has its own emitter
# (the second and fourth group share one).  After every round the list
# of register names is rotated by one position so that the next round
# finds the working variables under their new roles.
{
my @emit=(\&BODY_00_19,\&BODY_20_39,\&BODY_40_59,\&BODY_20_39);

for ($i=0;$i<80;$i++) {
    $emit[int($i/20)]->($i,@V);
    unshift(@V,pop(@V));
}
}

# Add the state words reloaded into @Xw[1..5] by the last two rounds,
# store the result, loop while blocks remain, then restore the saved
# registers and return.
$code.=<<___;
        add     $B,$B,@Xw[2]
        add     $C,$C,@Xw[3]
        add     $A,$A,@Xw[1]
        add     $D,$D,@Xw[4]
        add     $E,$E,@Xw[5]
        stp     $A,$B,[$ctx]
        stp     $C,$D,[$ctx,#8]
        str     $E,[$ctx,#16]
        cbnz    $num,.Loop

        ldp     x19,x20,[sp,#16]
        ldp     x21,x22,[sp,#32]
        ldp     x23,x24,[sp,#48]
        ldp     x25,x26,[sp,#64]
        ldp     x27,x28,[sp,#80]
        ldr     x29,[sp],#96
        ret
.size   sha1_block_data_order,.-sha1_block_data_order
___
# Code path built on the SHA-1 instructions (sha1c/sha1p/sha1m/sha1h/
# sha1su0/sha1su1).  The mnemonics are written out symbolically here; the
# post-processing loop at the end of the script turns them into .inst
# words.  The constant table and the OPENSSL_armcap_P offset used by both
# entry points are emitted at the end of this section.
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";

# Index into @Kxx.  It has to be set before the first template below,
# which already uses @Kxx[$j]; leaving it undefined only worked because
# an undefined index is treated as 0.
$j=0;

$code.=<<___;
.type   sha1_block_armv8,%function
.align  6
sha1_block_armv8:
.Lv8_entry:
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0

        adr     x4,.Lconst
        eor     $E,$E,$E
        ld1.32  {$ABCD},[$ctx],#16
        ld1.32  {$E}[0],[$ctx]
        sub     $ctx,$ctx,#16
        ld1.32  {@Kxx[0]-@Kxx[3]},[x4]

.Loop_hw:
        ld1     {@MSG[0]-@MSG[3]},[$inp],#64
        sub     $num,$num,#1
        rev32   @MSG[0],@MSG[0]
        rev32   @MSG[1],@MSG[1]

        add.i32 $W0,@Kxx[0],@MSG[0]
        rev32   @MSG[2],@MSG[2]
        orr     $ABCD_SAVE,$ABCD,$ABCD  // offload

        add.i32 $W1,@Kxx[0],@MSG[1]
        rev32   @MSG[3],@MSG[3]
        sha1h   $E1,$ABCD
        sha1c   $ABCD,$E,$W0            // 0
        add.i32 $W0,@Kxx[$j],@MSG[2]
        sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Steps 1..16.  Each step picks the c/p/m variant from its number,
# swaps the roles of $E0/$E1 and $W0/$W1, rotates @MSG, and moves on to
# the next entry of @Kxx after steps 2, 7 and 12.
for ($i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
        sha1h   $E0,$ABCD               // $i
        sha1$f  $ABCD,$E1,$W1
        add.i32 $W1,@Kxx[$j],@MSG[3]
        sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
        sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
        ($E0,$E1)=($E1,$E0);            ($W0,$W1)=($W1,$W0);
        push(@MSG,shift(@MSG));         $j++ if ((($i+3)%5)==0);
}
# Steps 17..19 no longer need message-schedule updates.
$code.=<<___;
        sha1h   $E0,$ABCD               // $i
        sha1p   $ABCD,$E1,$W1
        add.i32 $W1,@Kxx[$j],@MSG[3]

        sha1h   $E1,$ABCD               // 18
        sha1p   $ABCD,$E0,$W0

        sha1h   $E0,$ABCD               // 19
        sha1p   $ABCD,$E1,$W1

        add.i32 $E,$E,$E0
        add.i32 $ABCD,$ABCD,$ABCD_SAVE

        cbnz    $num,.Loop_hw

        st1.32  {$ABCD},[$ctx],#16
        st1.32  {$E}[0],[$ctx]

        ldr     x29,[sp],#16
        ret
.size   sha1_block_armv8,.-sha1_block_armv8
.align  6
.Lconst:
.long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     //K_00_19
.long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     //K_20_39
.long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     //K_40_59
.long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     //K_60_79
.LOPENSSL_armcap_P:
.quad   OPENSSL_armcap_P-.
.asciz  "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
.comm   OPENSSL_armcap_P,4,4
___
}}}
314
{   # Base encodings of the SHA-1 instructions; the register numbers are
    # OR-ed into them by unsha1().
    my  %opcode = (
        "sha1c"         => 0x5e000000,  "sha1p"         => 0x5e001000,
        "sha1m"         => 0x5e002000,  "sha1su0"       => 0x5e003000,
        "sha1h"         => 0x5e280800,  "sha1su1"       => 0x5e281800   );

    # Turn one SHA-1 instruction into a raw ".inst" directive.
    #
    #   $mnemonic  instruction name, a key of %opcode
    #   $arg       operand string with two or three q/v registers
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <operands>".  The first
    # register number goes into bits 0 and up, the second is shifted left
    # by 5 and the optional third by 16.  If the mnemonic is unknown or
    # the operands cannot be parsed, the instruction is returned as
    # "<mnemonic>\t<operands>" so that the assembler gets to see (and
    # reject) it, rather than being dropped or encoded with opcode 0.
    sub unsha1 {
        my ($mnemonic,$arg)=@_;

        return "$mnemonic\t$arg"
            unless (exists $opcode{$mnemonic} and
                    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/);

        # Two-operand forms (sha1h, sha1su1) have no third register.
        my ($rd,$rn,$rm)=($1,$2,defined($3)?$3:0);

        return sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
                        $mnemonic,$arg;
    }
}
330
# Post-process the generated text line by line and print it to STDOUT.
foreach(split("\n",$code)) {

        # Evaluate `...` expressions embedded in the templates.
        s/\`([^\`]*)\`/eval($1)/ge;

        # Replace SHA-1 mnemonics followed by q/v operands with .inst words.
        s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/ge;

        # Strip the first ".32"/".<letter>32" suffix; where one was
        # stripped, rewrite every ".16b" on the line as ".4s".
        s/\.\w?32\b//           and s/\.16b/\.4s/g;
        # ld1/st1 lines that address element [0] use ".s" instead of ".4s".
        m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;

        print $_,"\n";
}

# STDOUT may be a pipe to another process; closing it is the point where
# a failure of that process, or a write error, becomes visible.  Do not
# let it pass silently.
close STDOUT or die "error closing STDOUT: $!";