spelling fixes, just comments and readme.
[openssl.git] / crypto / sha / asm / sha1-armv8.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # SHA1 for ARMv8.
18 #
19 # Performance in cycles per processed byte and improvement coefficient
20 # over code generated with "default" compiler:
21 #
22 #               hardware-assisted       software(*)
23 # Apple A7      2.31                    4.13 (+14%)
24 # Cortex-A53    2.24                    8.03 (+97%)
25 # Cortex-A57    2.35                    7.88 (+74%)
26 # Denver        2.13                    3.97 (+0%)(**)
27 # X-Gene                                8.80 (+200%)
28 #
29 # (*)   Software results are presented mostly for reference purposes.
30 # (**)  Keep in mind that Denver relies on binary translation, which
31 #       optimizes compiler output at run-time.
32
# Command line: <flavour> <output>.  Both values are passed on verbatim
# to arm-xlate.pl, through which all generated code is piped.
$flavour = shift;
$output  = shift;

# Look for arm-xlate.pl next to this script first, then in the shared
# perlasm directory.  $dir keeps its trailing path separator and is the
# empty string when $0 has no directory part (previously it was taken
# from $1 even if the match had failed).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything printed to STDOUT is fed to the translator.
# Stop right away if the pipe cannot be set up instead of silently
# generating nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register names used by the integer-only code path.
($ctx,$inp,$num)=("x0","x1","x2");
# Sixteen registers holding the message schedule, under their 32-bit
# and 64-bit names.  Register 18 is skipped (presumably because it is
# the platform register -- confirm against the ABI in use).
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
# Working variables a..e, then three temporaries and the round constant.
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
49
50
# Append the code for integer-path round $i (0..19) to $code.
#
# $a..$e are the names of the registers currently acting as the five
# working variables.  The round function emitted here is
# F(b,c,d) = (d & ~b) | (c & b).  Around it the sub interleaves:
#   - $i < 14 : loading of further input (two 32-bit words per 64-bit
#               register) and its byte-order fix-up;
#   - $i == 14: the split of the last 64-bit input register;
#   - $i == 19: loading of the next round constant into $K;
#   - $i >= 14: the schedule update of @Xw[($i+2)&15].
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
my $even=!($i&1);

if ($i<14) {
    if ($even) {
        # upper half of the current register becomes the next word,
        # and the following 64 bits of input are fetched
        $code.=<<___;
        lsr     @Xx[$i+1],@Xx[$i],#32
        ldr     @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
    } else {
        # fix up the byte order of the register fetched one round ago
        $code.=<<___;
#ifdef  __ARMEB__
        ror     @Xx[$i+1],@Xx[$i+1],#32
#else
        rev32   @Xx[$i+1],@Xx[$i+1]
#endif
___
    }
    # plain round, no schedule update yet
    $code.=<<___;
        bic     $t0,$d,$b
        and     $t1,$c,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
        orr     $t0,$t0,$t1
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
} else {
    if ($i==14) {
        $code.=<<___;
        lsr     @Xx[$i+1],@Xx[$i],#32
___
    }
    if ($i==19) {
        $code.=<<___;
        movz    $K,#0xeba1
        movk    $K,#0x6ed9,lsl#16
___
    }
    # round with the schedule update (lines indented by one extra
    # space) interleaved
    $code.=<<___;
         eor    @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
        bic     $t0,$d,$b
        and     $t1,$c,$b
        ror     $t2,$a,#27
         eor    @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
        add     $d,$d,$K                // future e+=K
        orr     $t0,$t0,$t1
        add     $e,$e,$t2               // e+=rot(a,5)
         eor    @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
        ror     $b,$b,#2
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    @Xw[$j],@Xw[$j],#31
___
}
}
99
# Append the code for one integer-path round with the round function
# F(b,c,d) = ((b | c) & d) | (b & c) to $code.
#
# $i is the round number and $a..$e the register names currently
# acting as the working variables.  Every round also updates schedule
# word @Xw[($i+2)&15] (the lines indented by one extra space).  On
# $i == 59 the next round constant is loaded into $K first.
sub BODY_40_59 {
my $i=shift;
my ($a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;

if ($i==59) {
    $code.=<<___;
        movz    $K,#0xc1d6
        movk    $K,#0xca62,lsl#16
___
}

$code.=<<___;
        orr     $t0,$b,$c
        and     $t1,$b,$c
         eor    @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
        ror     $t2,$a,#27
        and     $t0,$t0,$d
        add     $d,$d,$K                // future e+=K
         eor    @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
        add     $e,$e,$t2               // e+=rot(a,5)
        orr     $t0,$t0,$t1
        ror     $b,$b,#2
         eor    @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    @Xw[$j],@Xw[$j],#31
___
}
125
# Append the code for one integer-path round with the round function
# F(b,c,d) = b ^ c ^ d to $code.
#
# $i is the round number and $a..$e the register names currently
# acting as the working variables.  Variants by round number:
#   - $i == 39: the next round constant is loaded into $K first;
#   - $i <  78: the round plus the update of schedule word
#               @Xw[($i+2)&15];
#   - $i == 78: no schedule update; two words are loaded from [$ctx]
#               into @Xw[1] and @Xw[2] instead;
#   - $i == 79: no schedule update and no "future e" additions; the
#               words at [$ctx,#8] and [$ctx,#16] are loaded into
#               @Xw[3..5].
sub BODY_20_39 {
my $i=shift;
my ($a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;

if ($i==39) {
    $code.=<<___;
        movz    $K,#0xbcdc
        movk    $K,#0x8f1b,lsl#16
___
}

if ($i<78) {
    $code.=<<___;
         eor    @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
         eor    @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
         eor    @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
         ror    @Xw[$j],@Xw[$j],#31
___
} elsif ($i==78) {
    $code.=<<___;
        ldp     @Xw[1],@Xw[2],[$ctx]
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        add     $d,$d,$K                // future e+=K
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        add     $d,$d,@Xw[($i+1)&15]    // future e+=X[i]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
} elsif ($i==79) {
    $code.=<<___;
        ldp     @Xw[3],@Xw[4],[$ctx,#8]
        eor     $t0,$d,$b
        ror     $t2,$a,#27
        eor     $t0,$t0,$c
        add     $e,$e,$t2               // e+=rot(a,5)
        ror     $b,$b,#2
        ldr     @Xw[5],[$ctx,#16]
        add     $e,$e,$t0               // e+=F(b,c,d)
___
}
}
170
# Entry point sha1_block_data_order.  The emitted prologue reads the
# capability word OPENSSL_armcap_P and branches to .Lv8_entry when the
# ARMV8_SHA1 bit is set; otherwise it falls through to the integer-only
# implementation: save x19-x28 and the frame record, load the five
# state words from [$ctx], then process one 64-byte block per .Loop
# iteration.
$code.=<<___;
#include "arm_arch.h"

.text

.extern OPENSSL_armcap_P
.globl  sha1_block_data_order
.type   sha1_block_data_order,%function
.align  6
sha1_block_data_order:
#ifdef  __ILP32__
        ldrsw   x16,.LOPENSSL_armcap_P
#else
        ldr     x16,.LOPENSSL_armcap_P
#endif
        adr     x17,.LOPENSSL_armcap_P
        add     x16,x16,x17
        ldr     w16,[x16]
        tst     w16,#ARMV8_SHA1
        b.ne    .Lv8_entry

        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]

        ldp     $A,$B,[$ctx]
        ldp     $C,$D,[$ctx,#8]
        ldr     $E,[$ctx,#16]

.Loop:
        ldr     @Xx[0],[$inp],#64
        movz    $K,#0x7999
        sub     $num,$num,#1
        movk    $K,#0x5a82,lsl#16
#ifdef  __ARMEB__
        ror     $Xx[0],@Xx[0],#32
#else
        rev32   @Xx[0],@Xx[0]
#endif
        add     $E,$E,$K                // warm it up
        add     $E,$E,@Xw[0]
___

# Emit the 80 rounds.  Each group of 20 consecutive rounds has its own
# generator (the first one is reused for the last group), and after
# every round the register names in @V are rotated by one position so
# that the roles of a..e move on without any register-to-register
# copies in the generated code.
$i=0;
while ($i<80) {
    my $emit=(\&BODY_00_19,\&BODY_20_39,\&BODY_40_59,\&BODY_20_39)[int($i/20)];
    $emit->($i,@V);
    @V=($V[-1],@V[0..$#V-1]);
    $i++;
}

# Epilogue: add the five words that the last two rounds loaded from
# [$ctx] into @Xw[1..5] to the working variables, store the result
# back, loop while blocks remain, then restore registers and return.
$code.=<<___;
        add     $B,$B,@Xw[2]
        add     $C,$C,@Xw[3]
        add     $A,$A,@Xw[1]
        add     $D,$D,@Xw[4]
        add     $E,$E,@Xw[5]
        stp     $A,$B,[$ctx]
        stp     $C,$D,[$ctx,#8]
        str     $E,[$ctx,#16]
        cbnz    $num,.Loop

        ldp     x19,x20,[sp,#16]
        ldp     x21,x22,[sp,#32]
        ldp     x23,x24,[sp,#48]
        ldp     x25,x26,[sp,#64]
        ldp     x27,x28,[sp,#80]
        ldr     x29,[sp],#96
        ret
.size   sha1_block_data_order,.-sha1_block_data_order
___
{{{
# Hardware-assisted code path (sha1_block_armv8, entered through
# .Lv8_entry), followed by the constant table, the PC-relative
# reference to OPENSSL_armcap_P and the identification string.
#
# Instructions are written with ".32"/".i32" suffixes and ".16b"
# register arrangements here; the output loop at the end of the file
# rewrites them, and turns the sha1* mnemonics into .inst words.
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";

# Index of the round constant vector in @Kxx that is currently being
# added to the message words.  It has to be defined before the first
# heredoc below, which already interpolates @Kxx[$j]; previously it was
# first assigned in the loop header further down, so that reference
# relied on an undefined value being treated as 0.
my $j=0;

$code.=<<___;
.type   sha1_block_armv8,%function
.align  6
sha1_block_armv8:
.Lv8_entry:
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0

        adr     x4,.Lconst
        eor     $E,$E,$E
        ld1.32  {$ABCD},[$ctx],#16
        ld1.32  {$E}[0],[$ctx]
        sub     $ctx,$ctx,#16
        ld1.32  {@Kxx[0]-@Kxx[3]},[x4]

.Loop_hw:
        ld1     {@MSG[0]-@MSG[3]},[$inp],#64
        sub     $num,$num,#1
        rev32   @MSG[0],@MSG[0]
        rev32   @MSG[1],@MSG[1]

        add.i32 $W0,@Kxx[0],@MSG[0]
        rev32   @MSG[2],@MSG[2]
        orr     $ABCD_SAVE,$ABCD,$ABCD  // offload

        add.i32 $W1,@Kxx[0],@MSG[1]
        rev32   @MSG[3],@MSG[3]
        sha1h   $E1,$ABCD
        sha1c   $ABCD,$E,$W0            // 0
        add.i32 $W0,@Kxx[$j],@MSG[2]
        sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Steps 1..16 (one step per four rounds).  $f picks sha1c/sha1p/sha1m
# by step number; the E and W register pairs are swapped and @MSG is
# rotated after every step, and $j moves on to the next constant
# vector after steps 2, 7 and 12.
for ($i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
        sha1h   $E0,$ABCD               // $i
        sha1$f  $ABCD,$E1,$W1
        add.i32 $W1,@Kxx[$j],@MSG[3]
        sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
        sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
        ($E0,$E1)=($E1,$E0);            ($W0,$W1)=($W1,$W0);
        push(@MSG,shift(@MSG));         $j++ if ((($i+3)%5)==0);
}
# Steps 17..19 need no further schedule updates; then the saved state
# is added back in and the loop repeats while blocks remain.
$code.=<<___;
        sha1h   $E0,$ABCD               // $i
        sha1p   $ABCD,$E1,$W1
        add.i32 $W1,@Kxx[$j],@MSG[3]

        sha1h   $E1,$ABCD               // 18
        sha1p   $ABCD,$E0,$W0

        sha1h   $E0,$ABCD               // 19
        sha1p   $ABCD,$E1,$W1

        add.i32 $E,$E,$E0
        add.i32 $ABCD,$ABCD,$ABCD_SAVE

        cbnz    $num,.Loop_hw

        st1.32  {$ABCD},[$ctx],#16
        st1.32  {$E}[0],[$ctx]

        ldr     x29,[sp],#16
        ret
.size   sha1_block_armv8,.-sha1_block_armv8
.align  6
.Lconst:
.long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     //K_00_19
.long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     //K_20_39
.long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     //K_40_59
.long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     //K_60_79
.LOPENSSL_armcap_P:
#ifdef  __ILP32__
.long   OPENSSL_armcap_P-.
#else
.quad   OPENSSL_armcap_P-.
#endif
.asciz  "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
.comm   OPENSSL_armcap_P,4,4
___
}}}
333
{   # Base encodings of the SHA-1 instructions; the register numbers
    # are OR-ed into them by unsha1() below.
    my  %opcode = (
        "sha1c"         => 0x5e000000,  "sha1p"         => 0x5e001000,
        "sha1m"         => 0x5e002000,  "sha1su0"       => 0x5e003000,
        "sha1h"         => 0x5e280800,  "sha1su1"       => 0x5e281800   );

    # Turn one SHA-1 instruction into an explicit ".inst" word.
    #
    #   $mnemonic  instruction name, a key of %opcode
    #   $arg       operand text: two or three q/v registers separated
    #              by commas; anything following them (such as a
    #              trailing comment) is carried over unchanged
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <arg>".  The first register
    # number goes into bits 0-4, the second is shifted left by 5 and the
    # third, when present, by 16.  If the mnemonic is unknown or the
    # operands cannot be parsed, the instruction is handed back in its
    # textual form instead of being replaced by an empty or bogus word,
    # so the problem surfaces in the assembler rather than being hidden.
    sub unsha1 {
        my ($mnemonic,$arg)=@_;

        if (exists $opcode{$mnemonic} &&
            $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/)
        {
            # two-operand forms (sha1h, sha1su1) have no third register
            my ($rd,$rn,$rm)=($1,$2,defined($3)?$3:0);

            return sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
                        $mnemonic,$arg;
        }

        return "$mnemonic\t$arg";
    }
}
349
# Post-process the accumulated code line by line and print it (STDOUT
# is the pipe into arm-xlate.pl at this point).
foreach(split("\n",$code)) {

        # evaluate the `...` arithmetic embedded by the generators,
        # e.g. load offsets
        s/\`([^\`]*)\`/eval($1)/ge;

        # encode SHA-1 instructions as .inst words
        s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/ge;

        # drop a ".32"/".i32"-style suffix from the mnemonic; on such
        # lines the ".16b" register arrangement becomes ".4s" ...
        s/\.\w?32\b//           and s/\.16b/\.4s/g;
        # ... and for ld1/st1 addressing lane [0], ".4s" becomes ".s"
        m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;

        print $_,"\n";
}

# Closing the pipe waits for the translator and reports its failure as
# well as any buffered write error; do not let either pass unnoticed.
close STDOUT or die "error closing STDOUT: $!";