{chacha|poly1305}/asm/*-x64.pl: harmonize clang version detection.
[openssl.git] / crypto / poly1305 / asm / poly1305-mips.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Poly1305 hash for MIPS64.
18 #
19 # May 2016
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone.
22 #
23 #               IALU/gcc
24 # R1x000        5.64/+120%      (big-endian)
25 # Octeon II     3.80/+280%      (little-endian)
26
27 ######################################################################
28 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
29 # widely used. Then there is a new contender: NUBI. It appears that if
30 # one picks the latter, it's possible to arrange code in an ABI-neutral
31 # manner. Therefore let's stick to the NUBI register layout:
32 #
# Symbolic register names.  Each variable is set to the literal text "$N"
# (a dollar sign followed by the register number) so that interpolating it
# into the assembly heredocs below yields a numeric register operand.
# Numbering follows the NUBI layout: $t0-$t2 are 2,24,25; $a0-$a7 are 4-11;
# $s0-$s11 are 12-23; $gp is 3 and $tp,$sp,$fp,$ra are 28-31.
33 ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
34 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
35 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
36 ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
37 #
38 # The return value is placed in $a0. The following coding rules
39 # facilitate interoperability:
40 #
41 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
42 #   excluded from the rule, because it's specified volatile];
43 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
44 #   old code];
45 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
46 #
47 # For reference here is register layout for N32/64 MIPS ABIs:
48 #
49 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
50 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
51 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
52 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
53 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
54 #
55 # <appro@openssl.org>
56 #
57 ######################################################################
58
# The first command-line argument selects the ABI flavour.  When it is
# omitted the default "o32" is used, which the check below rejects, so a
# 64-bit capable flavour has to be named explicitly.
59 $flavour = shift || "o32"; # flavour names are o32,n32,64,nubi32,nubi64; only those matching /64|n32/ (n32, 64, nubi64) are accepted below

61 die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Register that carries the return value: $a0 under NUBI, otherwise $t0,
# which is register 2 (listed as $v0 in the N32/64 reference layout above).
63 $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Bit mask for the .mask directive of poly1305_blocks_internal:
# 0x0003f000 has bits 12-17 set, 0x00030000 has bits 16-17 set.
64 $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Argument registers of the generated functions, followed by the scratch
# registers used for input words and temporaries.
66 ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
67 ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
68
# Emit the file preamble and poly1305_init($ctx,$inp).
#
# Preamble: includes mips_arch.h and defines MSB/LSB as the byte offsets
# (0/7 on MIPSEB, 7/0 otherwise) used by the ldl/ldr load pairs.
#
# poly1305_init:
#   - stores zero to the three 64-bit words at 0, 8 and 16 of $ctx;
#   - if $inp is zero, skips the key setup and goes straight to .Lno_key;
#   - otherwise loads 16 bytes from $inp as two 64-bit words (plain ld on
#     MIPS64R6, ldl/ldr pairs elsewhere) and byte-swaps them on MIPSEB
#     (dsbh/dshd on MIPS64R2, a shift-and-mask sequence otherwise);
#   - masks the first word with 0x0ffffffc0fffffff and the second with
#     0x0ffffffc0ffffffc, then stores them at 24 and 32 of $ctx;
#   - stores s1 = r1 + (r1 >> 2) at 40 of $ctx;
#   - loads 0 into the return register $v0 and returns.
69 $code.=<<___;
70 #include "mips_arch.h"
71
72 #ifdef MIPSEB
73 # define MSB 0
74 # define LSB 7
75 #else
76 # define MSB 7
77 # define LSB 0
78 #endif
79
80 .text
81 .set    noat
82 .set    noreorder
83
84 .align  5
85 .globl  poly1305_init
86 .ent    poly1305_init
87 poly1305_init:
88         .frame  $sp,0,$ra
89         .set    reorder
90
91         sd      $zero,0($ctx)
92         sd      $zero,8($ctx)
93         sd      $zero,16($ctx)
94
95         beqz    $inp,.Lno_key
96
97 #if defined(_MIPS_ARCH_MIPS64R6)
98         ld      $in0,0($inp)
99         ld      $in1,8($inp)
100 #else
101         ldl     $in0,0+MSB($inp)
102         ldl     $in1,8+MSB($inp)
103         ldr     $in0,0+LSB($inp)
104         ldr     $in1,8+LSB($inp)
105 #endif
106 #ifdef  MIPSEB
107 # if defined(_MIPS_ARCH_MIPS64R2)
108         dsbh    $in0,$in0               # byte swap
109          dsbh   $in1,$in1
110         dshd    $in0,$in0
111          dshd   $in1,$in1
112 # else
113         ori     $tmp0,$zero,0xFF
114         dsll    $tmp2,$tmp0,32
115         or      $tmp0,$tmp2             # 0x000000FF000000FF
116
117         and     $tmp1,$in0,$tmp0        # byte swap
118          and    $tmp3,$in1,$tmp0
119         dsrl    $tmp2,$in0,24
120          dsrl   $tmp4,$in1,24
121         dsll    $tmp1,24
122          dsll   $tmp3,24
123         and     $tmp2,$tmp0
124          and    $tmp4,$tmp0
125         dsll    $tmp0,8                 # 0x0000FF000000FF00
126         or      $tmp1,$tmp2
127          or     $tmp3,$tmp4
128         and     $tmp2,$in0,$tmp0
129          and    $tmp4,$in1,$tmp0
130         dsrl    $in0,8
131          dsrl   $in1,8
132         dsll    $tmp2,8
133          dsll   $tmp4,8
134         and     $in0,$tmp0
135          and    $in1,$tmp0
136         or      $tmp1,$tmp2
137          or     $tmp3,$tmp4
138         or      $in0,$tmp1
139          or     $in1,$tmp3
140         dsrl    $tmp1,$in0,32
141          dsrl   $tmp3,$in1,32
142         dsll    $in0,32
143          dsll   $in1,32
144         or      $in0,$tmp1
145          or     $in1,$tmp3
146 # endif
147 #endif
148         li      $tmp0,1
149         dsll    $tmp0,32
150         daddiu  $tmp0,-63
151         dsll    $tmp0,28
152         daddiu  $tmp0,-1                # 0ffffffc0fffffff
153
154         and     $in0,$tmp0
155         daddiu  $tmp0,-3                # 0ffffffc0ffffffc
156         and     $in1,$tmp0
157
158         sd      $in0,24($ctx)
159         dsrl    $tmp0,$in1,2
160         sd      $in1,32($ctx)
161         daddu   $tmp0,$in1              # s1 = r1 + (r1 >> 2)
162         sd      $tmp0,40($ctx)
163
164 .Lno_key:
165         li      $v0,0                   # return 0
166         jr      $ra
167 .end    poly1305_init
168 ___
# Emit poly1305_blocks($ctx,$inp,$len,$padbit) and its worker
# poly1305_blocks_internal.
#
# poly1305_blocks shifts $len right by 4 to get the number of complete
# 16-byte blocks and returns immediately when that is zero; otherwise it
# branches to poly1305_blocks_internal.
#
# poly1305_blocks_internal:
#   - allocates a 6*8-byte frame and saves $s4 and $s5; the NUBI flavours
#     additionally save $s0-$s3, matching $SAVED_REGS_MASK;
#   - loads the hash words h0,h1,h2 from 0/8/16 of $ctx and the key
#     values r0, r1 and s1 = r1 + (r1 >> 2) from 24/32/40 of $ctx;
#   - for every block: loads 16 input bytes (byte-swapped on MIPSEB), adds
#     them to h0:h1 and $padbit to h2, multiplies by the key using the
#     dmultu/mflo/mfhi macros, and folds the bits of d2 above the low two
#     back into h0 (the "final reduction" sequence);
#   - stores h0,h1,h2 back to $ctx, restores the saved registers and
#     returns, releasing the frame in the jr delay slot.
#
# Register aliases are lexical to this block.  The alias holding the
# precomputed s1 value is deliberately named $rs1 and NOT $s1: a lexical
# $s1 would hide the global register name $s1 ("\$13") for the rest of the
# block, and the NUBI prologue/epilogue lines "sd/ld $s1,8($sp)" would then
# save and restore register 17 a second time while leaving the callee-saved
# register 13 (used here as $h1) unprotected.
169 {
170 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
171    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
172
173 $code.=<<___;
174 .align  5
175 .globl  poly1305_blocks
176 .ent    poly1305_blocks
177 poly1305_blocks:
178         .set    noreorder
179         dsrl    $len,4                  # number of complete blocks
180         bnez    $len,poly1305_blocks_internal
181         nop
182         jr      $ra
183         nop
184 .end    poly1305_blocks
185
186 .align  5
187 .ent    poly1305_blocks_internal
188 poly1305_blocks_internal:
189         .frame  $sp,6*8,$ra
190         .mask   $SAVED_REGS_MASK,-8
191         .set    noreorder
192         dsubu   $sp,6*8
193         sd      $s5,40($sp)
194         sd      $s4,32($sp)
195 ___
196 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi prologue
197         sd      $s3,24($sp)
198         sd      $s2,16($sp)
199         sd      $s1,8($sp)
200         sd      $s0,0($sp)
201 ___
202 $code.=<<___;
203         .set    reorder
204
205         ld      $h0,0($ctx)             # load hash value
206         ld      $h1,8($ctx)
207         ld      $h2,16($ctx)
208
209         ld      $r0,24($ctx)            # load key
210         ld      $r1,32($ctx)
211         ld      $rs1,40($ctx)
212
213 .Loop:
214 #if defined(_MIPS_ARCH_MIPS64R6)
215         ld      $in0,0($inp)            # load input
216         ld      $in1,8($inp)
217 #else
218         ldl     $in0,0+MSB($inp)        # load input
219         ldl     $in1,8+MSB($inp)
220         ldr     $in0,0+LSB($inp)
221         ldr     $in1,8+LSB($inp)
222 #endif
223         daddiu  $len,-1
224         daddiu  $inp,16
225 #ifdef  MIPSEB
226 # if defined(_MIPS_ARCH_MIPS64R2)
227         dsbh    $in0,$in0               # byte swap
228          dsbh   $in1,$in1
229         dshd    $in0,$in0
230          dshd   $in1,$in1
231 # else
232         ori     $tmp0,$zero,0xFF
233         dsll    $tmp2,$tmp0,32
234         or      $tmp0,$tmp2             # 0x000000FF000000FF
235
236         and     $tmp1,$in0,$tmp0        # byte swap
237          and    $tmp3,$in1,$tmp0
238         dsrl    $tmp2,$in0,24
239          dsrl   $tmp4,$in1,24
240         dsll    $tmp1,24
241          dsll   $tmp3,24
242         and     $tmp2,$tmp0
243          and    $tmp4,$tmp0
244         dsll    $tmp0,8                 # 0x0000FF000000FF00
245         or      $tmp1,$tmp2
246          or     $tmp3,$tmp4
247         and     $tmp2,$in0,$tmp0
248          and    $tmp4,$in1,$tmp0
249         dsrl    $in0,8
250          dsrl   $in1,8
251         dsll    $tmp2,8
252          dsll   $tmp4,8
253         and     $in0,$tmp0
254          and    $in1,$tmp0
255         or      $tmp1,$tmp2
256          or     $tmp3,$tmp4
257         or      $in0,$tmp1
258          or     $in1,$tmp3
259         dsrl    $tmp1,$in0,32
260          dsrl   $tmp3,$in1,32
261         dsll    $in0,32
262          dsll   $in1,32
263         or      $in0,$tmp1
264          or     $in1,$tmp3
265 # endif
266 #endif
267         daddu   $h0,$in0                # accumulate input
268         daddu   $h1,$in1
269         sltu    $tmp0,$h0,$in0
270         sltu    $tmp1,$h1,$in1
271         daddu   $h1,$tmp0
272
273         dmultu  ($r0,$h0)               # h0*r0
274          daddu  $h2,$padbit
275          sltu   $tmp0,$h1,$tmp0
276         mflo    ($d0,$r0,$h0)
277         mfhi    ($d1,$r0,$h0)
278
279         dmultu  ($rs1,$h1)              # h1*5*r1
280          daddu  $tmp0,$tmp1
281          daddu  $h2,$tmp0
282         mflo    ($tmp0,$rs1,$h1)
283         mfhi    ($tmp1,$rs1,$h1)
284
285         dmultu  ($r1,$h0)               # h0*r1
286          daddu  $d0,$tmp0
287          daddu  $d1,$tmp1
288         mflo    ($tmp2,$r1,$h0)
289         mfhi    ($d2,$r1,$h0)
290          sltu   $tmp0,$d0,$tmp0
291          daddu  $d1,$tmp0
292
293         dmultu  ($r0,$h1)               # h1*r0
294          daddu  $d1,$tmp2
295          sltu   $tmp2,$d1,$tmp2
296         mflo    ($tmp0,$r0,$h1)
297         mfhi    ($tmp1,$r0,$h1)
298          daddu  $d2,$tmp2
299
300         dmultu  ($rs1,$h2)              # h2*5*r1
301          daddu  $d1,$tmp0
302          daddu  $d2,$tmp1
303         mflo    ($tmp2,$rs1,$h2)
304
305         dmultu  ($r0,$h2)               # h2*r0
306          sltu   $tmp0,$d1,$tmp0
307          daddu  $d2,$tmp0
308         mflo    ($tmp3,$r0,$h2)
309
310         daddu   $d1,$tmp2
311         daddu   $d2,$tmp3
312         sltu    $tmp2,$d1,$tmp2
313         daddu   $d2,$tmp2
314
315         li      $tmp0,-4                # final reduction
316         and     $tmp0,$d2
317         dsrl    $tmp1,$d2,2
318         andi    $h2,$d2,3
319         daddu   $tmp0,$tmp1
320         daddu   $h0,$d0,$tmp0
321         sltu    $tmp0,$h0,$tmp0
322         daddu   $h1,$d1,$tmp0
323         sltu    $tmp0,$h1,$tmp0
324         daddu   $h2,$h2,$tmp0
325
326         bnez    $len,.Loop
327
328         sd      $h0,0($ctx)             # store hash value
329         sd      $h1,8($ctx)
330         sd      $h2,16($ctx)
331
332         .set    noreorder
333         ld      $s5,40($sp)             # epilogue
334         ld      $s4,32($sp)
335 ___
336 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi epilogue
337         ld      $s3,24($sp)
338         ld      $s2,16($sp)
339         ld      $s1,8($sp)
340         ld      $s0,0($sp)
341 ___
342 $code.=<<___;
343         jr      $ra
344         daddu   $sp,6*8
345 .end    poly1305_blocks_internal
346 ___
347 }
# Emit poly1305_emit($ctx,$mac,$nonce) and the trailing .rdata identifier
# string.
#
# poly1305_emit:
#   - loads the three hash words from 0, 8 and 16 of $ctx;
#   - computes hash + 5 with carry propagation into the third word, shifts
#     that word right by 2 and negates it to build a select mask, then
#     keeps either hash + 5 or the unmodified hash in $in0:$in1
#     (NOTE(review): the mask is all-ones/all-zeros only if the shifted
#     third word is 0 or 1 - confirm the range guaranteed by the caller);
#   - reads the nonce as four 32-bit words with lwu, combines them into two
#     64-bit values and adds them to $in0:$in1 with carry;
#   - writes the 16 result bytes to $mac one byte at a time, least
#     significant byte of $in0 first.
348 {
# Argument registers, local to this block.
349 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
350
351 $code.=<<___;
352 .align  5
353 .globl  poly1305_emit
354 .ent    poly1305_emit
355 poly1305_emit:
356         .frame  $sp,0,$ra
357         .set    reorder
358
359         ld      $tmp0,0($ctx)
360         ld      $tmp1,8($ctx)
361         ld      $tmp2,16($ctx)
362
363         daddiu  $in0,$tmp0,5            # compare to modulus
364         sltiu   $tmp3,$in0,5
365         daddu   $in1,$tmp1,$tmp3
366         sltu    $tmp3,$in1,$tmp3
367         daddu   $tmp2,$tmp2,$tmp3
368
369         dsrl    $tmp2,2                 # see if it carried/borrowed
370         dsubu   $tmp2,$zero,$tmp2
371         nor     $tmp3,$zero,$tmp2
372
373         and     $in0,$tmp2
374         and     $tmp0,$tmp3
375         and     $in1,$tmp2
376         and     $tmp1,$tmp3
377         or      $in0,$tmp0
378         or      $in1,$tmp1
379
380         lwu     $tmp0,0($nonce)         # load nonce
381         lwu     $tmp1,4($nonce)
382         lwu     $tmp2,8($nonce)
383         lwu     $tmp3,12($nonce)
384         dsll    $tmp1,32
385         dsll    $tmp3,32
386         or      $tmp0,$tmp1
387         or      $tmp2,$tmp3
388
389         daddu   $in0,$tmp0              # accumulate nonce
390         daddu   $in1,$tmp2
391         sltu    $tmp0,$in0,$tmp0
392         daddu   $in1,$tmp0
393
394         dsrl    $tmp0,$in0,8            # write mac value
395         dsrl    $tmp1,$in0,16
396         dsrl    $tmp2,$in0,24
397         sb      $in0,0($mac)
398         dsrl    $tmp3,$in0,32
399         sb      $tmp0,1($mac)
400         dsrl    $tmp0,$in0,40
401         sb      $tmp1,2($mac)
402         dsrl    $tmp1,$in0,48
403         sb      $tmp2,3($mac)
404         dsrl    $tmp2,$in0,56
405         sb      $tmp3,4($mac)
406         dsrl    $tmp3,$in1,8
407         sb      $tmp0,5($mac)
408         dsrl    $tmp0,$in1,16
409         sb      $tmp1,6($mac)
410         dsrl    $tmp1,$in1,24
411         sb      $tmp2,7($mac)
412
413         sb      $in1,8($mac)
414         dsrl    $tmp2,$in1,32
415         sb      $tmp3,9($mac)
416         dsrl    $tmp3,$in1,40
417         sb      $tmp0,10($mac)
418         dsrl    $tmp0,$in1,48
419         sb      $tmp1,11($mac)
420         dsrl    $tmp1,$in1,56
421         sb      $tmp2,12($mac)
422         sb      $tmp3,13($mac)
423         sb      $tmp0,14($mac)
424         sb      $tmp1,15($mac)
425
426         jr      $ra
427 .end    poly1305_emit
428 .rdata
429 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
430 .align  2
431 ___
432 }
433
# Write the generated assembly.  The last remaining command-line argument,
# if any, names the output file and STDOUT is re-opened onto it; without
# one the text goes to the inherited STDOUT.
#
# The three-argument open passes the file name as data, so characters such
# as a leading '>' or a trailing '|' in it are not read as an open mode.
# A failed open now aborts instead of silently printing to the original
# STDOUT, and close is checked because buffered write errors (e.g. a full
# disk) are only reported at that point.
434 $output=pop and (open(STDOUT,">",$output) or die "can't open $output: $!");
435 print $code;
436 close STDOUT or die "error closing STDOUT: $!";
437