d2b3e90d93f0456399c267e144667f01256a8cf2
[openssl.git] / crypto / poly1305 / asm / poly1305-mips.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Poly1305 hash for MIPS64.
18 #
19 # May 2016
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone.
22 #
23 #               IALU/gcc
24 # R1x000        5.64/+120%      (big-endian)
25 # Octeon II     3.80/+280%      (little-endian)
26
27 ######################################################################
28 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
29 # widely used. Then there is a new contender: NUBI. It appears that if
30 # one picks the latter, it's possible to arrange code in an ABI-neutral
31 # manner. Therefore let's stick to the NUBI register layout:
32 #
# Each name below is bound to the assembler spelling "$<n>" of the register
# number it occupies in the NUBI layout.
33 ($zero,$at,$t0,$t1,$t2)=map { "\$$_" } 0..2,24,25;
34 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map { "\$$_" } 4..11;
35 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map { "\$$_" } 12..23;
36 ($gp,$tp,$sp,$fp,$ra)=map { "\$$_" } 3,28..31;
37 #
38 # The return value is placed in $a0. The following coding rules facilitate
39 # interoperability:
40 #
41 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
42 #   excluded from the rule, because it's specified volatile];
43 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
44 #   old code];
45 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
46 #
47 # For reference here is register layout for N32/64 MIPS ABIs:
48 #
49 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
50 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
51 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
52 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
53 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
54 #
55 # <appro@openssl.org>
56 #
57 ######################################################################
58
# The first command-line argument selects the target ABI flavour. Only
# flavours matching /64|n32/ pass the check below, so the fallback is "64":
# the former fallback "o32" was always rejected, which made a run without
# arguments die unconditionally.
59 $flavour = shift || "64"; # accepted flavours are n32,64,nubi64
60
61 die "MIPS64 only" unless ($flavour =~ /64|n32/i);
62
# Register that carries the return value: $a0 for NUBI flavours, otherwise
# $t0, which is register 2 in the layout used by this file.
63 $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Operand for the .mask directive of poly1305_blocks_internal: bits 12-17 for
# NUBI (all of $s0-$s5 are stored by its prologue), bits 16-17 otherwise
# (only $s4 and $s5 are stored).
64 $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
65
# Argument registers of poly1305_blocks, followed by the scratch registers
# shared by all three routines.
66 ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
67 ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
68
# File preamble plus poly1305_init(ctx, key).
#
# The preamble defines MSB/LSB, the byte offsets used by the ldl/ldr pairs
# that perform the unaligned 64-bit loads, and selects .text with noat.
#
# poly1305_init fills the context as follows (the same offsets are read back
# by poly1305_blocks and poly1305_emit):
#    0..23  h0,h1,h2  hash accumulator, zeroed unconditionally
#   24..39  r0,r1     key halves, masked with 0x0ffffffc0fffffff and
#                     0x0ffffffc0ffffffc respectively
#   40..47  s1        r1 + (r1 >> 2)
# A zero key pointer skips the key setup, leaving only the accumulator
# cleared. On MIPSEB the two loaded halves are byte-swapped first: with
# dsbh/dshd when _MIPS_ARCH_MIPS64R2 is defined, otherwise with a
# shift-and-mask sequence built around 0x000000FF000000FF. The routine
# returns 0 in $v0.
69 $code.=<<___;
70 #ifdef MIPSEB
71 # define MSB 0
72 # define LSB 7
73 #else
74 # define MSB 7
75 # define LSB 0
76 #endif
77
78 .text
79 .set    noat
80 .set    noreorder
81
82 .align  5
83 .globl  poly1305_init
84 .ent    poly1305_init
85 poly1305_init:
86         .frame  $sp,0,$ra
87         .set    reorder
88
89         sd      $zero,0($ctx)
90         sd      $zero,8($ctx)
91         sd      $zero,16($ctx)
92
93         beqz    $inp,.Lno_key
94
95         ldl     $in0,0+MSB($inp)
96         ldl     $in1,8+MSB($inp)
97         ldr     $in0,0+LSB($inp)
98         ldr     $in1,8+LSB($inp)
99 #ifdef  MIPSEB
100 # if defined(_MIPS_ARCH_MIPS64R2)
101         dsbh    $in0,$in0               # byte swap
102          dsbh   $in1,$in1
103         dshd    $in0,$in0
104          dshd   $in1,$in1
105 # else
106         ori     $tmp0,$zero,0xFF
107         dsll    $tmp2,$tmp0,32
108         or      $tmp0,$tmp2             # 0x000000FF000000FF
109
110         and     $tmp1,$in0,$tmp0        # byte swap
111          and    $tmp3,$in1,$tmp0
112         dsrl    $tmp2,$in0,24
113          dsrl   $tmp4,$in1,24
114         dsll    $tmp1,24
115          dsll   $tmp3,24
116         and     $tmp2,$tmp0
117          and    $tmp4,$tmp0
118         dsll    $tmp0,8                 # 0x0000FF000000FF00
119         or      $tmp1,$tmp2
120          or     $tmp3,$tmp4
121         and     $tmp2,$in0,$tmp0
122          and    $tmp4,$in1,$tmp0
123         dsrl    $in0,8
124          dsrl   $in1,8
125         dsll    $tmp2,8
126          dsll   $tmp4,8
127         and     $in0,$tmp0
128          and    $in1,$tmp0
129         or      $tmp1,$tmp2
130          or     $tmp3,$tmp4
131         or      $in0,$tmp1
132          or     $in1,$tmp3
133         dsrl    $tmp1,$in0,32
134          dsrl   $tmp3,$in1,32
135         dsll    $in0,32
136          dsll   $in1,32
137         or      $in0,$tmp1
138          or     $in1,$tmp3
139 # endif
140 #endif
141         li      $tmp0,1
142         dsll    $tmp0,32
143         daddiu  $tmp0,-63
144         dsll    $tmp0,28
145         daddiu  $tmp0,-1                # 0ffffffc0fffffff
146
147         and     $in0,$tmp0
148         daddiu  $tmp0,-3                # 0ffffffc0ffffffc
149         and     $in1,$tmp0
150
151         sd      $in0,24($ctx)
152         dsrl    $tmp0,$in1,2
153         sd      $in1,32($ctx)
154         daddu   $tmp0,$in1              # s1 = r1 + (r1 >> 2)
155         sd      $tmp0,40($ctx)
156
157 .Lno_key:
158         li      $v0,0                   # return 0
159         jr      $ra
160 .end    poly1305_init
161 ___
162 {
# poly1305_blocks(ctx, inp, len, padbit)
#
# The public entry point turns len into a count of whole 16-byte blocks and
# branches (it does not call) to poly1305_blocks_internal when that count is
# non-zero, so the internal routine returns straight to the original caller.
#
# poly1305_blocks_internal reserves a 48-byte frame, stores $s4/$s5 (and
# $s0-$s3 as well for NUBI flavours), then keeps the accumulator in
# $h0-$h2 and the key material in $r0, $r1 and $rs1 (the value kept at
# context offset 40). For every block it loads 16 input bytes (byte-swapped
# on MIPSEB), adds them and padbit to the accumulator, multiplies by the key,
# and in the "final reduction" step folds everything above bit 1 of the top
# limb back into the low limb multiplied by 5. The accumulator is written
# back to the context after the last block. The closing "dadd $sp" sits in
# the delay slot of "jr $ra" (noreorder is in effect there).
#
# The key-multiple register is deliberately named $rs1 and not $s1: a lexical
# $s1 would hide the global register name $s1 inside the heredocs below, and
# the NUBI prologue/epilogue would then store and reload $s5's register a
# second time while the register used as $h1 went unpreserved.
163 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
164    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
165
166 $code.=<<___;
167 .align  5
168 .globl  poly1305_blocks
169 .ent    poly1305_blocks
170 poly1305_blocks:
171         .set    noreorder
172         dsrl    $len,4                  # number of complete blocks
173         bnez    $len,poly1305_blocks_internal
174         nop
175         jr      $ra
176         nop
177 .end    poly1305_blocks
178
179 .align  5
180 .ent    poly1305_blocks_internal
181 poly1305_blocks_internal:
182         .frame  $sp,6*8,$ra
183         .mask   $SAVED_REGS_MASK,-8
184         .set    noreorder
185         dsub    $sp,6*8
186         sd      $s5,40($sp)
187         sd      $s4,32($sp)
188 ___
189 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi prologue
190         sd      $s3,24($sp)
191         sd      $s2,16($sp)
192         sd      $s1,8($sp)
193         sd      $s0,0($sp)
194 ___
195 $code.=<<___;
196         .set    reorder
197
198         ld      $h0,0($ctx)             # load hash value
199         ld      $h1,8($ctx)
200         ld      $h2,16($ctx)
201
202         ld      $r0,24($ctx)            # load key
203         ld      $r1,32($ctx)
204         ld      $rs1,40($ctx)
205
206 .Loop:
207         ldl     $in0,0+MSB($inp)        # load input
208         ldl     $in1,8+MSB($inp)
209         ldr     $in0,0+LSB($inp)
210         daddiu  $len,-1
211         ldr     $in1,8+LSB($inp)
212         daddiu  $inp,16
213 #ifdef  MIPSEB
214 # if defined(_MIPS_ARCH_MIPS64R2)
215         dsbh    $in0,$in0               # byte swap
216          dsbh   $in1,$in1
217         dshd    $in0,$in0
218          dshd   $in1,$in1
219 # else
220         ori     $tmp0,$zero,0xFF
221         dsll    $tmp2,$tmp0,32
222         or      $tmp0,$tmp2             # 0x000000FF000000FF
223
224         and     $tmp1,$in0,$tmp0        # byte swap
225          and    $tmp3,$in1,$tmp0
226         dsrl    $tmp2,$in0,24
227          dsrl   $tmp4,$in1,24
228         dsll    $tmp1,24
229          dsll   $tmp3,24
230         and     $tmp2,$tmp0
231          and    $tmp4,$tmp0
232         dsll    $tmp0,8                 # 0x0000FF000000FF00
233         or      $tmp1,$tmp2
234          or     $tmp3,$tmp4
235         and     $tmp2,$in0,$tmp0
236          and    $tmp4,$in1,$tmp0
237         dsrl    $in0,8
238          dsrl   $in1,8
239         dsll    $tmp2,8
240          dsll   $tmp4,8
241         and     $in0,$tmp0
242          and    $in1,$tmp0
243         or      $tmp1,$tmp2
244          or     $tmp3,$tmp4
245         or      $in0,$tmp1
246          or     $in1,$tmp3
247         dsrl    $tmp1,$in0,32
248          dsrl   $tmp3,$in1,32
249         dsll    $in0,32
250          dsll   $in1,32
251         or      $in0,$tmp1
252          or     $in1,$tmp3
253 # endif
254 #endif
255         daddu   $h0,$in0                # accumulate input
256         daddu   $h1,$in1
257         sltu    $tmp0,$h0,$in0
258         sltu    $tmp1,$h1,$in1
259         daddu   $h1,$tmp0
260
261         dmultu  $r0,$h0                 # h0*r0
262          daddu  $h2,$padbit
263          sltu   $tmp0,$h1,$tmp0
264         mflo    $d0
265         mfhi    $d1
266
267         dmultu  $rs1,$h1                 # h1*5*r1
268          daddu  $tmp0,$tmp1
269          daddu  $h2,$tmp0
270         mflo    $tmp0
271         mfhi    $tmp1
272
273         dmultu  $r1,$h0                 # h0*r1
274          daddu  $d0,$tmp0
275          daddu  $d1,$tmp1
276         mflo    $tmp2
277         mfhi    $d2
278          sltu   $tmp0,$d0,$tmp0
279          daddu  $d1,$tmp0
280
281         dmultu  $r0,$h1                 # h1*r0
282          daddu  $d1,$tmp2
283          sltu   $tmp2,$d1,$tmp2
284         mflo    $tmp0
285         mfhi    $tmp1
286          daddu  $d2,$tmp2
287
288         dmultu  $rs1,$h2                 # h2*5*r1
289          daddu  $d1,$tmp0
290          daddu  $d2,$tmp1
291         mflo    $tmp2
292
293         dmultu  $r0,$h2                 # h2*r0
294          sltu   $tmp0,$d1,$tmp0
295          daddu  $d2,$tmp0
296         mflo    $tmp3
297
298         daddu   $d1,$tmp2
299         daddu   $d2,$tmp3
300         sltu    $tmp2,$d1,$tmp2
301         daddu   $d2,$tmp2
302
303         li      $tmp0,-4                # final reduction
304         and     $tmp0,$d2
305         dsrl    $tmp1,$d2,2
306         andi    $h2,$d2,3
307         daddu   $tmp0,$tmp1
308         daddu   $h0,$d0,$tmp0
309         sltu    $tmp0,$h0,$tmp0
310         daddu   $h1,$d1,$tmp0
311         sltu    $tmp0,$h1,$tmp0
312         daddu   $h2,$h2,$tmp0
313
314         bnez    $len,.Loop
315
316         sd      $h0,0($ctx)             # store hash value
317         sd      $h1,8($ctx)
318         sd      $h2,16($ctx)
319
320         .set    noreorder
321         ld      $s5,40($sp)             # epilogue
322         ld      $s4,32($sp)
323 ___
324 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi epilogue
325         ld      $s3,24($sp)
326         ld      $s2,16($sp)
327         ld      $s1,8($sp)
328         ld      $s0,0($sp)
329 ___
330 $code.=<<___;
331         jr      $ra
332         dadd    $sp,6*8
333 .end    poly1305_blocks_internal
334 ___
335 }
336 {
# poly1305_emit(ctx, mac, nonce)
#
# Loads h0,h1,h2 from context offsets 0/8/16 and computes h + 5, carrying
# into the top limb. Bit 2 of that limb after the addition (h2 >> 2) is
# negated into an all-ones or all-zeros mask, and the and/or sequence uses it
# to pick the low 128 bits of either h + 5 or h without branching. The four
# 32-bit nonce words are read with lwu, paired into two 64-bit values and
# added with a carry from the low half into the high half. The 16-byte result
# is written to mac with byte stores, least significant byte first.
#
# The lexical $ctx declared here names the same register ($a0) as the
# file-level $ctx. The trailing .rdata string spells the address with "\@" so
# that Perl does not interpolate an array inside the heredoc.
337 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
338
339 $code.=<<___;
340 .align  5
341 .globl  poly1305_emit
342 .ent    poly1305_emit
343 poly1305_emit:
344         .frame  $sp,0,$ra
345         .set    reorder
346
347         ld      $tmp0,0($ctx)
348         ld      $tmp1,8($ctx)
349         ld      $tmp2,16($ctx)
350
351         daddiu  $in0,$tmp0,5            # compare to modulus
352         sltiu   $tmp3,$in0,5
353         daddu   $in1,$tmp1,$tmp3
354         sltu    $tmp3,$in1,$tmp3
355         daddu   $tmp2,$tmp2,$tmp3
356
357         dsrl    $tmp2,2                 # see if it carried/borrowed
358         dsubu   $tmp2,$zero,$tmp2
359         nor     $tmp3,$zero,$tmp2
360
361         and     $in0,$tmp2
362         and     $tmp0,$tmp3
363         and     $in1,$tmp2
364         and     $tmp1,$tmp3
365         or      $in0,$tmp0
366         or      $in1,$tmp1
367
368         lwu     $tmp0,0($nonce)         # load nonce
369         lwu     $tmp1,4($nonce)
370         lwu     $tmp2,8($nonce)
371         lwu     $tmp3,12($nonce)
372         dsll    $tmp1,32
373         dsll    $tmp3,32
374         or      $tmp0,$tmp1
375         or      $tmp2,$tmp3
376
377         daddu   $in0,$tmp0              # accumulate nonce
378         daddu   $in1,$tmp2
379         sltu    $tmp0,$in0,$tmp0
380         daddu   $in1,$tmp0
381
382         dsrl    $tmp0,$in0,8            # write mac value
383         dsrl    $tmp1,$in0,16
384         dsrl    $tmp2,$in0,24
385         sb      $in0,0($mac)
386         dsrl    $tmp3,$in0,32
387         sb      $tmp0,1($mac)
388         dsrl    $tmp0,$in0,40
389         sb      $tmp1,2($mac)
390         dsrl    $tmp1,$in0,48
391         sb      $tmp2,3($mac)
392         dsrl    $tmp2,$in0,56
393         sb      $tmp3,4($mac)
394         dsrl    $tmp3,$in1,8
395         sb      $tmp0,5($mac)
396         dsrl    $tmp0,$in1,16
397         sb      $tmp1,6($mac)
398         dsrl    $tmp1,$in1,24
399         sb      $tmp2,7($mac)
400
401         sb      $in1,8($mac)
402         dsrl    $tmp2,$in1,32
403         sb      $tmp3,9($mac)
404         dsrl    $tmp3,$in1,40
405         sb      $tmp0,10($mac)
406         dsrl    $tmp0,$in1,48
407         sb      $tmp1,11($mac)
408         dsrl    $tmp1,$in1,56
409         sb      $tmp2,12($mac)
410         sb      $tmp3,13($mac)
411         sb      $tmp0,14($mac)
412         sb      $tmp1,15($mac)
413
414         jr      $ra
415 .end    poly1305_emit
416 .rdata
417 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
418 .align  2
419 ___
420 }
421
# Write the generated assembly. When a trailing argument is present it names
# the output file and STDOUT is reopened onto it; otherwise the code goes to
# the inherited STDOUT. Both the open and the close are checked, so that an
# unwritable path or a failed flush stops the build instead of silently
# leaving a missing or truncated file behind.
422 $output=pop and (open(STDOUT,">",$output) || die "can't open $output: $!");
423 print $code;
424 close STDOUT or die "error closing STDOUT: $!";
425