MIPS64 assembly pack: add Poly1305 module.
[openssl.git] / crypto / poly1305 / asm / poly1305-mips.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Poly1305 hash for MIPS64.
11 #
12 # May 2016
13 #
14 # Numbers are cycles per processed byte with poly1305_blocks alone.
15 #
16 #               IALU/gcc
17 # R1x000        5.64/+120%      (big-endian)
18 # Octeon II     3.80/+280%      (little-endian)
19
20 ######################################################################
21 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
22 # widely used. Then there is a new contender: NUBI. It appears that if
23 # one picks the latter, it's possible to arrange code in an ABI-neutral
24 # manner. Therefore let's stick to the NUBI register layout:
25 #
26 ($zero,$at,$t0,$t1,$t2)=map { '$'.$_ } (0..2,24,25);   # $0,$1,$2,$24,$25
27 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map { '$'.$_ } (4..11);   # $4-$11
28 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map { '$'.$_ } (12..23);   # $12-$23
29 ($gp,$tp,$sp,$fp,$ra)=map { '$'.$_ } (3,28..31);   # $3,$28-$31
30 #
31 # The return value is placed in $a0. The following coding rules facilitate
32 # interoperability:
33 #
34 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
35 #   excluded from the rule, because it's specified volatile];
36 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
37 #   old code];
38 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
39 #
40 # For reference here is register layout for N32/64 MIPS ABIs:
41 #
42 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
43 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
44 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
45 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
46 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
47 #
48 # <appro@openssl.org>
49 #
50 ######################################################################
51
52 $flavour = shift || "64"; # of o32,n32,64,nubi32,nubi64 only n32, 64 and nubi64 pass the check below; 64 is the default
53 # Only 64-bit instructions are emitted, so any flavour not matching /64|n32/i is rejected up front.
54 die "MIPS64 only" unless ($flavour =~ /64|n32/i);
55 # Return-value register and .mask value depend on whether a nubi flavour was requested.
56 $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
57 $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";   # bits 12-17 vs. bits 16-17
58 # Argument registers shared by the routines below, then registers for input words and temporaries.
59 ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
60 ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
61 # poly1305_init(ctx,key): clears the three hash words at 0,8,16(ctx); a NULL key pointer skips the key setup.
62 $code.=<<___;   # otherwise stores masked key words r0,r1 at 24,32(ctx) and r1+(r1>>2) at 40(ctx); returns 0
63 #ifdef MIPSEB
64 # define MSB 0
65 # define LSB 7
66 #else
67 # define MSB 7
68 # define LSB 0
69 #endif
70
71 .text
72 .set    noat
73 .set    noreorder
74
75 .align  5
76 .globl  poly1305_init
77 .ent    poly1305_init
78 poly1305_init:
79         .frame  $sp,0,$ra
80         .set    reorder
81
82         sd      $zero,0($ctx)
83         sd      $zero,8($ctx)
84         sd      $zero,16($ctx)
85
86         beqz    $inp,.Lno_key
87
88         ldl     $in0,0+MSB($inp)
89         ldl     $in1,8+MSB($inp)
90         ldr     $in0,0+LSB($inp)
91         ldr     $in1,8+LSB($inp)
92 #ifdef  MIPSEB
93 # if defined(_MIPS_ARCH_MIPS64R2)
94         dsbh    $in0,$in0               # byte swap
95          dsbh   $in1,$in1
96         dshd    $in0,$in0
97          dshd   $in1,$in1
98 # else
99         ori     $tmp0,$zero,0xFF
100         dsll    $tmp2,$tmp0,32
101         or      $tmp0,$tmp2             # 0x000000FF000000FF
102
103         and     $tmp1,$in0,$tmp0        # byte swap
104          and    $tmp3,$in1,$tmp0
105         dsrl    $tmp2,$in0,24
106          dsrl   $tmp4,$in1,24
107         dsll    $tmp1,24
108          dsll   $tmp3,24
109         and     $tmp2,$tmp0
110          and    $tmp4,$tmp0
111         dsll    $tmp0,8                 # 0x0000FF000000FF00
112         or      $tmp1,$tmp2
113          or     $tmp3,$tmp4
114         and     $tmp2,$in0,$tmp0
115          and    $tmp4,$in1,$tmp0
116         dsrl    $in0,8
117          dsrl   $in1,8
118         dsll    $tmp2,8
119          dsll   $tmp4,8
120         and     $in0,$tmp0
121          and    $in1,$tmp0
122         or      $tmp1,$tmp2
123          or     $tmp3,$tmp4
124         or      $in0,$tmp1
125          or     $in1,$tmp3
126         dsrl    $tmp1,$in0,32
127          dsrl   $tmp3,$in1,32
128         dsll    $in0,32
129          dsll   $in1,32
130         or      $in0,$tmp1
131          or     $in1,$tmp3
132 # endif
133 #endif
134         li      $tmp0,1
135         dsll    $tmp0,32
136         daddiu  $tmp0,-63
137         dsll    $tmp0,28
138         daddiu  $tmp0,-1                # 0ffffffc0fffffff
139
140         and     $in0,$tmp0
141         daddiu  $tmp0,-3                # 0ffffffc0ffffffc
142         and     $in1,$tmp0
143
144         sd      $in0,24($ctx)
145         dsrl    $tmp0,$in1,2
146         sd      $in1,32($ctx)
147         daddu   $tmp0,$in1              # s1 = r1 + (r1 >> 2)
148         sd      $tmp0,40($ctx)
149
150 .Lno_key:
151         li      $v0,0                   # return 0
152         jr      $ra
153 .end    poly1305_init
154 ___
155 {   # poly1305_blocks(ctx,inp,len,padbit): folds len/16 16-byte blocks of inp into the hash at 0,8,16(ctx)
156 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =   # named $rs1, not $s1: a lexical $s1 would hide register alias $s1
157    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);   # and make the nubi prologue/epilogue save $s5 twice and $s1 never
158 # h0-h2: hash limbs; r0,r1: key words; rs1: value poly1305_init stored at 40(ctx); d0-d2: product limbs (d0,d1 reuse in0,in1)
159 $code.=<<___;
160 .align  5
161 .globl  poly1305_blocks
162 .ent    poly1305_blocks
163 poly1305_blocks:
164         .set    noreorder
165         dsrl    $len,4                  # number of complete blocks
166         beqz    $len,.Lno_data
167         nop
168
169         .frame  $sp,8*8,$ra
170         .mask   $SAVED_REGS_MASK,-8
171         dsub    $sp,8*8
172         sd      $s5,0($sp)
173         sd      $s4,8($sp)
174 ___
175 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi prologue
176         sd      $s3,16($sp)
177         sd      $s2,24($sp)
178         sd      $s1,32($sp)
179         sd      $s0,40($sp)
180 ___
181 $code.=<<___;   # per block: h += input (padbit goes into h2), then h *= r with bits 130 and up folded back in times 5
182         .set    reorder
183
184         ld      $h0,0($ctx)             # load hash value
185         ld      $h1,8($ctx)
186         ld      $h2,16($ctx)
187
188         ld      $r0,24($ctx)            # load key
189         ld      $r1,32($ctx)
190         ld      $rs1,40($ctx)
191
192 .Loop:
193         ldl     $in0,0+MSB($inp)        # load input
194         ldl     $in1,8+MSB($inp)
195         ldr     $in0,0+LSB($inp)
196         daddiu  $len,-1
197         ldr     $in1,8+LSB($inp)
198         daddiu  $inp,16
199 #ifdef  MIPSEB
200 # if defined(_MIPS_ARCH_MIPS64R2)
201         dsbh    $in0,$in0               # byte swap
202          dsbh   $in1,$in1
203         dshd    $in0,$in0
204          dshd   $in1,$in1
205 # else
206         ori     $tmp0,$zero,0xFF
207         dsll    $tmp2,$tmp0,32
208         or      $tmp0,$tmp2             # 0x000000FF000000FF
209
210         and     $tmp1,$in0,$tmp0        # byte swap
211          and    $tmp3,$in1,$tmp0
212         dsrl    $tmp2,$in0,24
213          dsrl   $tmp4,$in1,24
214         dsll    $tmp1,24
215          dsll   $tmp3,24
216         and     $tmp2,$tmp0
217          and    $tmp4,$tmp0
218         dsll    $tmp0,8                 # 0x0000FF000000FF00
219         or      $tmp1,$tmp2
220          or     $tmp3,$tmp4
221         and     $tmp2,$in0,$tmp0
222          and    $tmp4,$in1,$tmp0
223         dsrl    $in0,8
224          dsrl   $in1,8
225         dsll    $tmp2,8
226          dsll   $tmp4,8
227         and     $in0,$tmp0
228          and    $in1,$tmp0
229         or      $tmp1,$tmp2
230          or     $tmp3,$tmp4
231         or      $in0,$tmp1
232          or     $in1,$tmp3
233         dsrl    $tmp1,$in0,32
234          dsrl   $tmp3,$in1,32
235         dsll    $in0,32
236          dsll   $in1,32
237         or      $in0,$tmp1
238          or     $in1,$tmp3
239 # endif
240 #endif
241         daddu   $h0,$in0                # accumulate input
242         daddu   $h1,$in1
243         sltu    $tmp0,$h0,$in0
244         sltu    $tmp1,$h1,$in1
245         daddu   $h1,$tmp0
246
247         dmultu  $r0,$h0                 # h0*r0
248          daddu  $h2,$padbit
249          sltu   $tmp0,$h1,$tmp0
250         mflo    $d0
251         mfhi    $d1
252
253         dmultu  $rs1,$h1                 # h1*5*r1
254          daddu  $tmp0,$tmp1
255          daddu  $h2,$tmp0
256         mflo    $tmp0
257         mfhi    $tmp1
258
259         dmultu  $r1,$h0                 # h0*r1
260          daddu  $d0,$tmp0
261          daddu  $d1,$tmp1
262         mflo    $tmp2
263         mfhi    $d2
264          sltu   $tmp0,$d0,$tmp0
265          daddu  $d1,$tmp0
266
267         dmultu  $r0,$h1                 # h1*r0
268          daddu  $d1,$tmp2
269          sltu   $tmp2,$d1,$tmp2
270         mflo    $tmp0
271         mfhi    $tmp1
272          daddu  $d2,$tmp2
273
274         dmultu  $rs1,$h2                 # h2*5*r1
275          daddu  $d1,$tmp0
276          daddu  $d2,$tmp1
277         mflo    $tmp2
278
279         dmultu  $r0,$h2                 # h2*r0
280          sltu   $tmp0,$d1,$tmp0
281          daddu  $d2,$tmp0
282         mflo    $tmp3
283
284         daddu   $d1,$tmp2
285         daddu   $d2,$tmp3
286         sltu    $tmp2,$d1,$tmp2
287         daddu   $d2,$tmp2
288
289         li      $tmp0,-4                # final reduction
290         and     $tmp0,$d2
291         dsrl    $tmp1,$d2,2
292         andi    $h2,$d2,3
293         daddu   $tmp0,$tmp1
294         daddu   $h0,$d0,$tmp0
295         sltu    $tmp0,$h0,$tmp0
296         daddu   $h1,$d1,$tmp0
297         sltu    $tmp0,$h1,$tmp0
298         daddu   $h2,$h2,$tmp0
299
300         bnez    $len,.Loop
301
302         sd      $h0,0($ctx)             # store hash value
303         sd      $h1,8($ctx)
304         sd      $h2,16($ctx)
305
306         .set    noreorder
307         ld      $s5,0($sp)              # epilogue
308         ld      $s4,8($sp)
309 ___
310 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi epilogue
311         ld      $s3,16($sp)
312         ld      $s2,24($sp)
313         ld      $s1,32($sp)
314         ld      $s0,40($sp)
315 ___
316 $code.=<<___;
317         dadd    $sp,8*8
318
319 .Lno_data:
320         jr      $ra
321         nop
322 .end    poly1305_blocks
323 ___
324 }
325 {   # poly1305_emit(ctx,mac,nonce): writes the 16-byte result to mac
326 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);   # the three arguments, in the first three argument registers
327 # Loads h from ctx, takes h+5 instead of h when h+5 carries into bit 130, adds the nonce (four 32-bit words), stores 16 bytes LSB first.
328 $code.=<<___;
329 .align  5
330 .globl  poly1305_emit
331 .ent    poly1305_emit
332 poly1305_emit:
333         .frame  $sp,0,$ra
334         .set    reorder
335
336         ld      $tmp0,0($ctx)
337         ld      $tmp1,8($ctx)
338         ld      $tmp2,16($ctx)
339
340         daddiu  $in0,$tmp0,5            # compare to modulus
341         sltiu   $tmp3,$in0,5
342         daddu   $in1,$tmp1,$tmp3
343         sltu    $tmp3,$in1,$tmp3
344         daddu   $tmp2,$tmp2,$tmp3
345
346         dsrl    $tmp2,2                 # see if it carried/borrowed
347         dsubu   $tmp2,$zero,$tmp2
348         nor     $tmp3,$zero,$tmp2
349
350         and     $in0,$tmp2
351         and     $tmp0,$tmp3
352         and     $in1,$tmp2
353         and     $tmp1,$tmp3
354         or      $in0,$tmp0
355         or      $in1,$tmp1
356
357         lwu     $tmp0,0($nonce)         # load nonce
358         lwu     $tmp1,4($nonce)
359         lwu     $tmp2,8($nonce)
360         lwu     $tmp3,12($nonce)
361         dsll    $tmp1,32
362         dsll    $tmp3,32
363         or      $tmp0,$tmp1
364         or      $tmp2,$tmp3
365
366         daddu   $in0,$tmp0              # accumulate nonce
367         daddu   $in1,$tmp2
368         sltu    $tmp0,$in0,$tmp0
369         daddu   $in1,$tmp0
370
371         dsrl    $tmp0,$in0,8            # write mac value
372         dsrl    $tmp1,$in0,16
373         dsrl    $tmp2,$in0,24
374         sb      $in0,0($mac)
375         dsrl    $tmp3,$in0,32
376         sb      $tmp0,1($mac)
377         dsrl    $tmp0,$in0,40
378         sb      $tmp1,2($mac)
379         dsrl    $tmp1,$in0,48
380         sb      $tmp2,3($mac)
381         dsrl    $tmp2,$in0,56
382         sb      $tmp3,4($mac)
383         dsrl    $tmp3,$in1,8
384         sb      $tmp0,5($mac)
385         dsrl    $tmp0,$in1,16
386         sb      $tmp1,6($mac)
387         dsrl    $tmp1,$in1,24
388         sb      $tmp2,7($mac)
389
390         sb      $in1,8($mac)
391         dsrl    $tmp2,$in1,32
392         sb      $tmp3,9($mac)
393         dsrl    $tmp3,$in1,40
394         sb      $tmp0,10($mac)
395         dsrl    $tmp0,$in1,48
396         sb      $tmp1,11($mac)
397         dsrl    $tmp1,$in1,56
398         sb      $tmp2,12($mac)
399         sb      $tmp3,13($mac)
400         sb      $tmp0,14($mac)
401         sb      $tmp1,15($mac)
402
403         jr      $ra
404 .end    poly1305_emit
405 .rdata
406 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
407 .align  2
408 ___
409 }
410 # The last remaining argument, if any, names the output file; without one the code goes to the inherited STDOUT.
411 $output=pop and (open(STDOUT,">",$output) || die "can't open $output: $!");
412 print $code;
413 close STDOUT or die "error closing STDOUT: $!";   # buffered write errors only surface at close
414