b5bce4775d1415d58d458933c65d5cb4918bcbc7
[openssl.git] / crypto / poly1305 / asm / poly1305-mips.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Poly1305 hash for MIPS64.
18 #
19 # May 2016
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone.
22 #
23 #               IALU/gcc
24 # R1x000        5.64/+120%      (big-endian)
25 # Octeon II     3.80/+280%      (little-endian)
26
27 ######################################################################
28 # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
29 # widely used. Then there is a new contender: NUBI. It appears that if
30 # one picks the latter, it's possible to arrange code in an ABI-neutral
31 # manner. Therefore let's stick to the NUBI register layout:
32 #
33 ($zero,$at,$t0,$t1,$t2) = map { sprintf('$%d',$_) } (0..2, 24, 25);     # $0-$2, $24, $25
34 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { sprintf('$%d',$_) } (4..11);  # $4-$11
35 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = map { sprintf('$%d',$_) } (12..23);   # $12-$23
36 ($gp,$tp,$sp,$fp,$ra) = map { sprintf('$%d',$_) } (3, 28..31);          # $3, $28-$31
37 #
38 # The return value is placed in $a0. The following coding rules facilitate
39 # interoperability:
40 #
41 # - never ever touch $tp, "thread pointer", former $gp [o32 can be
42 #   excluded from the rule, because it's specified volatile];
43 # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
44 #   old code];
45 # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
46 #
47 # For reference here is register layout for N32/64 MIPS ABIs:
48 #
49 # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
50 # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
51 # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
52 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
53 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
54 #
55 # <appro@openssl.org>
56 #
57 ######################################################################
58
# Pick the target ABI flavour from the first command-line argument and derive
# everything that depends on it: the return-value register, the .mask value
# describing the saved registers, and the symbolic names used for the argument
# and scratch registers in the generated code below.
59 $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
60
61 $flavour =~ /64|n32/i or die "MIPS64 only";
62
63 if ($flavour =~ /nubi/i) { $v0 = $a0; $SAVED_REGS_MASK = "0x0003f000"; }
64 else                     { $v0 = $t0; $SAVED_REGS_MASK = "0x00030000"; }
65
66 ($ctx,$inp,$len,$padbit,$in0,$in1,$tmp0,$tmp1) = ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
67 ($tmp2,$tmp3,$tmp4) = ($at,$t0,$t1);
68
# poly1305_init: appends the assembly for poly1305_init to $code.
#
# Register arguments of the generated function: $ctx ($a0) is the context,
# $inp ($a1) is the key pointer.  The generated code
#   - zeroes the three 64-bit words at ctx+0, ctx+8 and ctx+16;
#   - skips the key setup entirely when the key pointer is zero;
#   - otherwise loads two 64-bit key words with ldl/ldr pairs (so the key
#     pointer need not be aligned), byte-swaps them when MIPSEB is defined
#     (dsbh/dshd on MIPS64R2, a mask-and-shift sequence otherwise), masks
#     them with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc respectively and
#     stores them at ctx+24 and ctx+32;
#   - stores r1 + (r1 >> 2) at ctx+40;
#   - returns 0 in $v0 (which is $a0 for nubi flavours, $t0 otherwise).
# The MSB/LSB macros defined first select the byte offsets used by the
# ldl/ldr pairs for the configured endianness.
69 $code.=<<___;
70 #ifdef MIPSEB
71 # define MSB 0
72 # define LSB 7
73 #else
74 # define MSB 7
75 # define LSB 0
76 #endif
77
78 .text
79 .set    noat
80 .set    noreorder
81
82 .align  5
83 .globl  poly1305_init
84 .ent    poly1305_init
85 poly1305_init:
86         .frame  $sp,0,$ra
87         .set    reorder
88
89         sd      $zero,0($ctx)
90         sd      $zero,8($ctx)
91         sd      $zero,16($ctx)
92
93         beqz    $inp,.Lno_key
94
95         ldl     $in0,0+MSB($inp)
96         ldl     $in1,8+MSB($inp)
97         ldr     $in0,0+LSB($inp)
98         ldr     $in1,8+LSB($inp)
99 #ifdef  MIPSEB
100 # if defined(_MIPS_ARCH_MIPS64R2)
101         dsbh    $in0,$in0               # byte swap
102          dsbh   $in1,$in1
103         dshd    $in0,$in0
104          dshd   $in1,$in1
105 # else
106         ori     $tmp0,$zero,0xFF
107         dsll    $tmp2,$tmp0,32
108         or      $tmp0,$tmp2             # 0x000000FF000000FF
109
110         and     $tmp1,$in0,$tmp0        # byte swap
111          and    $tmp3,$in1,$tmp0
112         dsrl    $tmp2,$in0,24
113          dsrl   $tmp4,$in1,24
114         dsll    $tmp1,24
115          dsll   $tmp3,24
116         and     $tmp2,$tmp0
117          and    $tmp4,$tmp0
118         dsll    $tmp0,8                 # 0x0000FF000000FF00
119         or      $tmp1,$tmp2
120          or     $tmp3,$tmp4
121         and     $tmp2,$in0,$tmp0
122          and    $tmp4,$in1,$tmp0
123         dsrl    $in0,8
124          dsrl   $in1,8
125         dsll    $tmp2,8
126          dsll   $tmp4,8
127         and     $in0,$tmp0
128          and    $in1,$tmp0
129         or      $tmp1,$tmp2
130          or     $tmp3,$tmp4
131         or      $in0,$tmp1
132          or     $in1,$tmp3
133         dsrl    $tmp1,$in0,32
134          dsrl   $tmp3,$in1,32
135         dsll    $in0,32
136          dsll   $in1,32
137         or      $in0,$tmp1
138          or     $in1,$tmp3
139 # endif
140 #endif
141         li      $tmp0,1
142         dsll    $tmp0,32
143         daddiu  $tmp0,-63
144         dsll    $tmp0,28
145         daddiu  $tmp0,-1                # 0ffffffc0fffffff
146
147         and     $in0,$tmp0
148         daddiu  $tmp0,-3                # 0ffffffc0ffffffc
149         and     $in1,$tmp0
150
151         sd      $in0,24($ctx)
152         dsrl    $tmp0,$in1,2
153         sd      $in1,32($ctx)
154         daddu   $tmp0,$in1              # s1 = r1 + (r1 >> 2)
155         sd      $tmp0,40($ctx)
156
157 .Lno_key:
158         li      $v0,0                   # return 0
159         jr      $ra
160 .end    poly1305_init
161 ___
162 {
# poly1305_blocks: appends the assembly for poly1305_blocks to $code.
#
# Register arguments of the generated function: $ctx ($a0) context,
# $inp ($a1) input pointer, $len ($a2) byte count, $padbit ($a3) value added
# to the top hash word for every block.  Only complete 16-byte blocks are
# processed ($len >> 4); the function returns immediately when there are none.
#
# Working registers:
#   $h0,$h1,$h2  hash value, loaded from / stored to ctx+0, ctx+8, ctx+16
#   $r0,$r1      key words loaded from ctx+24 and ctx+32
#   $rs1         the r1 + (r1 >> 2) value poly1305_init stored at ctx+40
#   $d0,$d1,$d2  product accumulator ($d0/$d1 reuse the input registers)
#
# The lexical holding the ctx+40 value is deliberately named $rs1 and NOT
# $s1: a lexical $s1 would shadow the file-level register name $s1 ($13)
# inside this block, so the nubi-only "sd/ld \$s1,32(\$sp)" lines below would
# expand to $17 a second time and $13 (used as $h1, callee-saved under NUBI)
# would be clobbered without ever being saved or restored.
#
# The stack pointer is adjusted with the non-trapping dsubu/daddu forms;
# address arithmetic must not raise an integer overflow exception.
#
# NOTE(review): .frame announces an 8*8-byte frame and .mask an offset of -8,
# while the registers are stored from 0($sp) upwards -- the unwind annotation
# does not appear to match the actual save slots; confirm before relying on it.
163 my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
164    ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
165
166 $code.=<<___;
167 .align  5
168 .globl  poly1305_blocks
169 .ent    poly1305_blocks
170 poly1305_blocks:
171         .set    noreorder
172         dsrl    $len,4                  # number of complete blocks
173         beqz    $len,.Lno_data
174         nop
175
176         .frame  $sp,8*8,$ra
177         .mask   $SAVED_REGS_MASK,-8
178         dsubu   $sp,8*8
179         sd      $s5,0($sp)
180         sd      $s4,8($sp)
181 ___
182 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi prologue
183         sd      $s3,16($sp)
184         sd      $s2,24($sp)
185         sd      $s1,32($sp)
186         sd      $s0,40($sp)
187 ___
188 $code.=<<___;
189         .set    reorder
190
191         ld      $h0,0($ctx)             # load hash value
192         ld      $h1,8($ctx)
193         ld      $h2,16($ctx)
194
195         ld      $r0,24($ctx)            # load key
196         ld      $r1,32($ctx)
197         ld      $rs1,40($ctx)
198
199 .Loop:
200         ldl     $in0,0+MSB($inp)        # load input
201         ldl     $in1,8+MSB($inp)
202         ldr     $in0,0+LSB($inp)
203         daddiu  $len,-1
204         ldr     $in1,8+LSB($inp)
205         daddiu  $inp,16
206 #ifdef  MIPSEB
207 # if defined(_MIPS_ARCH_MIPS64R2)
208         dsbh    $in0,$in0               # byte swap
209          dsbh   $in1,$in1
210         dshd    $in0,$in0
211          dshd   $in1,$in1
212 # else
213         ori     $tmp0,$zero,0xFF
214         dsll    $tmp2,$tmp0,32
215         or      $tmp0,$tmp2             # 0x000000FF000000FF
216
217         and     $tmp1,$in0,$tmp0        # byte swap
218          and    $tmp3,$in1,$tmp0
219         dsrl    $tmp2,$in0,24
220          dsrl   $tmp4,$in1,24
221         dsll    $tmp1,24
222          dsll   $tmp3,24
223         and     $tmp2,$tmp0
224          and    $tmp4,$tmp0
225         dsll    $tmp0,8                 # 0x0000FF000000FF00
226         or      $tmp1,$tmp2
227          or     $tmp3,$tmp4
228         and     $tmp2,$in0,$tmp0
229          and    $tmp4,$in1,$tmp0
230         dsrl    $in0,8
231          dsrl   $in1,8
232         dsll    $tmp2,8
233          dsll   $tmp4,8
234         and     $in0,$tmp0
235          and    $in1,$tmp0
236         or      $tmp1,$tmp2
237          or     $tmp3,$tmp4
238         or      $in0,$tmp1
239          or     $in1,$tmp3
240         dsrl    $tmp1,$in0,32
241          dsrl   $tmp3,$in1,32
242         dsll    $in0,32
243          dsll   $in1,32
244         or      $in0,$tmp1
245          or     $in1,$tmp3
246 # endif
247 #endif
248         daddu   $h0,$in0                # accumulate input
249         daddu   $h1,$in1
250         sltu    $tmp0,$h0,$in0
251         sltu    $tmp1,$h1,$in1
252         daddu   $h1,$tmp0
253
254         dmultu  $r0,$h0                 # h0*r0
255          daddu  $h2,$padbit
256          sltu   $tmp0,$h1,$tmp0
257         mflo    $d0
258         mfhi    $d1
259
260         dmultu  $rs1,$h1                # h1*5*r1
261          daddu  $tmp0,$tmp1
262          daddu  $h2,$tmp0
263         mflo    $tmp0
264         mfhi    $tmp1
265
266         dmultu  $r1,$h0                 # h0*r1
267          daddu  $d0,$tmp0
268          daddu  $d1,$tmp1
269         mflo    $tmp2
270         mfhi    $d2
271          sltu   $tmp0,$d0,$tmp0
272          daddu  $d1,$tmp0
273
274         dmultu  $r0,$h1                 # h1*r0
275          daddu  $d1,$tmp2
276          sltu   $tmp2,$d1,$tmp2
277         mflo    $tmp0
278         mfhi    $tmp1
279          daddu  $d2,$tmp2
280
281         dmultu  $rs1,$h2                # h2*5*r1
282          daddu  $d1,$tmp0
283          daddu  $d2,$tmp1
284         mflo    $tmp2
285
286         dmultu  $r0,$h2                 # h2*r0
287          sltu   $tmp0,$d1,$tmp0
288          daddu  $d2,$tmp0
289         mflo    $tmp3
290
291         daddu   $d1,$tmp2
292         daddu   $d2,$tmp3
293         sltu    $tmp2,$d1,$tmp2
294         daddu   $d2,$tmp2
295
296         li      $tmp0,-4                # final reduction
297         and     $tmp0,$d2
298         dsrl    $tmp1,$d2,2
299         andi    $h2,$d2,3
300         daddu   $tmp0,$tmp1
301         daddu   $h0,$d0,$tmp0
302         sltu    $tmp0,$h0,$tmp0
303         daddu   $h1,$d1,$tmp0
304         sltu    $tmp0,$h1,$tmp0
305         daddu   $h2,$h2,$tmp0
306
307         bnez    $len,.Loop
308
309         sd      $h0,0($ctx)             # store hash value
310         sd      $h1,8($ctx)
311         sd      $h2,16($ctx)
312
313         .set    noreorder
314         ld      $s5,0($sp)              # epilogue
315         ld      $s4,8($sp)
316 ___
317 $code.=<<___ if ($flavour =~ /nubi/i);  # optimize non-nubi epilogue
318         ld      $s3,16($sp)
319         ld      $s2,24($sp)
320         ld      $s1,32($sp)
321         ld      $s0,40($sp)
322 ___
323 $code.=<<___;
324         daddu   $sp,8*8
325
326 .Lno_data:
327         jr      $ra
328         nop
329 .end    poly1305_blocks
330 ___
331 }
332 {
# poly1305_emit: appends the assembly for poly1305_emit, plus the trailing
# identification string in .rdata, to $code.
#
# Register arguments of the generated function: $ctx ($a0) context,
# $mac ($a1) output pointer, $nonce ($a2) pointer to four 32-bit words.
# The generated code
#   - loads the three hash words from ctx+0, ctx+8 and ctx+16;
#   - computes hash + 5 with carry propagation and shifts the top word right
#     by 2 to see whether the addition carried past bit 129;
#   - turns that bit into an all-ones / all-zeroes mask (dsubu/nor) and uses
#     and/or to select, without branching, either hash + 5 or the original
#     low 128 bits;
#   - adds the 128-bit nonce, assembled from four lwu loads, with carry from
#     the low to the high word;
#   - writes the 16-byte result to $mac one byte at a time, least significant
#     byte first, so neither alignment nor host endianness matters.
# The local $ctx/$mac/$nonce names are private to this block.
333 my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
334
335 $code.=<<___;
336 .align  5
337 .globl  poly1305_emit
338 .ent    poly1305_emit
339 poly1305_emit:
340         .frame  $sp,0,$ra
341         .set    reorder
342
343         ld      $tmp0,0($ctx)
344         ld      $tmp1,8($ctx)
345         ld      $tmp2,16($ctx)
346
347         daddiu  $in0,$tmp0,5            # compare to modulus
348         sltiu   $tmp3,$in0,5
349         daddu   $in1,$tmp1,$tmp3
350         sltu    $tmp3,$in1,$tmp3
351         daddu   $tmp2,$tmp2,$tmp3
352
353         dsrl    $tmp2,2                 # see if it carried/borrowed
354         dsubu   $tmp2,$zero,$tmp2
355         nor     $tmp3,$zero,$tmp2
356
357         and     $in0,$tmp2
358         and     $tmp0,$tmp3
359         and     $in1,$tmp2
360         and     $tmp1,$tmp3
361         or      $in0,$tmp0
362         or      $in1,$tmp1
363
364         lwu     $tmp0,0($nonce)         # load nonce
365         lwu     $tmp1,4($nonce)
366         lwu     $tmp2,8($nonce)
367         lwu     $tmp3,12($nonce)
368         dsll    $tmp1,32
369         dsll    $tmp3,32
370         or      $tmp0,$tmp1
371         or      $tmp2,$tmp3
372
373         daddu   $in0,$tmp0              # accumulate nonce
374         daddu   $in1,$tmp2
375         sltu    $tmp0,$in0,$tmp0
376         daddu   $in1,$tmp0
377
378         dsrl    $tmp0,$in0,8            # write mac value
379         dsrl    $tmp1,$in0,16
380         dsrl    $tmp2,$in0,24
381         sb      $in0,0($mac)
382         dsrl    $tmp3,$in0,32
383         sb      $tmp0,1($mac)
384         dsrl    $tmp0,$in0,40
385         sb      $tmp1,2($mac)
386         dsrl    $tmp1,$in0,48
387         sb      $tmp2,3($mac)
388         dsrl    $tmp2,$in0,56
389         sb      $tmp3,4($mac)
390         dsrl    $tmp3,$in1,8
391         sb      $tmp0,5($mac)
392         dsrl    $tmp0,$in1,16
393         sb      $tmp1,6($mac)
394         dsrl    $tmp1,$in1,24
395         sb      $tmp2,7($mac)
396
397         sb      $in1,8($mac)
398         dsrl    $tmp2,$in1,32
399         sb      $tmp3,9($mac)
400         dsrl    $tmp3,$in1,40
401         sb      $tmp0,10($mac)
402         dsrl    $tmp0,$in1,48
403         sb      $tmp1,11($mac)
404         dsrl    $tmp1,$in1,56
405         sb      $tmp2,12($mac)
406         sb      $tmp3,13($mac)
407         sb      $tmp0,14($mac)
408         sb      $tmp1,15($mac)
409
410         jr      $ra
411 .end    poly1305_emit
412 .rdata
413 .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
414 .align  2
415 ___
416 }
417
# Write the generated assembly.  When a trailing command-line argument is
# present it names the output file and STDOUT is redirected to it; otherwise
# the inherited STDOUT is used.  The three-argument open keeps the file name
# from being interpreted as part of the mode, and both open and close are
# checked so that an unwritable path or a failed (buffered) write aborts the
# build instead of silently leaving a missing or truncated assembly file.
418 $output=pop and (open(STDOUT,">",$output) or die "can't open $output: $!");
419 print $code;
420 close STDOUT or die "error closing STDOUT: $!";
421