1 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
3 Permission is hereby granted, free of charge, to any person obtaining
4 a copy of this software and associated documentation files (the
5 "Software"), to deal in the Software without restriction, including
6 without limitation the rights to use, copy, modify, merge, publish,
7 distribute, sublicense, and/or sell copies of the Software, and to
8 permit persons to whom the Software is furnished to do so, subject to
9 the following conditions:
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 // Common registers are assigned as follows:
26 // t0 Const Tbl Ptr TPtr
27 // t1 Round Constant TRound
28 // t4 Block residual LenResid
29 // t5 Residual Data DTmp
31 // {in,out}0 Block 0 Cycle RotateM0
32 // {in,out}1 Block Value 12 M12
33 // {in,out}2 Block Value 8 M8
34 // {in,out}3 Block Value 4 M4
35 // {in,out}4 Block Value 0 M0
36 // {in,out}5 Block 1 Cycle RotateM1
37 // {in,out}6 Block Value 13 M13
38 // {in,out}7 Block Value 9 M9
39 // {in,out}8 Block Value 5 M5
40 // {in,out}9 Block Value 1 M1
41 // {in,out}10 Block 2 Cycle RotateM2
42 // {in,out}11 Block Value 14 M14
43 // {in,out}12 Block Value 10 M10
44 // {in,out}13 Block Value 6 M6
45 // {in,out}14 Block Value 2 M2
46 // {in,out}15 Block 3 Cycle RotateM3
47 // {in,out}16 Block Value 15 M15
48 // {in,out}17 Block Value 11 M11
49 // {in,out}18 Block Value 7 M7
50 // {in,out}19 Block Value 3 M3
51 // {in,out}20 Scratch Z
52 // {in,out}21 Scratch Y
53 // {in,out}22 Scratch X
54 // {in,out}23 Scratch W
55 // {in,out}24 Digest A A
56 // {in,out}25 Digest B B
57 // {in,out}26 Digest C C
58 // {in,out}27 Digest D D
59 // {in,out}28 Active Data Ptr DPtr
61 // out28 Dummy Value -
62 // bt0 Coroutine Link QUICK_RTN
64 /// These predicates are used for computing the padding block(s) and
65 /// are shared between the driver and digest co-routines
67 // pt0 Extra Pad Block pExtra
68 // pt1 Load next word pLoad
69 // pt2 Skip next word pSkip
70 // pt3 Search for Pad pNoPad
71 // pt4 Pad Word 0 pPad0
72 // pt5 Pad Word 1 pPad1
73 // pt6 Pad Word 2 pPad2
74 // pt7 Pad Word 3 pPad3
89 // These two below shall remain constant throughout the whole routine
// Entry-point selectors: each entry point sets exactly one of these two
// predicates with a single cmp.eq, so they always hold opposite values.
90 #define pDataOrder p14
91 #define pHostOrder p15
// Caller-side (out register) names for the four "Block n Cycle" slots
// 0, 5, 10 and 15 listed in the register table above.
114 #define RotateM0_ out0
115 #define RotateM1_ out5
116 #define RotateM2_ out10
117 #define RotateM3_ out15
// Callee-side (in register) names for the same slots 10 and 15.
146 #define RotateM2 in10
147 #define RotateM3 in15
153 /* register stack configuration for md5_block_asm_host_order(): */
159 /* register stack configuration for helpers: */
// The helpers receive as inputs exactly the registers the driver
// allocated as outputs (MD5_NOUT).
160 #define _NINPUTS MD5_NOUT
// Size of the rotating register region named in the helper alloc and
// .regstk directives. The alloc instruction also requires this to be a
// multiple of 8.
163 #define _NROTATE 24 /* this must be <= _NINPUTS */
165 #if defined(_HPUX_SOURCE) && !defined(_LP64)
171 // Macros for getting the left and right portions of little-endian words
// when the input is misaligned by `align` bytes (1, 2 or 3).
//
// GETLW: dep.z takes the low 8*align bits of src and deposits them at bit
//        position 32 - 8*align, zeroing every other bit of dst. This is
//        the part of the *next* aligned word that completes a 32-bit value.
// GETRW: extr.u extracts the 32 - 8*align bits of src that start at bit
//        8*align and right-justifies them, zero-extended, in dst. This is
//        the part of the *current* aligned word that starts a 32-bit value.
// OR-ing a GETLW result with the previous GETRW result rebuilds one
// unaligned little-endian word.
// NOTE: `align` is substituted textually without parentheses, so pass a
// plain literal.
173 #define GETLW(dst, src, align) dep.z dst = src, 32 - 8 * align, 8 * align
174 #define GETRW(dst, src, align) extr.u dst = src, 8 * align, 32 - 8 * align
178 // Reads an input block, then calls the digest block
179 // subroutine and adds the results to the accumulated
180 // digest. It allocates 32 outs which the subroutine
181 // uses as its inputs and rotating
182 // registers. Initializes the round constant pointer and
183 // takes care of saving/restoring ar.lc
187 // in0 Context Ptr CtxPtr0
188 // in1 Input Data Ptr DPtrIn
189 // in2 Integral Blocks BlockCount
190 // rp Return Address -
194 // v2 Input Align InAlign
195 // t0 Shared w/digest -
196 // t1 Shared w/digest -
197 // t2 Shared w/digest -
198 // t3 Shared w/digest -
199 // t4 Shared w/digest -
200 // t5 Shared w/digest -
201 // t6 PFS Save PFSSave
202 // t7 ar.lc Save LCSave
203 // t8 Saved PR PRSave
204 // t9 2nd CtxPtr CtxPtr1
205 // t10 Table Base CTable
206 // t11 Table[0] CTable0
207 // t13 Accumulator A AccumA
208 // t14 Accumulator B AccumB
209 // t15 Accumulator C AccumC
210 // t16 Accumulator D AccumD
211 // pt0 Shared w/digest -
212 // pt1 Shared w/digest -
213 // pt2 Shared w/digest -
214 // pt3 Shared w/digest -
215 // pt4 Shared w/digest -
216 // pt5 Shared w/digest -
217 // pt6 Shared w/digest -
218 // pt7 Shared w/digest -
219 // pt8 Not Aligned pOff
220 // pt8 Blocks Left pAgain
// Number of blocks still to be processed (third argument, in2). The
// driver decrements it once per block and loops while it is non-zero.
231 #define BlockCount in2
241 /* md5_block_asm_host_order(MD5_CTX *c, const void *data, size_t num)
244 c: a pointer to a structure of this type:
246 typedef struct MD5state_st
250 MD5_LONG data[MD5_LBLOCK];
255 data: a pointer to the input data (may be misaligned)
256 num: the number of 16-word (64-byte) blocks to hash (i.e., the length
// md5_block_asm_data_order: entry point for input in data (little-endian)
// order. It only selects the data-order predicate and then joins the
// common code at .md5_block (label defined outside this excerpt).
261 .type md5_block_asm_data_order, @function
262 .global md5_block_asm_data_order
264 .proc md5_block_asm_data_order
265 md5_block_asm_data_order:
// r0 always equals itself, so this sets pDataOrder = 1 and pHostOrder = 0.
267 cmp.eq pDataOrder,pHostOrder = r0,r0
268 br.sptk.many .md5_block
270 .endp md5_block_asm_data_order
// md5_block_asm_host_order: entry point for input in host order, and home
// of the driver loop shared with md5_block_asm_data_order. The driver
// loads the four digest words from the context, feeds each block to the
// digest co-routine, adds the result into the accumulators and finally
// stores the accumulators back into the context.
// NOTE(review): this excerpt omits several lines (labels, bundle braces,
// #endif, the return sequence); comments describe only what is visible.
272 .type md5_block_asm_host_order, @function
273 .global md5_block_asm_host_order
275 .proc md5_block_asm_host_order
276 md5_block_asm_host_order:
// r0 always equals itself, so this sets pHostOrder = 1 and pDataOrder = 0.
279 cmp.eq pHostOrder,pDataOrder = r0,r0
283 .save ar.pfs, PFSSave
// Set up the register frame; the previous ar.pfs value lands in PFSSave.
284 alloc PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
// Second context pointer, 8 bytes past the first, so two digest words can
// be accessed per step. ADDP is defined outside this excerpt (presumably a
// pointer-width-aware add -- confirm).
285 ADDP CtxPtr1 = 8, CtxPtr0
289 ADDP DPtrIn = 0, DPtrIn
290 ADDP CtxPtr0 = 0, CtxPtr0
295 .pred.rel "mutex",pDataOrder,pHostOrder
// Advance CTable by the distance from .md5_block to the constant table
// that matches the entry point. CTable is presumably preloaded with the
// address of .md5_block (not visible here -- confirm).
297 (pDataOrder) add CTable = .md5_tbl_data_order#-.md5_block#, CTable
298 (pHostOrder) add CTable = .md5_tbl_host_order#-.md5_block#, CTable
// InAlign = misalignment of the input pointer within a 4-byte word (0..3).
299 and InAlign = 0x3, DPtrIn
// A and C are read with a 4-byte post-increment, so the same two pointers
// then address B and D.
303 ld4 AccumA = [CtxPtr0], 4
304 ld4 AccumC = [CtxPtr1], 4
311 ld4 AccumB = [CtxPtr0]
312 ld4 AccumD = [CtxPtr1]
// DPtr_ = input pointer with its two low bits cleared (word aligned).
313 dep DPtr_ = 0, DPtrIn, 0, 2
315 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
316 (pDataOrder) rum psr.be;; // switch to little-endian
// First round constant; CTable is left pointing at the second entry.
319 ld4 CTable0 = [CTable], 4
// Any non-zero misalignment is handled by the MD5UNALIGNED variants.
320 cmp.ne pOff, p0 = 0, InAlign
321 (pOff) br.cond.spnt.many .md5_unaligned
324 // The FF load/compute loop rotates values three times, so that
325 // loading into M12 here produces the M0 value, M13 -> M1, etc.
329 ld4 M12_ = [DPtr_], 4
334 ld4 M13_ = [DPtr_], 4
339 ld4 M14_ = [DPtr_], 4
344 ld4 M15_ = [DPtr_], 4
// One block consumed; QUICK_RTN receives the return link for the
// co-routine call.
345 add BlockCount = -1, BlockCount
346 br.call.sptk.many QUICK_RTN = md5_digest_block0
349 // Now, we add the new digest values and do some clean-up
350 // before checking if there's another full block to process
353 add AccumA = AccumA, A_
354 add AccumB = AccumB, B_
355 cmp.ne pAgain, p0 = 0, BlockCount
358 add AccumC = AccumC, C_
359 add AccumD = AccumD, D_
360 (pAgain) br.cond.dptk.many .md5_block_loop0
364 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
365 (pDataOrder) sum psr.be;; // switch back to big-endian mode
// CtxPtr0/CtxPtr1 still address B and D; the -4 post-decrement steps them
// back to A and C for the second pair of stores.
368 st4 [CtxPtr0] = AccumB, -4
369 st4 [CtxPtr1] = AccumD, -4
// Restore the predicate registers from PRSave (saved outside this excerpt).
370 mov pr = PRSave, 0x1ffff ;;
373 st4 [CtxPtr0] = AccumA
374 st4 [CtxPtr1] = AccumC
// MD5UNALIGNED(offset): driver loop for input misaligned by `offset` bytes.
// On entry at .md5_process<offset>, DTmp holds the first aligned word and
// is reduced to its right portion. Each iteration loads the next aligned
// word into Y_, builds the first message word as GETLW(Y_) | DTmp, keeps
// GETRW(Y_) in DTmp for the next word, calls md5_digest_block<offset>,
// accumulates the digest, and loops while BlockCount is non-zero before
// branching to .md5_exit.
382 #define MD5UNALIGNED(offset) \
383 .md5_process##offset: \
386 GETRW(DTmp, DTmp, offset) ; \
388 .md5_block_loop##offset: \
390 ld4 Y_ = [DPtr_], 4 ; \
391 mov TPtr = CTable ; \
392 mov TRound = CTable0 ; \
395 ld4 M13_ = [DPtr_], 4 ; \
400 ld4 M14_ = [DPtr_], 4 ; \
401 GETLW(W_, Y_, offset) ; \
406 or M12_ = W_, DTmp ; \
407 GETRW(DTmp, Y_, offset) ; \
410 ld4 M15_ = [DPtr_], 4 ; \
411 add BlockCount = -1, BlockCount ; \
412 br.call.sptk.many QUICK_RTN = md5_digest_block##offset; \
415 add AccumA = AccumA, A_ ; \
416 add AccumB = AccumB, B_ ; \
417 cmp.ne pAgain, p0 = 0, BlockCount ; \
420 add AccumC = AccumC, C_ ; \
421 add AccumD = AccumD, D_ ; \
422 (pAgain) br.cond.dptk.many .md5_block_loop##offset ; \
427 br.cond.sptk.many .md5_exit ; \
433 // Because variable shifts are expensive, we special case each of
434 // the four alignments. In practice, this won't hurt too much
435 // since only one working set of code will be loaded.
// Fetch the first aligned word, then dispatch on the misalignment.
438 ld4 DTmp = [DPtr_], 4
439 cmp.eq pOff, p0 = 1, InAlign
440 (pOff) br.cond.dpnt.many .md5_process1
443 cmp.eq pOff, p0 = 2, InAlign
445 (pOff) br.cond.dpnt.many .md5_process2
// NOTE(review): the remaining case (InAlign == 3) is presumably handled by
// code that follows here but is not part of this excerpt.
451 .endp md5_block_asm_host_order
454 // MD5 Perform the F function and load
456 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
457 // computes the FF() round of functions, then branches to the common
458 // digest code to finish up with GG(), HH(), and II().
462 // rp Return Address -
466 // v0 PFS bit bucket PFS
467 // v1 Loop Trip Count LTrip
468 // pt0 Load next word pMore
475 /* For GHI rounds: */
// COMPUTE(a, b, s, M, R): one step of a GG/HH/II round. Only part of the
// body is visible in this excerpt. The visible lines
//   - fetch the next round constant into TRound (TPtr += 4), and
//   - rotate the 32-bit value in Z left by s: dep.z copies the low 32 bits
//     of Z into the upper half of Y, and shrp then shifts the Z:Y pair
//     right by 64 - s, leaving the rotated value in the low 32 bits of Z.
480 #define COMPUTE(a, b, s, M, R) \
483 ld4 TRound = [TPtr], 4 ; \
484 dep.z Y = Z, 32, 32 ;; \
485 shrp Z = Z, Y, 64 - s ; \
// LOOP(a, b, s, M, R, label): same visible work as COMPUTE, followed by a
// br.ctop back to `label`, i.e. the counted-loop branch that closes one
// group of four steps.
494 #define LOOP(a, b, s, M, R, label) \
496 ld4 TRound = [TPtr], 4 ; \
497 dep.z Y = Z, 32, 32 ;; \
498 shrp Z = Z, Y, 64 - s ; \
503 br.ctop.sptk.many label ; \
// The G, H and I macros below each start by adding the message word M to
// the current round constant (Z = M + TRound); the rest of each body is
// not visible in this excerpt. The COMPUTE/LOOP groups further down use
// shift counts 5/9/14/20, 4/11/16/23 and 6/10/15/21, the rotation amounts
// RFC 1321 gives for rounds 2, 3 and 4.
506 // G(B, C, D) = (B & D) | (C & ~D)
508 #define G(a, b, c, d, M) \
510 add Z = M, TRound ; \
520 // H(B, C, D) = B ^ C ^ D
522 #define H(a, b, c, d, M) \
524 add Z = M, TRound ; \
534 // I(B, C, D) = C ^ (B | ~D)
536 // However, since we have an andcm operator, we use the fact that
540 // to rewrite the expression as
542 // I(B, C, D) = ~C ^ (~B & D)
544 #define I(a, b, c, d, M) \
546 add Z = M, TRound ; \
558 COMPUTE(A, B, 5, M0, RotateM0) \
560 COMPUTE(D, A, 9, M1, RotateM1) \
562 COMPUTE(C, D, 14, M2, RotateM2) \
564 LOOP(B, C, 20, M3, RotateM3, label)
568 COMPUTE(A, B, 4, M0, RotateM0) \
570 COMPUTE(D, A, 11, M1, RotateM1) \
572 COMPUTE(C, D, 16, M2, RotateM2) \
574 LOOP(B, C, 23, M3, RotateM3, label)
578 COMPUTE(A, B, 6, M0, RotateM0) \
580 COMPUTE(D, A, 10, M1, RotateM1) \
582 COMPUTE(C, D, 15, M2, RotateM2) \
584 LOOP(B, C, 21, M3, RotateM3, label)
// FFLOAD(a, b, c, d, M, N, s): one FF step for aligned input that also
// fetches input data. Visible lines: while pMore is set, load the next
// input word into N (DPtr += 4); form Z = M + TRound; fetch the next round
// constant (TPtr += 4); rotate the low 32 bits of Z left by s using the
// dep.z/shrp pair. Other lines of the body are not part of this excerpt.
586 #define FFLOAD(a, b, c, d, M, N, s) \
588 (pMore) ld4 N = [DPtr], 4 ; \
589 add Z = M, TRound ; \
598 ld4 TRound = [TPtr], 4 ; \
600 dep.z Y = Z, 32, 32 ; \
604 shrp Z = Z, Y, 64 - s ;; \
// FFLOOP(a, b, c, d, M, N, s, dest): FFLOAD plus loop control. pMore is
// recomputed from LTrip (set while LTrip is non-zero), LTrip is
// decremented, and br.ctop branches back to `dest`.
608 #define FFLOOP(a, b, c, d, M, N, s, dest) \
610 (pMore) ld4 N = [DPtr], 4 ; \
611 add Z = M, TRound ; \
620 ld4 TRound = [TPtr], 4 ; \
622 dep.z Y = Z, 32, 32 ; \
626 shrp Z = Z, Y, 64 - s ;; \
630 cmp.ne pMore, p0 = 0, LTrip ; \
631 add LTrip = -1, LTrip ; \
632 br.ctop.dptk.many dest ; \
// md5_digest_block0: FF round for word-aligned input. It is called by the
// driver with the first four message words already loaded, and the FFLOAD
// steps fetch the remaining words while computing.
635 .type md5_digest_block0, @function
638 .proc md5_digest_block0
// Helper frame: the driver outputs become this routine's inputs. PFS is
// only a sink for the old ar.pfs value ("PFS bit bucket").
644 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
// Start with pMore set so the first pass loads input words.
649 cmp.eq pMore, p0 = r0, r0
// Four FF steps per loop pass with shifts 7/12/17/22; FFLOOP branches back
// to .md5_FF_round0 (label not visible in this excerpt).
655 FFLOAD(A, B, C, D, M12, RotateM0, 7)
656 FFLOAD(D, A, B, C, M13, RotateM1, 12)
657 FFLOAD(C, D, A, B, M14, RotateM2, 17)
658 FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
660 // !!! Fall through to md5_digest_GHI
662 .endp md5_digest_block0
// md5_digest_GHI: common tail that performs the GG, HH and II rounds. It
// is reached by fall-through from md5_digest_block0 and by branch from the
// unaligned md5_digest_block<offset> variants, then returns to the driver.
// Each pair of number rows below shows the message-word order before and
// after a shuffle. NOTE(review): the shuffle instructions themselves are
// not part of this excerpt.
664 .type md5_digest_GHI, @function
// No alloc here: .regstk only tells the assembler which register frame is
// already in effect when control arrives.
669 .regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
674 // The following sequence shuffles the block constants round for the
677 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
678 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
727 // The following sequence shuffles the block constants round for the
730 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
731 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
780 // The following sequence shuffles the block constants round for the
783 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
784 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
// Return to the driver through the coroutine link register.
836 br.ret.sptk.many QUICK_RTN
// FFLOADU(a, b, c, d, M, P, N, s, offset): FF step for input misaligned by
// `offset` bytes. Besides the FFLOAD work (predicated load of N,
// Z = M + TRound, next TRound, rotate left by s) it realigns the raw word
// P: W receives the left portion of P and DTmp the right portion, which is
// carried into the next word. Other lines of the body are not part of this
// excerpt.
841 #define FFLOADU(a, b, c, d, M, P, N, s, offset) \
843 (pMore) ld4 N = [DPtr], 4 ; \
844 add Z = M, TRound ; \
853 ld4 TRound = [TPtr], 4 ; \
854 GETLW(W, P, offset) ; \
859 dep.z Y = Z, 32, 32 ;; \
860 shrp Z = Z, Y, 64 - s ; \
864 GETRW(DTmp, P, offset) ; \
// FFLOOPU(a, b, c, d, M, P, N, s, offset): as FFLOADU, but the realignment
// runs only while pMore is set and its result is written back (P = W). It
// then recomputes pMore from LTrip, decrements LTrip and closes the loop
// with br.ctop to .md5_FF_round<offset>.
868 #define FFLOOPU(a, b, c, d, M, P, N, s, offset) \
870 (pMore) ld4 N = [DPtr], 4 ; \
871 add Z = M, TRound ; \
880 ld4 TRound = [TPtr], 4 ; \
881 (pMore) GETLW(W, P, offset) ; \
885 (pMore) or W = W, DTmp ; \
886 dep.z Y = Z, 32, 32 ;; \
887 shrp Z = Z, Y, 64 - s ; \
891 (pMore) GETRW(DTmp, P, offset) ; \
892 (pMore) mov P = W ; \
895 cmp.ne pMore, p0 = 0, LTrip ; \
896 add LTrip = -1, LTrip ; \
897 br.ctop.sptk.many .md5_FF_round##offset ; \
// MD5FBLOCK(offset): emits md5_digest_block<offset>, the FF-round entry the
// MD5UNALIGNED(offset) driver loop calls when the input is misaligned by
// `offset` bytes. The generated procedure sets up the helper register
// frame, sets pMore, runs the FF steps (shifts 7/12/17/22) through the
// realigning FFLOADU/FFLOOPU macros, and then branches to the shared
// md5_digest_GHI tail.
// The closing .endp names the same symbol as the opening .proc. It used to
// name "md5digestBlock<offset>", which is never declared.
900 #define MD5FBLOCK(offset) \
901 .type md5_digest_block##offset, @function ; \
904 .proc md5_digest_block##offset ; \
908 md5_digest_block##offset: \
910 alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ; \
915 cmp.eq pMore, p0 = r0, r0 ; \
920 .pred.rel "mutex", pLoad, pSkip ; \
921 .md5_FF_round##offset: \
922 FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset) \
923 FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset) \
924 FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset) \
925 FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset) \
930 br.cond.sptk.many md5_digest_GHI ; \
932 .endp md5_digest_block##offset
// md5_constants: two copies of the 64 MD5 round constants (the T[1..64]
// table of RFC 1321). The driver picks one copy according to the entry
// point used and reads entries with 4-byte loads.
//   .md5_tbl_data_order - each constant spelled as four bytes, least
//                         significant first, so the in-memory layout is
//                         little-endian whatever the assembler default is.
//   .md5_tbl_host_order - the same constants as data4 words, laid out in
//                         the byte order the assembler targets.
// Entry N of the first table and entry N of the second encode the same
// 32-bit value.
939 .type md5_constants, @object
941 .md5_tbl_data_order: // To ensure little-endian data
942 // order, code as bytes.
943 data1 0x78, 0xa4, 0x6a, 0xd7 // 0
944 data1 0x56, 0xb7, 0xc7, 0xe8 // 1
945 data1 0xdb, 0x70, 0x20, 0x24 // 2
946 data1 0xee, 0xce, 0xbd, 0xc1 // 3
947 data1 0xaf, 0x0f, 0x7c, 0xf5 // 4
948 data1 0x2a, 0xc6, 0x87, 0x47 // 5
949 data1 0x13, 0x46, 0x30, 0xa8 // 6
950 data1 0x01, 0x95, 0x46, 0xfd // 7
951 data1 0xd8, 0x98, 0x80, 0x69 // 8
952 data1 0xaf, 0xf7, 0x44, 0x8b // 9
953 data1 0xb1, 0x5b, 0xff, 0xff // 10
954 data1 0xbe, 0xd7, 0x5c, 0x89 // 11
955 data1 0x22, 0x11, 0x90, 0x6b // 12
956 data1 0x93, 0x71, 0x98, 0xfd // 13
957 data1 0x8e, 0x43, 0x79, 0xa6 // 14
958 data1 0x21, 0x08, 0xb4, 0x49 // 15
959 data1 0x62, 0x25, 0x1e, 0xf6 // 16
960 data1 0x40, 0xb3, 0x40, 0xc0 // 17
961 data1 0x51, 0x5a, 0x5e, 0x26 // 18
962 data1 0xaa, 0xc7, 0xb6, 0xe9 // 19
963 data1 0x5d, 0x10, 0x2f, 0xd6 // 20
964 data1 0x53, 0x14, 0x44, 0x02 // 21
965 data1 0x81, 0xe6, 0xa1, 0xd8 // 22
966 data1 0xc8, 0xfb, 0xd3, 0xe7 // 23
967 data1 0xe6, 0xcd, 0xe1, 0x21 // 24
968 data1 0xd6, 0x07, 0x37, 0xc3 // 25
969 data1 0x87, 0x0d, 0xd5, 0xf4 // 26
970 data1 0xed, 0x14, 0x5a, 0x45 // 27
971 data1 0x05, 0xe9, 0xe3, 0xa9 // 28
972 data1 0xf8, 0xa3, 0xef, 0xfc // 29
973 data1 0xd9, 0x02, 0x6f, 0x67 // 30
974 data1 0x8a, 0x4c, 0x2a, 0x8d // 31
975 data1 0x42, 0x39, 0xfa, 0xff // 32
976 data1 0x81, 0xf6, 0x71, 0x87 // 33
977 data1 0x22, 0x61, 0x9d, 0x6d // 34
978 data1 0x0c, 0x38, 0xe5, 0xfd // 35
979 data1 0x44, 0xea, 0xbe, 0xa4 // 36
980 data1 0xa9, 0xcf, 0xde, 0x4b // 37
981 data1 0x60, 0x4b, 0xbb, 0xf6 // 38
982 data1 0x70, 0xbc, 0xbf, 0xbe // 39
983 data1 0xc6, 0x7e, 0x9b, 0x28 // 40
984 data1 0xfa, 0x27, 0xa1, 0xea // 41
985 data1 0x85, 0x30, 0xef, 0xd4 // 42
986 data1 0x05, 0x1d, 0x88, 0x04 // 43
987 data1 0x39, 0xd0, 0xd4, 0xd9 // 44
988 data1 0xe5, 0x99, 0xdb, 0xe6 // 45
989 data1 0xf8, 0x7c, 0xa2, 0x1f // 46
990 data1 0x65, 0x56, 0xac, 0xc4 // 47
991 data1 0x44, 0x22, 0x29, 0xf4 // 48
992 data1 0x97, 0xff, 0x2a, 0x43 // 49
993 data1 0xa7, 0x23, 0x94, 0xab // 50
994 data1 0x39, 0xa0, 0x93, 0xfc // 51
995 data1 0xc3, 0x59, 0x5b, 0x65 // 52
996 data1 0x92, 0xcc, 0x0c, 0x8f // 53
997 data1 0x7d, 0xf4, 0xef, 0xff // 54
998 data1 0xd1, 0x5d, 0x84, 0x85 // 55
999 data1 0x4f, 0x7e, 0xa8, 0x6f // 56
1000 data1 0xe0, 0xe6, 0x2c, 0xfe // 57
1001 data1 0x14, 0x43, 0x01, 0xa3 // 58
1002 data1 0xa1, 0x11, 0x08, 0x4e // 59
1003 data1 0x82, 0x7e, 0x53, 0xf7 // 60
1004 data1 0x35, 0xf2, 0x3a, 0xbd // 61
1005 data1 0xbb, 0xd2, 0xd7, 0x2a // 62
1006 data1 0x91, 0xd3, 0x86, 0xeb // 63
1008 .md5_tbl_host_order: // OS data order, might as well
1009 // be little-endian.
// Four constants per line; the trailing number is the index of the first.
1010 data4 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee // 0
1011 data4 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 // 4
1012 data4 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be // 8
1013 data4 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 // 12
1014 data4 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa // 16
1015 data4 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 // 20
1016 data4 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed // 24
1017 data4 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a // 28
1018 data4 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c // 32
1019 data4 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 // 36
1020 data4 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 // 40
1021 data4 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 // 44
1022 data4 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 // 48
1023 data4 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 // 52
1024 data4 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 // 56
1025 data4 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 // 60
// 64 constants * 4 bytes * 2 tables.
1026 .size md5_constants#,64*4*2