blob: 4d64a17d69c127a25765fd3fb2b7f0956179ae6f [file] [log] [blame]
kesavandc71914f2022-03-25 11:19:03 +05301// +build gc,!purego,!noasm
2
3#include "textflag.h"
4
// Register allocation.
//
// Symbolic names for the general-purpose registers used by Sum64,
// writeBlocks and the macros in this file, grouped by role.

// Input cursor and sizes.
#define p	R3 // Input pointer; moved forward by the post-increment loads.
#define len	R4 // Input length in bytes.
#define nblocks	R5 // len / 32; counts down in blocksLoop.

// Hash state.
#define digest	R1 // Address of the Digest handed to writeBlocks.
#define h	R2 // Running hash; Sum64 stores it as the return value.

// Lane accumulators updated by round().
#define v1	R12
#define v2	R13
#define v3	R14
#define v4	R15

// Multiplicative constants, loaded from the primes<> table.
#define prime1	R7
#define prime2	R8
#define prime3	R9
#define prime4	R10
#define prime5	R11

// Scratch registers holding words read from the input.
#define x1	R20
#define x2	R21
#define x3	R22
#define x4	R23
24
// round folds one input word into a lane accumulator:
//
//	lane = rol31(lane + word*prime2) * prime1
//
// Only lane is written; word is left untouched.
#define round(lane, word) \
	MADD prime2, lane, word, lane \
	ROR  $64-31, lane \
	MUL  prime1, lane \
29
// round0 is round() with a zero accumulator, applied in place:
//
//	val = rol31(val*prime2) * prime1
#define round0(val) \
	MUL prime2, val \
	ROR $64-31, val \
	MUL prime1, val \
35
// mergeRound mixes one lane accumulator into h:
//
//	lane = round0(lane)
//	h    = (h ^ lane)*prime1 + prime4
//
// The lane register is overwritten in the process.
#define mergeRound(lane) \
	round0(lane) \
	EOR  lane, h \
	MADD h, prime4, prime1, h \
40
// blocksLoop updates v1..v4 from the input, 32 bytes per iteration
// (one 8-byte word per lane), and leaves p just past the last full block.
//
// Assumes len >= 32: the body runs before the counter is tested, so a
// block count of zero is not handled here.
#define blocksLoop() \
	LSR     $5, len, nblocks \
	PCALIGN $16 \
	loop: \
	LDP.P   32(p), (x1, x2) \
	LDP     -16(p), (x3, x4) \
	round(v1, x1) \
	round(v2, x2) \
	round(v3, x3) \
	round(v4, x4) \
	SUB     $1, nblocks \
	CBNZ    nblocks, loop \
54
// primes<> is a file-local, read-only table of the five 64-bit constants.
// They are repeated here, laid out back to back, so that neighbouring
// pairs can be fetched with a single LDP.
DATA primes<>+0(SB)/8, $0x9E3779B185EBCA87  // prime1
DATA primes<>+8(SB)/8, $0xC2B2AE3D27D4EB4F  // prime2
DATA primes<>+16(SB)/8, $0x165667B19E3779F9 // prime3
DATA primes<>+24(SB)/8, $0x85EBCA77C2B2AE63 // prime4
DATA primes<>+32(SB)/8, $0x27D4EB2F165667C5 // prime5
GLOBL primes<>(SB), RODATA|NOPTR, $40
63
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single pass: full 32-byte blocks go through the
// four-lane loop, the remaining 0..31 bytes are folded in by testing
// bits 4..0 of len, and a final shift/multiply mix produces the result.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Constants first, then the slice's base pointer and length.
	LDP  primes<>+0(SB), (prime1, prime2)
	LDP  primes<>+16(SB), (prime3, prime4)
	MOVD primes<>+32(SB), prime5
	LDP  b_base+0(FP), (p, len)

	// Inputs shorter than one block start from h = prime5 and skip the
	// lanes entirely; otherwise h is zeroed here and rebuilt from the
	// lanes below.
	CMP  $32, len
	CSEL LO, prime5, ZR, h
	BLO  tail

	// Lane seeds: v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	ADD  prime2, prime1, v1
	MOVD prime2, v2
	MOVD ZR, v3
	NEG  prime1, v4

	blocksLoop()

	// h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x1, x2
	ADD x3, x4
	ADD x2, x4, h

	// Fold each lane into h.
	mergeRound(v1)
	mergeRound(v2)
	mergeRound(v3)
	mergeRound(v4)

tail:
	// h += len
	ADD len, h

	// 16 bytes: two 8-byte steps, each
	//   h = rol27(h ^ round0(word))*prime1 + prime4
	TBZ   $4, len, tail8
	LDP.P 16(p), (x1, x2)
	round0(x1)
	round0(x2)

	ROR  $64-27, h
	EOR  x1 @> 64-27, h
	MADD h, prime4, prime1, h

	ROR  $64-27, h
	EOR  x2 @> 64-27, h
	MADD h, prime4, prime1, h

tail8:
	// 8 bytes: one more step of the same form.
	TBZ    $3, len, tail4
	MOVD.P 8(p), x1
	round0(x1)

	ROR  $64-27, h
	EOR  x1 @> 64-27, h
	MADD h, prime4, prime1, h

tail4:
	// 4 bytes (zero-extended): h = rol23(h ^ word*prime1)*prime2 + prime3
	TBZ     $2, len, tail2
	MOVWU.P 4(p), x2
	MUL     prime1, x2

	ROR  $64-23, h
	EOR  x2 @> 64-23, h
	MADD h, prime3, prime2, h

tail2:
	// 2 bytes, loaded as one halfword and split; each byte is applied
	// separately, low byte of the halfword first:
	//   h = rol11(h ^ byte*prime5) * prime1
	TBZ     $1, len, tail1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2
	MUL     prime5, x1
	MUL     prime5, x2

	ROR $64-11, h
	EOR x1 @> 64-11, h
	MUL prime1, h

	ROR $64-11, h
	EOR x2 @> 64-11, h
	MUL prime1, h

tail1:
	// Last byte, same step as above.
	TBZ   $0, len, avalanche
	MOVBU (p), x4
	MUL   prime5, x4

	ROR $64-11, h
	EOR x4 @> 64-11, h
	MUL prime1, h

avalanche:
	// Final mix: h ^= h>>33; h *= prime2; h ^= h>>29; h *= prime3; h ^= h>>32
	EOR h >> 33, h
	MUL prime2, h
	EOR h >> 29, h
	MUL prime3, h
	EOR h >> 32, h

	MOVD h, ret+24(FP)
	RET
164
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs every full 32-byte block of b through the lane
// accumulators held in d and returns the number of bytes consumed,
// i.e. len(b) rounded down to a multiple of 32.
//
// Callers are expected to pass len(b) >= 32. A shorter b is treated as
// "no full blocks": d is left untouched and 0 is returned. Without this
// check blocksLoop would run with a block count of zero, wrap the
// counter, and read far beyond the end of b.
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
	LDP b_base+8(FP), (p, len)

	// Nothing to do without at least one full block.
	CMP $32, len
	BLO done

	// round() needs only prime1 and prime2.
	LDP primes<>(SB), (prime1, prime2)

	// Load state. Assume v[1-4] are stored contiguously, starting at
	// offset 0 of the Digest.
	MOVD d+0(FP), digest
	LDP  0(digest), (v1, v2)
	LDP  16(digest), (v3, v4)

	blocksLoop()

	// Store updated state.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

done:
	// Bytes consumed = len with the low five bits cleared
	// (0 when len < 32).
	BIC  $31, len
	MOVD len, ret+32(FP)
	RET