// +build !appengine
// +build gc
// +build !purego
4
5#include "textflag.h"
6
// Register allocation:
// AX	h
// CX	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
//
// prime4v is deliberately kept out of R15. When assembling for dynamic
// linking (for example -buildmode=plugin or -buildmode=shared) the Go
// toolchain uses R15 as the temporary for every access to a global such as
// ·prime5v(SB). A value parked in R15 would therefore be destroyed by the
// prime5v load on the noBlocks path before wordLoop reads it.

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
// It clobbers R12 and computes
//	r = rol64(r + load64(CX)*prime2v, 31) * prime1v
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
// val is overwritten with rol64(val*prime2v, 31)*prime1v, and then
//	acc = (acc ^ val)*prime1v + prime4v
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 hashes b in four phases: 32-byte blocks (four lanes v1..v4), then
// 8-byte words, then at most one 4-byte word, then single bytes, followed by
// a final xor-shift/multiply mix. The result is stored to ret+24(FP).
//
// All cursor-versus-bound comparisons below are signed on purpose: for short
// inputs the bound in BX is computed as base+len-32 and can fall below the
// base address (it is negative when base and len are both zero). A signed
// compare still orders such a bound before CX, so the corresponding loop is
// skipped instead of reading past the end of b.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//	v1 = prime1v + prime2v
	//	v2 = prime2v
	//	v3 = 0
	//	v4 = -prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX. Each iteration consumes 32 bytes (four rounds of
	// 8 bytes each).
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into h. mergeRound destroys the lane registers, which
	// are not needed as lanes after this point (R8 is reused for k1 below).
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: h starts from prime5v.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1 = rol(load64(CX)*prime2v, 31) * prime1v.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	// Move the bound to len(b)-4; skip ahead if fewer than 4 bytes remain.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles

	// MOVL zero-extends the 32-bit load into R8.
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rol(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Move the bound to the end of b; finish if nothing remains.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
	// h = rol(h ^ (byte*prime5v), 11) * prime1v
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
	// Final mix:
	//	h ^= h >> 33; h *= prime2v
	//	h ^= h >> 29; h *= prime3v
	//	h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET
171
// writeBlocks folds every complete 32-byte block at the front of b into the
// four accumulators stored at x (offsets 0, 8, 16 and 24) and returns the part
// of b that was not consumed.
//
// Register roles in this function:
// AX	x (pointer to the accumulators)
// CX	read cursor into b
// DX	len(b)
// BX	loop bound, later the length of the returned slice
// R8	v1
// R9	v2
// R10	v3
// R11	v4
// R12	scratch used by round
// R13	prime1v
// R14	prime2v

// func writeBlocks(x *xxh, b []byte) []byte
TEXT ·writeBlocks(SB), NOSPLIT, $0-56
	// Fetch v1..v4 from x.
	MOVQ x+0(FP), AX
	MOVQ 0(AX), R8
	MOVQ 8(AX), R9
	MOVQ 16(AX), R10
	MOVQ 24(AX), R11

	// round expects prime1v in R13 and prime2v in R14.
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime1v(SB), R13

	// CX walks through b; BX = base + len(b) - 32 is the last cursor
	// position at which a whole block still fits.
	MOVQ b_base+8(FP), CX
	MOVQ b_len+16(FP), DX
	LEAQ -32(CX)(DX*1), BX

	// Start with the result base equal to b's base. It is only replaced
	// further down when the returned slice is non-empty.
	MOVQ CX, ret_base+32(FP)

	// There is no bounds check before the first iteration: callers are
	// expected to pass at least one full block.
nextBlock:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  nextBlock

	// Write the updated accumulators back to x.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Build the returned slice. A slice whose base pointer lies past the
	// end of the original allocation makes the Go runtime (1.7+) crash;
	// see, for example, https://github.com/golang/go/issues/16772.
	// So the length is computed first, and when it is zero the base stored
	// above is kept, which is also what the compiler emits for
	//	b = b[len(b):]

	// Remaining length = (BX + 32) - CX.
	ADDQ $32, BX
	SUBQ CX, BX
	JZ   storeLen

	MOVQ CX, ret_base+32(FP)

storeLen:
	// len and cap of the result are equal.
	MOVQ BX, ret_len+40(FP)
	MOVQ BX, ret_cap+48(FP)

	RET
233 RET