blob: cea178561970915c80ee85bbf879a8b5e63059ea [file] [log] [blame]
// +build !appengine
// +build gc
// +build !purego
// +build !noasm
5
6#include "textflag.h"
7
// Register allocation:
// AX	h
// SI	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
21
// round folds the next 8 input bytes into the accumulator register r:
//
//	r = rol64(r + load64(SI)*prime2v, 31) * prime1v
//
// On entry:  SI is the read cursor, R13 holds prime1v, R14 holds prime2v.
// On exit:   SI has been advanced by 8 and R12 (scratch) is clobbered.
// r must not be SI, R12, R13 or R14.
#define round(r) \
	MOVQ  (SI), R12 \
	IMULQ R14, R12  \
	LEAQ  8(SI), SI \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r
31
// mergeRound mixes the lane register val into the running hash acc:
//
//	val = rol64(val*prime2v, 31) * prime1v
//	acc = (acc ^ val)*prime1v + prime4v
//
// On entry:  R13 holds prime1v, R14 holds prime2v, DI holds prime4v.
// On exit:   both acc and val have been overwritten.
#define mergeRound(acc, val) \
	IMULQ R14, val           \
	ROLQ  $31, val           \
	IMULQ R13, val           \
	XORQ  val, acc           \
	IMULQ R13, acc           \
	LEAQ  (acc)(DI*1), acc
41
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in four phases and stores the 64-bit result in ret:
//   1. while at least 32 bytes remain, four lanes (R8-R11) each absorb
//      8 bytes per iteration via round, and are then merged into AX;
//   2. remaining 8-byte words are folded into AX one at a time;
//   3. one remaining 4-byte word, if present, is folded in;
//   4. remaining single bytes are folded in, followed by a final
//      shift/xor/multiply mix.
// Register roles follow the allocation table at the top of the file.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Slice header: SI = &b[0] (read cursor), DX = len(b).
	MOVQ b_base+0(FP), SI
	MOVQ b_len+8(FP), DX

	// Primes that round and mergeRound expect to find in registers.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// BX = &b[len(b)] - 32, the last address at which a whole
	// 32-byte block can still start.
	LEAQ -32(SI)(DX*1), BX

	// Starting value of h for inputs shorter than one block. When the
	// block phase runs, AX is overwritten with the merged lanes instead.
	MOVQ ·prime5v(SB), AX

	CMPQ DX, $32
	JLT  tail

	// Lane seeds: v1 = prime1v+prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v.
	LEAQ (R13)(R14*1), R8
	MOVQ R14, R9
	XORQ R10, R10
	MOVQ R13, R11
	NEGQ R11

	// Consume 32 bytes per iteration for as long as SI <= BX.
stripeLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  stripeLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

tail:
	// h += len(b)
	ADDQ DX, AX

	// BX moves from end-32 to end-8: the last address at which an
	// 8-byte word can still start.
	ADDQ $24, BX
	CMPQ SI, BX
	JG   lastFour

eightLoop:
	// k1 is a round applied to a zero accumulator:
	// R8 = rol(load64(SI)*prime2v, 31) * prime1v, and SI += 8.
	XORQ R8, R8
	round(R8)

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ SI, BX
	JLE  eightLoop

lastFour:
	// BX = end-4; skip this step unless at least 4 bytes remain.
	ADDQ $4, BX
	CMPQ SI, BX
	JG   lastBytes

	// h = rol(h ^ (load32(SI)*prime1v), 23)*prime2v + prime3v
	// (the 32-bit load zero-extends into R8).
	MOVL  (SI), R8
	LEAQ  4(SI), SI
	IMULQ R13, R8
	XORQ  R8, AX
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

lastBytes:
	// BX = end; nothing to do once the cursor has reached it.
	ADDQ $4, BX
	CMPQ SI, BX
	JGE  avalanche

byteLoop:
	// h = rol(h ^ (byte*prime5v), 11) * prime1v
	MOVBQZX (SI), R12
	INCQ    SI
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX
	ROLQ    $11, AX
	IMULQ   R13, AX

	CMPQ SI, BX
	JL   byteLoop

avalanche:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX

	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX

	// h ^= h >> 32
	MOVQ AX, R12
	SHRQ $32, R12
	XORQ R12, AX

	MOVQ AX, ret+24(FP)
	RET
172
// writeBlocks shares the register allocation listed at the top of the file,
// with one difference: AX holds the d pointer rather than the hash value.

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs the four-lane round loop over b, starting from the lane
// values stored in the first four 8-byte words of *d and writing the updated
// lanes back there. It returns the number of bytes consumed.
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// AX = d; pull the four lanes out of the digest.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// Primes that round expects to find in registers.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// SI = &b[0] (read cursor), DX = len(b),
	// BX = &b[len(b)] - 32 (last address where a whole block can start).
	MOVQ b_base+8(FP), SI
	MOVQ b_len+16(FP), DX
	LEAQ -32(SI)(DX*1), BX

	// No up-front length check: callers always supply at least one
	// full 32-byte block, so the first iteration is unconditional.
lanes:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ SI, BX
	JLE  lanes

	// Store the updated lanes back into the digest.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Bytes consumed = final cursor - original base pointer.
	SUBQ b_base+8(FP), SI
	MOVQ SI, ret+32(FP)

	RET
216 RET