blob: d580e32aed4afb344b5a652d1654878cfb9f2abc [file] [log] [blame]
Joey Armstronge8c091f2023-01-17 16:56:26 -05001// +build !appengine
2// +build gc
3// +build !purego
4
5#include "textflag.h"
6
// Register allocation:
// AX	h
// CX	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
//
// R15 is deliberately left unused. When this file is assembled with -dynlink
// (shared libraries, plugins), every access to a global such as ·prime5v(SB)
// is rewritten to go through the GOT using R15 as a scratch register, so a
// value parked in R15 would be destroyed by the next global load.

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
// Clobbers R12 and the flags.
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
// val is overwritten in the process.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Computes the 64-bit hash of b in four phases: 32-byte blocks, remaining
// 8-byte words, an optional 4-byte chunk, and trailing single bytes, followed
// by the final avalanche mix.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes. prime4v goes in DI, not R15 (see the register
	// allocation notes at the top of the file).
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1v + prime2v
	//   v2 = prime2v
	//   v3 = 0
	//   v4 = -prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX. Each iteration consumes 32 bytes (4 rounds of 8).
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each accumulator into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: h starts out as prime5v.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	// Move the limit to len(b)-4; skip ahead if fewer than 4 bytes remain.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles

	// MOVL zero-extends the 32-bit load into R8.
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rol(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Move the limit to len(b), i.e. one past the last input byte.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
	// h = rol(h ^ (byte * prime5v), 11) * prime1v
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
	// Final avalanche:
	//   h ^= h >> 33; h *= prime2v
	//   h ^= h >> 29; h *= prime3v
	//   h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET
171
172// writeBlocks uses the same registers as above except that it uses AX to store
173// the d pointer.
174
175// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
	// Fetch the digest pointer and pull its four accumulators into registers.
	MOVQ d+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// The round macro expects prime1v in R13 and prime2v in R14.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// CX walks the input; BX is the loop limit, base + len(b) - 32.
	MOVQ b_base+8(FP), CX
	MOVQ b_len+16(FP), DX
	LEAQ -32(CX)(DX*1), BX

	// There is no up-front length check: this function is always called
	// with at least one block of data to process, so the body runs at least
	// once. Each pass consumes 32 bytes (four 8-byte rounds).
consume:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  consume

	// Write the updated accumulators back into the digest.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Return the number of bytes consumed: the final pointer minus the
	// original base pointer.
	SUBQ b_base+8(FP), CX
	MOVQ CX, ret+32(FP)

	RET
215 RET