//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Registers:
//
// These aliases are shared by every macro and routine in this file.
// h and d name the same register (AX); Sum64 uses only h and writeBlocks
// uses only d, so the two never conflict.
#define h      AX // running hash value (Sum64)
#define d      AX // pointer to the digest state (writeBlocks)
#define p      SI // pointer to advance through b
#define n      DX // len(b)
#define end    BX // loop end
#define v1     R8 // accumulator lane 1
#define v2     R9 // accumulator lane 2
#define v3     R10 // accumulator lane 3
#define v4     R11 // accumulator lane 4
#define x      R12 // scratch: current input word
#define prime1 R13 // loaded from ·primes+0(SB)
#define prime2 R14 // loaded from ·primes+8(SB)
#define prime4 DI // loaded from ·primes+24(SB)

// round folds the 64-bit input word x into the accumulator acc:
//
//	x   *= prime2
//	acc += x
//	acc  = rotl(acc, 31)
//	acc *= prime1
//
// The result is left in acc; x is clobbered. prime1 and prime2 must
// already be loaded.
#define round(acc, x) \
	IMULQ prime2, x   \
	ADDQ  x, acc      \
	ROLQ  $31, acc    \
	IMULQ prime1, acc

// round0 performs the operation x = round(0, x).
// Because the accumulator is zero the add is dropped and the result is
// produced in place: x = rotl(x*prime2, 31) * prime1.
// prime1 and prime2 must already be loaded.
#define round0(x) \
	IMULQ prime2, x \
	ROLQ  $31, x    \
	IMULQ prime1, x

// mergeRound applies a merge round on the two registers acc and x:
//
//	acc = (acc ^ round0(x)) * prime1 + prime4
//
// The result is left in acc; x is clobbered.
// It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
	round0(x)         \
	XORQ  x, acc      \
	IMULQ prime1, acc \
	ADDQ  prime4, acc

// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
//
// Each iteration reads four 64-bit words at p+0, p+8, p+16 and p+24, feeds
// them to round() for v1..v4 respectively, then advances p by 32. The loop
// repeats while p <= end (signed compare), so callers set end to
// base+len-32 beforehand. x is clobbered.
//
// The macro expands to a `loop:` label, so expand it at most once per
// function.
#define blockLoop() \
loop:  \
	MOVQ +0(p), x  \
	round(v1, x)   \
	MOVQ +8(p), x  \
	round(v2, x)   \
	MOVQ +16(p), x \
	round(v3, x)   \
	MOVQ +24(p), x \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  loop

// func Sum64(b []byte) uint64
//
// Sum64 hashes the bytes of b and stores the 64-bit result in ret+24(FP).
// The frame is $0-32: no locals, 24 bytes of slice header plus the 8-byte
// result.
//
// Phases:
//  1. If len(b) >= 32, fold every whole 32-byte block into v1..v4 and merge
//     the four lanes into h; otherwise seed h from ·primes+32(SB).
//  2. Add len(b) to h.
//  3. Consume the remaining input 8 bytes at a time, then at most one
//     4-byte word, then one byte at a time.
//  4. Apply the final shift/xor/multiply mixing and return h.
//
// All comparisons of p against end use signed conditions. end is computed
// as p+len(b) minus a bias, so for short inputs it can lie below p.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Load fixed primes.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+24(SB), prime4

	// Load slice.
	MOVQ b_base+0(FP), p
	MOVQ b_len+8(FP), n
	LEAQ (p)(n*1), end

	// The first loop limit will be len(b)-32.
	SUBQ $32, end

	// Check whether we have at least one block.
	CMPQ n, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	MOVQ prime1, v1
	ADDQ prime2, v1
	MOVQ prime2, v2
	XORQ v3, v3
	XORQ v4, v4
	SUBQ prime1, v4

	blockLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
	// The rotations are done on copies so v1..v4 stay intact for the
	// merge rounds below.
	MOVQ v1, h
	ROLQ $1, h
	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h
	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h
	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Fold each lane into h. mergeRound clobbers its second operand,
	// which is fine because v1..v4 are not used again.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: start from the constant at ·primes+32(SB)
	// (presumably the fifth prime; confirm against the Go declaration).
	MOVQ ·primes+32(SB), h

afterBlocks:
	// Mix in the total input length.
	ADDQ n, h

	// end was base+len-32; move the limit to base+len-8 so the loop below
	// runs while at least 8 bytes remain.
	ADDQ $24, end
	CMPQ p, end
	JG   try4

loop8:
	MOVQ  (p), x
	ADDQ  $8, p
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	CMPQ p, end
	JLE  loop8

try4:
	// Limit becomes base+len-4: take one 4-byte word if it fits.
	ADDQ $4, end
	CMPQ p, end
	JG   try1

	// 32-bit load; the upper half of x is zeroed.
	MOVL  (p), x
	ADDQ  $4, p
	IMULQ prime1, x
	XORQ  x, h

	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h

try1:
	// Limit becomes base+len: handle the last 0-3 bytes one at a time.
	ADDQ $4, end
	CMPQ p, end
	JGE  finalize

loop1:
	MOVBQZX (p), x
	ADDQ    $1, p
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	CMPQ p, end
	JL   loop1

finalize:
	// Final mixing:
	//   h ^= h >> 33; h *= prime2
	//   h ^= h >> 29; h *= the constant at ·primes+16(SB)
	//   h ^= h >> 32
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h
	MOVQ  h, x
	SHRQ  $32, x
	XORQ  x, h

	MOVQ h, ret+24(FP)
	RET

// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks loads v1..v4 from the first 32 bytes of *d, runs blockLoop
// over b, stores the updated v1..v4 back into *d, and returns the number of
// bytes consumed (a multiple of 32) in ret+32(FP).
//
// The frame is $0-40: no locals, an 8-byte pointer, a 24-byte slice header
// and the 8-byte result. Only prime1 and prime2 are loaded because round()
// is the only macro used here.
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Load fixed primes needed for round.
	MOVQ ·primes+0(SB), prime1
	MOVQ ·primes+8(SB), prime2

	// Load slice.
	MOVQ b_base+8(FP), p
	MOVQ b_len+16(FP), n
	LEAQ (p)(n*1), end
	SUBQ $32, end

	// Load vN from d.
	// NOTE(review): the first argument is referenced as s+0(FP) although the
	// Go signature above names it d; presumably because d is #defined as AX
	// in this file, so d+0(FP) would be macro-expanded. Confirm before
	// renaming.
	MOVQ s+0(FP), d
	MOVQ 0(d), v1
	MOVQ 8(d), v2
	MOVQ 16(d), v3
	MOVQ 24(d), v4

	// We don't need to check the loop condition here; this function is
	// always called with at least one block of data to process.
	blockLoop()

	// Copy vN back to d.
	MOVQ v1, 0(d)
	MOVQ v2, 8(d)
	MOVQ v3, 16(d)
	MOVQ v4, 24(d)

	// The number of bytes written is p minus the old base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET