blob: 20fef39759cb6548cd3921d18efc7dfce7b28b08 [file] [log] [blame]
Scott Baker611f6bd2019-10-18 13:45:19 -07001// +build !appengine
2// +build gc
3// +build !noasm
4
5#include "textflag.h"
6
// func decodeBlock(dst, src []byte) int
//
// Decodes the compressed block in src into dst. The block is a series of
// sequences: a token byte (high nibble = literal length, low nibble =
// match length - 4), optional length extension bytes (each 0xFF adds 255
// and means "another byte follows"), the literal bytes, a 2-byte
// little-endian match offset and optional match length extension bytes.
// This appears to be the LZ4 block layout.
//
// Returns:
//   >= 0  number of bytes written to dst
//   -1    a match offset of zero was found (corrupt input)
//   -2    a bounds check on src or dst failed
//
// Register roles:
//   AX  scratch
//   BX  scratch / match source pointer
//   CX  scratch / literal length, then match length
//   DX  token, then match offset
//
//   DI  write cursor into dst
//   SI  read cursor into src
//   R8  &dst + len(dst)
//   R9  &src + len(src)
//   R11 &dst
//   R12 short output end (R8 - 32)
//   R13 short input end  (R9 - 16)
//
// Stack usage (56 of the 64 frame bytes):
//   0..23(SP)  arguments for runtime·memmove
//   24..55(SP) DI, SI, CX and DX spilled across the call
//
// Pointer comparisons are deliberately signed: with a nil/empty dst,
// R12 = R8 - 32 is negative and the signed compare keeps the decoder out
// of the 16/18-byte shortcut copies.
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si])
	MOVBQZX (SI), DX
	INCQ SI

	// lit_len = token >> 4
	// CX = lit_len
	MOVQ DX, CX
	SHRQ $4, CX

	// The shortcut is only taken if lit_len != 0xF and both cursors are
	// far enough from the end of their buffers for the wide copies below.
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	CMPQ DI, R12
	JGE lit_len_loop_pre
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut

	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).

	// copy literal: always move 16 bytes, advance by the real length
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX = token & 0xF (match length - minmatch)
	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(data[:2])
	// SI < R13 and lit_len <= 14 on entry guarantee these two bytes are
	// inside src.
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid (same rule as the slow path at "offset:")
	CMPQ DX, $0
	JEQ err_corrupt

	// AX = match source = di - offset; it must not lie after di
	MOVQ DI, AX
	SUBQ DX, AX
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length, we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	// offsets below 8 would make the 8-byte copies overlap incorrectly
	CMPQ DX, $8
	JLT match_len_loop_pre
	// match source must be inside dst
	CMPQ AX, R11
	JLT err_short_buf

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	ADDQ $4, DI // minmatch
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

lit_len_loop:
	// bounds check src[si] before reading the extension byte
	CMPQ SI, R9
	JGE err_short_buf

	// lit_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	INCQ SI
	ADDQ AX, CX

	// a 0xFF byte means another extension byte follows
	CMPQ AX, $0xFF
	JEQ lit_len_loop

copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf

	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// whats a good cut off to call memmove?
	CMPQ CX, $16
	JGT memmove_lit

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// both buffers have 16 bytes of slack: one wide copy covers lit_len
	MOVOU (SI), X0
	MOVOU X0, (DI)

	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVQ DX, 48(SP) // full-width: the whole token register must survive the call
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ 48(SP), DX

	// recalc initial values (the call may clobber R8..R13)
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	// a block may end right after its last literals
	CMPQ SI, R9
	JGE end

offset:
	// CX := mLen
	// free up DX to use for offset
	MOVQ DX, CX
	ANDQ $0xF, CX

	// bounds check the two offset bytes
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

match_len_loop_pre:
	// if mlen != 0xF
	CMPQ CX, $0xF
	JNE copy_match

match_len_loop:
	// bounds check src[si] before reading the extension byte
	CMPQ SI, R9
	JGE err_short_buf

	// match_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	INCQ SI
	ADDQ AX, CX

	// a 0xFF byte means another extension byte follows
	CMPQ AX, $0xFF
	JEQ match_len_loop

copy_match:
	// mLen += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if the match source ends before di the regions do not overlap
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

	// Overlapping match: bytes written by this copy are read again later
	// in the same copy, so it has to go one byte at a time.

copy_match_loop:
	// for match_len > 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX

	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop

copy_interior_match:
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	// 16 bytes of slack in dst: one wide copy covers match_len
	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values (the call may clobber R8..R13)
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop

err_corrupt:
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return di (bytes written)
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET
375 RET