blob: e6179f65e3511d6da76e25c749c6d781c5e337a7 [file] [log] [blame]
Matteo Scandolo9a2772a2018-11-19 14:56:26 -08001// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !appengine
6// +build gc
7// +build !noasm
8
9#include "textflag.h"
10
11// The asm code generally follows the pure Go code in decode_other.go, except
12// where marked with a "!!!".
13
14// func decode(dst, src []byte) int
15//
16// All local variables fit into registers. The non-zero stack size is only to
17// spill registers and push args when issuing a CALL. The register allocation:
18// - AX scratch
19// - BX scratch
20// - CX length or x
21// - DX offset
22// - SI &src[s]
23// - DI &dst[d]
24// + R8 dst_base
25// + R9 dst_len
26// + R10 dst_base + dst_len
27// + R11 src_base
28// + R12 src_len
29// + R13 src_base + src_len
30// - R14 used by doCopy
31// - R15 used by doCopy
32//
33// The registers R8-R13 (marked with a "+") are set at the start of the
34// function, and after a CALL returns, and are not otherwise modified.
35//
36// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
37// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
38TEXT ·decode(SB), NOSPLIT, $48-56
39 // Initialize SI, DI and R8-R13.
40 MOVQ dst_base+0(FP), R8
41 MOVQ dst_len+8(FP), R9
42 MOVQ R8, DI
43 MOVQ R8, R10
44 ADDQ R9, R10
45 MOVQ src_base+24(FP), R11
46 MOVQ src_len+32(FP), R12
47 MOVQ R11, SI
48 MOVQ R11, R13
49 ADDQ R12, R13
50
51loop:
52 // for s < len(src)
53 CMPQ SI, R13
54 JEQ end
55
56 // CX = uint32(src[s])
57 //
58 // switch src[s] & 0x03
59 MOVBLZX (SI), CX
60 MOVL CX, BX
61 ANDL $3, BX
62 CMPL BX, $1
63 JAE tagCopy
64
65 // ----------------------------------------
66 // The code below handles literal tags.
67
68 // case tagLiteral:
69 // x := uint32(src[s] >> 2)
70 // switch
71 SHRL $2, CX
72 CMPL CX, $60
73 JAE tagLit60Plus
74
75 // case x < 60:
76 // s++
77 INCQ SI
78
79doLit:
80 // This is the end of the inner "switch", when we have a literal tag.
81 //
82 // We assume that CX == x and x fits in a uint32, where x is the variable
83 // used in the pure Go decode_other.go code.
84
85 // length = int(x) + 1
86 //
87 // Unlike the pure Go code, we don't need to check if length <= 0 because
88 // CX can hold 64 bits, so the increment cannot overflow.
89 INCQ CX
90
91 // Prepare to check if copying length bytes will run past the end of dst or
92 // src.
93 //
94 // AX = len(dst) - d
95 // BX = len(src) - s
96 MOVQ R10, AX
97 SUBQ DI, AX
98 MOVQ R13, BX
99 SUBQ SI, BX
100
101 // !!! Try a faster technique for short (16 or fewer bytes) copies.
102 //
103 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
104 // goto callMemmove // Fall back on calling runtime·memmove.
105 // }
106 //
107 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
108 // against 21 instead of 16, because it cannot assume that all of its input
109 // is contiguous in memory and so it needs to leave enough source bytes to
110 // read the next tag without refilling buffers, but Go's Decode assumes
111 // contiguousness (the src argument is a []byte).
112 CMPQ CX, $16
113 JGT callMemmove
114 CMPQ AX, $16
115 JLT callMemmove
116 CMPQ BX, $16
117 JLT callMemmove
118
119 // !!! Implement the copy from src to dst as a 16-byte load and store.
120 // (Decode's documentation says that dst and src must not overlap.)
121 //
122 // This always copies 16 bytes, instead of only length bytes, but that's
123 // OK. If the input is a valid Snappy encoding then subsequent iterations
124 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
125 // non-nil error), so the overrun will be ignored.
126 //
127 // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
128 // 16-byte loads and stores. This technique probably wouldn't be as
129 // effective on architectures that are fussier about alignment.
130 MOVOU 0(SI), X0
131 MOVOU X0, 0(DI)
132
133 // d += length
134 // s += length
135 ADDQ CX, DI
136 ADDQ CX, SI
137 JMP loop
138
139callMemmove:
140 // if length > len(dst)-d || length > len(src)-s { etc }
141 CMPQ CX, AX
142 JGT errCorrupt
143 CMPQ CX, BX
144 JGT errCorrupt
145
146 // copy(dst[d:], src[s:s+length])
147 //
148 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
149 // DI, SI and CX as arguments. Coincidentally, we also need to spill those
150 // three registers to the stack, to save local variables across the CALL.
151 MOVQ DI, 0(SP)
152 MOVQ SI, 8(SP)
153 MOVQ CX, 16(SP)
154 MOVQ DI, 24(SP)
155 MOVQ SI, 32(SP)
156 MOVQ CX, 40(SP)
157 CALL runtime·memmove(SB)
158
159 // Restore local variables: unspill registers from the stack and
160 // re-calculate R8-R13.
161 MOVQ 24(SP), DI
162 MOVQ 32(SP), SI
163 MOVQ 40(SP), CX
164 MOVQ dst_base+0(FP), R8
165 MOVQ dst_len+8(FP), R9
166 MOVQ R8, R10
167 ADDQ R9, R10
168 MOVQ src_base+24(FP), R11
169 MOVQ src_len+32(FP), R12
170 MOVQ R11, R13
171 ADDQ R12, R13
172
173 // d += length
174 // s += length
175 ADDQ CX, DI
176 ADDQ CX, SI
177 JMP loop
178
179tagLit60Plus:
180 // !!! This fragment does the
181 //
182 // s += x - 58; if uint(s) > uint(len(src)) { etc }
183 //
184 // checks. In the asm version, we code it once instead of once per switch case.
185 ADDQ CX, SI
186 SUBQ $58, SI
187 MOVQ SI, BX
188 SUBQ R11, BX
189 CMPQ BX, R12
190 JA errCorrupt
191
192 // case x == 60:
193 CMPL CX, $61
194 JEQ tagLit61
195 JA tagLit62Plus
196
197 // x = uint32(src[s-1])
198 MOVBLZX -1(SI), CX
199 JMP doLit
200
201tagLit61:
202 // case x == 61:
203 // x = uint32(src[s-2]) | uint32(src[s-1])<<8
204 MOVWLZX -2(SI), CX
205 JMP doLit
206
207tagLit62Plus:
208 CMPL CX, $62
209 JA tagLit63
210
211 // case x == 62:
212 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
213 MOVWLZX -3(SI), CX
214 MOVBLZX -1(SI), BX
215 SHLL $16, BX
216 ORL BX, CX
217 JMP doLit
218
219tagLit63:
220 // case x == 63:
221 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
222 MOVL -4(SI), CX
223 JMP doLit
224
225// The code above handles literal tags.
226// ----------------------------------------
227// The code below handles copy tags.
228
229tagCopy4:
230 // case tagCopy4:
231 // s += 5
232 ADDQ $5, SI
233
234 // if uint(s) > uint(len(src)) { etc }
235 MOVQ SI, BX
236 SUBQ R11, BX
237 CMPQ BX, R12
238 JA errCorrupt
239
240 // length = 1 + int(src[s-5])>>2
241 SHRQ $2, CX
242 INCQ CX
243
244 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
245 MOVLQZX -4(SI), DX
246 JMP doCopy
247
248tagCopy2:
249 // case tagCopy2:
250 // s += 3
251 ADDQ $3, SI
252
253 // if uint(s) > uint(len(src)) { etc }
254 MOVQ SI, BX
255 SUBQ R11, BX
256 CMPQ BX, R12
257 JA errCorrupt
258
259 // length = 1 + int(src[s-3])>>2
260 SHRQ $2, CX
261 INCQ CX
262
263 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
264 MOVWQZX -2(SI), DX
265 JMP doCopy
266
267tagCopy:
268 // We have a copy tag. We assume that:
269 // - BX == src[s] & 0x03
270 // - CX == src[s]
271 CMPQ BX, $2
272 JEQ tagCopy2
273 JA tagCopy4
274
275 // case tagCopy1:
276 // s += 2
277 ADDQ $2, SI
278
279 // if uint(s) > uint(len(src)) { etc }
280 MOVQ SI, BX
281 SUBQ R11, BX
282 CMPQ BX, R12
283 JA errCorrupt
284
285 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
286 MOVQ CX, DX
287 ANDQ $0xe0, DX
288 SHLQ $3, DX
289 MOVBQZX -1(SI), BX
290 ORQ BX, DX
291
292 // length = 4 + int(src[s-2])>>2&0x7
293 SHRQ $2, CX
294 ANDQ $7, CX
295 ADDQ $4, CX
296
297doCopy:
298 // This is the end of the outer "switch", when we have a copy tag.
299 //
300 // We assume that:
301 // - CX == length && CX > 0
302 // - DX == offset
303
304 // if offset <= 0 { etc }
305 CMPQ DX, $0
306 JLE errCorrupt
307
308 // if d < offset { etc }
309 MOVQ DI, BX
310 SUBQ R8, BX
311 CMPQ BX, DX
312 JLT errCorrupt
313
314 // if length > len(dst)-d { etc }
315 MOVQ R10, BX
316 SUBQ DI, BX
317 CMPQ CX, BX
318 JGT errCorrupt
319
320 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
321 //
322 // Set:
323 // - R14 = len(dst)-d
324 // - R15 = &dst[d-offset]
325 MOVQ R10, R14
326 SUBQ DI, R14
327 MOVQ DI, R15
328 SUBQ DX, R15
329
330 // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
331 //
332 // First, try using two 8-byte load/stores, similar to the doLit technique
333 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
334 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
335 // and not one 16-byte load/store, and the first store has to be before the
336 // second load, due to the overlap if offset is in the range [8, 16).
337 //
338 // if length > 16 || offset < 8 || len(dst)-d < 16 {
339 // goto slowForwardCopy
340 // }
341 // copy 16 bytes
342 // d += length
343 CMPQ CX, $16
344 JGT slowForwardCopy
345 CMPQ DX, $8
346 JLT slowForwardCopy
347 CMPQ R14, $16
348 JLT slowForwardCopy
349 MOVQ 0(R15), AX
350 MOVQ AX, 0(DI)
351 MOVQ 8(R15), BX
352 MOVQ BX, 8(DI)
353 ADDQ CX, DI
354 JMP loop
355
356slowForwardCopy:
357 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
358 // can still try 8-byte load stores, provided we can overrun up to 10 extra
359 // bytes. As above, the overrun will be fixed up by subsequent iterations
360 // of the outermost loop.
361 //
362 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
363 // commentary says:
364 //
365 // ----
366 //
367 // The main part of this loop is a simple copy of eight bytes at a time
368 // until we've copied (at least) the requested amount of bytes. However,
369 // if d and d-offset are less than eight bytes apart (indicating a
370 // repeating pattern of length < 8), we first need to expand the pattern in
371 // order to get the correct results. For instance, if the buffer looks like
372 // this, with the eight-byte <d-offset> and <d> patterns marked as
373 // intervals:
374 //
375 // abxxxxxxxxxxxx
376 // [------] d-offset
377 // [------] d
378 //
379 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
380 // once, after which we can move <d> two bytes without moving <d-offset>:
381 //
382 // ababxxxxxxxxxx
383 // [------] d-offset
384 // [------] d
385 //
386 // and repeat the exercise until the two no longer overlap.
387 //
388 // This allows us to do very well in the special case of one single byte
389 // repeated many times, without taking a big hit for more general cases.
390 //
391 // The worst case of extra writing past the end of the match occurs when
392 // offset == 1 and length == 1; the last copy will read from byte positions
393 // [0..7] and write to [4..11], whereas it was only supposed to write to
394 // position 1. Thus, ten excess bytes.
395 //
396 // ----
397 //
398 // That "10 byte overrun" worst case is confirmed by Go's
399 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
400 // and finishSlowForwardCopy algorithm.
401 //
402 // if length > len(dst)-d-10 {
403 // goto verySlowForwardCopy
404 // }
405 SUBQ $10, R14
406 CMPQ CX, R14
407 JGT verySlowForwardCopy
408
409makeOffsetAtLeast8:
410 // !!! As above, expand the pattern so that offset >= 8 and we can use
411 // 8-byte load/stores.
412 //
413 // for offset < 8 {
414 // copy 8 bytes from dst[d-offset:] to dst[d:]
415 // length -= offset
416 // d += offset
417 // offset += offset
418 // // The two previous lines together means that d-offset, and therefore
419 // // R15, is unchanged.
420 // }
421 CMPQ DX, $8
422 JGE fixUpSlowForwardCopy
423 MOVQ (R15), BX
424 MOVQ BX, (DI)
425 SUBQ DX, CX
426 ADDQ DX, DI
427 ADDQ DX, DX
428 JMP makeOffsetAtLeast8
429
430fixUpSlowForwardCopy:
431 // !!! Add length (which might be negative now) to d (implied by DI being
432 // &dst[d]) so that d ends up at the right place when we jump back to the
433 // top of the loop. Before we do that, though, we save DI to AX so that, if
434 // length is positive, copying the remaining length bytes will write to the
435 // right place.
436 MOVQ DI, AX
437 ADDQ CX, DI
438
439finishSlowForwardCopy:
440 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
441 // length means that we overrun, but as above, that will be fixed up by
442 // subsequent iterations of the outermost loop.
443 CMPQ CX, $0
444 JLE loop
445 MOVQ (R15), BX
446 MOVQ BX, (AX)
447 ADDQ $8, R15
448 ADDQ $8, AX
449 SUBQ $8, CX
450 JMP finishSlowForwardCopy
451
452verySlowForwardCopy:
453 // verySlowForwardCopy is a simple implementation of forward copy. In C
454 // parlance, this is a do/while loop instead of a while loop, since we know
455 // that length > 0. In Go syntax:
456 //
457 // for {
458 // dst[d] = dst[d - offset]
459 // d++
460 // length--
461 // if length == 0 {
462 // break
463 // }
464 // }
465 MOVB (R15), BX
466 MOVB BX, (DI)
467 INCQ R15
468 INCQ DI
469 DECQ CX
470 JNZ verySlowForwardCopy
471 JMP loop
472
473// The code above handles copy tags.
474// ----------------------------------------
475
476end:
477 // This is the end of the "for s < len(src)".
478 //
479 // if d != len(dst) { etc }
480 CMPQ DI, R10
481 JNE errCorrupt
482
483 // return 0
484 MOVQ $0, ret+48(FP)
485 RET
486
487errCorrupt:
488 // return decodeErrCodeCorrupt
489 MOVQ $1, ret+48(FP)
490 RET