// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
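//
// For reference, a rough sketch of the pure Go fallback that this assembly
// mirrors (per emitLiteral in encode_other.go; tagLiteral is the 0x00 tag
// implied by the 0xf0 and 0xf4 immediates below, i.e. 60<<2 and 61<<2):
//
//	i, n := 0, uint(len(lit)-1)
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral // the oneByte case below
//		i = 1
//	case n < 1<<8:
//		dst[0] = 60<<2 | tagLiteral // twoBytes
//		dst[1] = uint8(n)
//		i = 2
//	default:
//		dst[0] = 61<<2 | tagLiteral // threeBytes
//		dst[1] = uint8(n)
//		dst[2] = uint8(n >> 8)
//		i = 3
//	}
//	return i + copy(dst[i:], lit)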
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
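//
// For reference, a rough sketch of the pure Go fallback (per emitCopy in
// encode_other.go; tagCopy1 and tagCopy2 are the 0x01 and 0x02 tags OR'ed
// into the leading byte below):
//
//	i := 0
//	for length >= 68 {
//		dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length 64 copy
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-64
//	}
//	if length > 64 {
//		dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length 60 copy
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		i, length = i+3, length-60
//	}
//	if length >= 12 || offset >= 2048 {
//		dst[i+0] = uint8(length-1)<<2 | tagCopy2 // step3 below
//		dst[i+1] = uint8(offset)
//		dst[i+2] = uint8(offset >> 8)
//		return i + 3
//	}
//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 // step2
//	dst[i+1] = uint8(offset)
//	return i + 2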
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
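//
// For reference, the pure Go fallback is roughly a byte-at-a-time loop (per
// extendMatch in encode_other.go):
//
//	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//	}
//	return j
//
// The cmp8 loop below is the same comparison unrolled to 8 bytes at a time,
// falling back to cmp1 in src's tail.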
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
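	//
	// For example, if the two quadwords first differ at byte index 3, the
	// XOR's lowest set bit falls in bits 24..31, BSFQ returns a value in
	// 24..31, and the SHRQ $3 reduces it to 3, the count of equal bytes.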
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls, gives 32768 + 56 + 64 = 32888.
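//
// For reference, the hash function that the repeated IMULL $0x1e35a7bd /
// SHRL CX sequences below compute is roughly (per hash in encode.go):
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}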
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
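	//
	// For example, at the minimum tableSize of 256 (1<<8), that is
	// 256/8 = 32 MOVOU writes rather than 2048.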
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9
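	// (inputMargin in encode.go is 15, hence the $15 above.)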

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15
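
	// The inner0 loop below mirrors the heuristic match-skipping loop of
	// encode_other.go, roughly:
	//
	//	for {
	//		s = nextS
	//		bytesBetweenHashLookups := skip >> 5
	//		nextS = s + bytesBetweenHashLookups
	//		skip += bytesBetweenHashLookups
	//		if nextS > sLimit {
	//			goto emitRemainder
	//		}
	//		candidate = int(table[nextHash&tableMask])
	//		table[nextHash&tableMask] = uint16(s)
	//		nextHash = hash(load32(src, nextS), shift)
	//		if load32(src, s) == load32(src, candidate) {
	//			break
	//		}
	//	}
	//
	// The asm omits the &tableMask, as the hash's shift already bounds the
	// index to tableSize.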

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.
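	//
	// Roughly, the instructions that follow mirror this encode_other.go
	// sequence:
	//
	//	x := load64(src, s-1)
	//	prevHash := hash(uint32(x>>0), shift)
	//	table[prevHash&tableMask] = uint16(s - 1)
	//	currHash := hash(uint32(x>>8), shift)
	//	candidate = int(table[currHash&tableMask])
	//	table[currHash&tableMask] = uint16(s)
	//	if uint32(x>>8) != load32(src, candidate) {
	//		nextHash = hash(uint32(x>>16), shift)
	//		s++
	//		break
	//	}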

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET