// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
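//
// For reference, the Snappy element tag values behind the magic constants
// below are (see snappy.go in this package): tagLiteral = 0x00,
// tagCopy1 = 0x01 and tagCopy2 = 0x02.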

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- R3	len(lit)
//	- R4	n
//	- R6	return value
//	- R8	&dst[i]
//	- R10	&lit[0]
//
// The 32 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
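//
// Paraphrasing the Go code in encode_other.go, the tag bytes written by the
// oneByte/twoBytes/threeBytes blocks below are roughly:
//
//	n := len(lit) - 1
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral
//	case n < 1<<8:
//		dst[0] = 60<<2 | tagLiteral // 0xf0
//		dst[1] = uint8(n)
//	default:
//		dst[0] = 61<<2 | tagLiteral // 0xf4
//		dst[1] = uint8(n)
//		dst[2] = uint8(n >> 8)
//	}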
TEXT ·emitLiteral(SB), NOSPLIT, $32-56
	MOVD dst_base+0(FP), R8
	MOVD lit_base+24(FP), R10
	MOVD lit_len+32(FP), R3
	MOVD R3, R6
	MOVW R3, R4
	SUBW $1, R4, R4

	CMPW $60, R4
	BLT oneByte
	CMPW $256, R4
	BLT twoBytes

threeBytes:
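	// !!! 0xf4 is 61<<2 | tagLiteral, i.e. "two length bytes follow". The MOVW
	// below stores 4 bytes, but only dst[1] and dst[2] carry the length; the
	// two bytes of overrun are harmless because the memmove that follows
	// starts at &dst[3] and overwrites them.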
	MOVD $0xf4, R2
	MOVB R2, 0(R8)
	MOVW R4, 1(R8)
	ADD $3, R8, R8
	ADD $3, R6, R6
	B memmove

twoBytes:
	MOVD $0xf0, R2
	MOVB R2, 0(R8)
	MOVB R4, 1(R8)
	ADD $2, R8, R8
	ADD $2, R6, R6
	B memmove

oneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD $1, R8, R8
	ADD $1, R6, R6

memmove:
	MOVD R6, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- R3	length
//	- R7	&dst[0]
//	- R8	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
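//
// Paraphrasing encode_other.go, the two copy element encodings emitted below
// are roughly:
//
//	// 3 bytes (tagCopy2): 1 <= length <= 64, offset < 65536.
//	dst[0] = uint8(length-1)<<2 | tagCopy2
//	dst[1] = uint8(offset)
//	dst[2] = uint8(offset >> 8)
//
//	// 2 bytes (tagCopy1): 4 <= length <= 11, offset < 2048.
//	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[1] = uint8(offset)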
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVD dst_base+0(FP), R8
	MOVD R8, R7
	MOVD offset+24(FP), R11
	MOVD length+32(FP), R3

loop0:
	// for length >= 68 { etc }
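	//
	// As the comment in encode_other.go explains, the maximum length for a
	// single tagCopy1 or tagCopy2 op is 64 bytes; the loop threshold is a
	// little higher (68) and the length emitted below it a little lower (60),
	// because a length 67 copy is shorter as a length 60 op plus a length 7 op
	// than as a length 64 op plus a length 3 op.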
	CMPW $68, R3
	BLT step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUB $64, R3, R3
	B loop0

step1:
	// if length > 64 { etc }
	CMP $64, R3
	BLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUB $60, R3, R3

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMP $12, R3
	BGE step3
	CMPW $2048, R11
	BGE step3

	// Emit the remaining copy, encoded as 2 bytes.
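	// The tag byte assembled in R11 below is
	// uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1.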
	MOVB R11, 1(R8)
	LSRW $3, R11, R11
	AND $0xe0, R11, R11
	SUB $4, R3, R3
	LSLW $2, R3, R3
	AND $0xff, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD $2, R8, R8

	// Return the number of bytes written.
	SUB R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUB $1, R3, R3
	AND $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8

	// Return the number of bytes written.
	SUB R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- R6	&src[0]
//	- R7	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
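//
// In Go (encode_other.go) this is roughly:
//
//	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//	}
//	return j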
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVD src_base+0(FP), R6
	MOVD src_len+8(FP), R14
	MOVD i+24(FP), R15
	MOVD j+32(FP), R7
	ADD R6, R14, R14
	ADD R6, R15, R15
	ADD R6, R7, R7
	MOVD R14, R13
	SUB $8, R13, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP R13, R7
	BHI cmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP R4, R3
	BNE bsf
	ADD $8, R15, R15
	ADD $8, R7, R7
	B cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
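	//
	// For example, if the first difference is in byte 2 of the two 8-byte
	// words, the XOR result has zeros in bits 0-15 and at least one set bit in
	// bits 16-23, so RBIT+CLZ yields a value in [16, 24) and the >>3 in the
	// ADD below recovers the byte index 2.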
	EOR R3, R4, R4
	RBIT R4, R4
	CLZ R4, R4
	ADD R4>>3, R7, R7

	// Convert from &src[ret] to ret.
	SUB R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMP R7, R14
	BLS extendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP R4, R3
	BNE extendMatchEnd
	ADD $1, R15, R15
	ADD $1, R7, R7
	B cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUB R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- R3	.	.
//	- R4	.	.
//	- R5	64	shift
//	- R6	72	&src[0], tableSize
//	- R7	80	&src[s]
//	- R8	88	&dst[d]
//	- R9	96	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	104	prevHash, currHash, nextHash, offset
//	- R12	112	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	120	candidate
//	- R16	.	hash constant, 0x1e35a7bd
//	- R17	.	&table
//	- .	128	table
//
// The second column (64, 72, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
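//
// Sketching the algorithm (it follows encodeBlock in encode_other.go): scan
// src, hashing 4 bytes at a time into a tableSize-entry table of candidate
// positions; when a 4-byte match is found, emit the pending bytes as a
// literal, extend the match as far as possible, emit it as a copy, and repeat;
// whatever remains at the end is emitted as a final literal (emitRemainder).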
TEXT ·encodeBlock(SB), 0, $32896-56
	MOVD dst_base+0(FP), R8
	MOVD src_base+24(FP), R7
	MOVD src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVD $24, R5
	MOVD $256, R6
	MOVW $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
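	// R16 now holds the hash multiplier 0x1e35a7bd. As in encode_other.go,
	// hash(u, shift) = (u * 0x1e35a7bd) >> shift; below it is computed with a
	// MULW by R16 followed by an LSRW by R5 (shift).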

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	MOVD $16384, R2
	CMP R2, R6
	BGE varTable
	CMP R14, R6
	BGE varTable
	SUB $1, R5, R5
	LSL $1, R6, R6
	B calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each
	// iteration of the memclr loop below writes 64 bytes, so we can do only
	// tableSize/32 iterations instead of the 512 iterations that would
	// zero-initialize all of table's 32768 bytes. This clear could overrun the
	// first tableSize elements, but it won't overrun the allocated stack size.
	ADD $128, RSP, R17
	MOVD R17, R4

	// !!! R6 = &table[tableSize], i.e. the end of the region to zero.
	ADD R6<<1, R17, R6

memclr:
	STP.P (ZR, ZR), 64(R4)
	STP (ZR, ZR), -48(R4)
	STP (ZR, ZR), -32(R4)
	STP (ZR, ZR), -16(R4)
	CMP R4, R6
	BHI memclr

	// !!! R6 = &src[0]
	MOVD R7, R6

	// sLimit := len(src) - inputMargin
	MOVD R14, R9
	SUB $15, R9, R9

	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVD R5, 64(RSP)
	MOVD R6, 72(RSP)
	MOVD R9, 96(RSP)

	// nextEmit := 0
	MOVD R6, R10

	// s := 1
	ADD $1, R7, R7

	// nextHash := hash(load32(src, s), shift)
	MOVW 0(R7), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

outer:
	// for { etc }

	// skip := 32
	MOVD $32, R12

	// nextS := s
	MOVD R7, R13

	// candidate := 0
	MOVD $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVD R13, R7

	// bytesBetweenHashLookups := skip >> 5
	MOVD R12, R14
	LSR $5, R14, R14
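	// (As the comment in encode_other.go explains, skip tracks how many bytes
	// have been scanned since the last match; dividing it by 32, i.e. the >>5
	// above, gives the number of bytes to step ahead, so the scan accelerates
	// over incompressible input.)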

	// nextS = s + bytesBetweenHashLookups
	ADD R14, R13, R13

	// skip += bytesBetweenHashLookups
	ADD R14, R12, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVD R13, R3
	SUB R6, R3, R3
	CMP R9, R3
	BHI emitRemainder

	// candidate = int(table[nextHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[nextHash] = uint16(s)
	MOVD R7, R3
	SUB R6, R3, R3

	MOVH R3, 0(R17)(R11<<1)

	// nextHash = hash(load32(src, nextS), shift)
	MOVW 0(R13), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVW 0(R7), R3
	MOVW (R6)(R15*1), R4
	CMPW R4, R3
	BNE inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVD R7, R3
	SUB R10, R3, R3
	CMP $16, R3
	BLE emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVW R3, R4
	SUBW $1, R4, R4

	MOVW $60, R2
	CMPW R2, R4
	BLT inlineEmitLiteralOneByte
	MOVW $256, R2
	CMPW R2, R4
	BLT inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVD $0xf4, R1
	MOVB R1, 0(R8)
	MOVW R4, 1(R8)
	ADD $3, R8, R8
	B inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVD $0xf0, R1
	MOVB R1, 0(R8)
	MOVB R4, 1(R8)
	ADD $2, R8, R8
	B inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD $1, R8, R8

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADD R3, R8, R8
	MOVD R7, 80(RSP)
	MOVD R8, 88(RSP)
	MOVD R15, 120(RSP)
	CALL runtime·memmove(SB)
	MOVD 64(RSP), R5
	MOVD 72(RSP), R6
	MOVD 80(RSP), R7
	MOVD 88(RSP), R8
	MOVD 96(RSP), R9
	MOVD 120(RSP), R15
	ADD $128, RSP, R17
	MOVW $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
	B inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB R3, R4
	SUBW $1, R4, R4
	AND $0xff, R4, R4
	LSLW $2, R4, R4
	MOVB R4, (R8)
	ADD $1, R8, R8

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	LDP 0(R10), (R0, R1)
	STP (R0, R1), 0(R8)
	ADD R3, R8, R8

inner1:
	// for { etc }

	// base := s
	MOVD R7, R12

	// !!! offset := base - candidate
	MOVD R12, R11
	SUB R15, R11, R11
	SUB R6, R11, R11
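	// (R12 is &src[base] and R15 is the candidate index, so subtracting both
	// R15 and &src[0] leaves base - candidate in R11.)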

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVD src_len+32(FP), R14
	ADD R6, R14, R14

	// !!! R13 = &src[len(src) - 8]
	MOVD R14, R13
	SUB $8, R13, R13

	// !!! R15 = &src[candidate + 4]
	ADD $4, R15, R15
	ADD R6, R15, R15

	// !!! s += 4
	ADD $4, R7, R7

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP R13, R7
	BHI inlineExtendMatchCmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP R4, R3
	BNE inlineExtendMatchBSF
	ADD $8, R15, R15
	ADD $8, R7, R7
	B inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR R3, R4, R4
	RBIT R4, R4
	CLZ R4, R4
	ADD R4>>3, R7, R7
	B inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMP R7, R14
	BLS inlineExtendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP R4, R3
	BNE inlineExtendMatchEnd
	ADD $1, R15, R15
	ADD $1, R7, R7
	B inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVD R7, R3
	SUB R12, R3, R3

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	MOVW $68, R2
	CMPW R2, R3
	BLT inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUBW $64, R3, R3
	B inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	MOVW $64, R2
	CMPW R2, R3
	BLE inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUBW $60, R3, R3

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	MOVW $12, R2
	CMPW R2, R3
	BGE inlineEmitCopyStep3
	MOVW $2048, R2
	CMPW R2, R11
	BGE inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(R8)
	LSRW $8, R11, R11
	LSLW $5, R11, R11
	SUBW $4, R3, R3
	AND $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD $2, R8, R8
	B inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBW $1, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVD R7, R10

	// if s >= sLimit { goto emitRemainder }
	MOVD R7, R3
	SUB R6, R3, R3
	CMP R3, R9
	BLS emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.
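	//
	// (Paraphrasing that comment: we could loop straight back to inner0, but
	// to improve compression we first update the hash table at s-1 and at s,
	// and the three hash inputs at s-1, s and s+1 are cheaper to obtain from
	// one load64 plus shifts than from three separate load32 calls.)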

	// x := load64(src, s-1)
	MOVD -1(R7), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// table[prevHash] = uint16(s-1)
	MOVD R7, R3
	SUB R6, R3, R3
	SUB $1, R3, R3

	MOVHU R3, 0(R17)(R11<<1)

	// currHash := hash(uint32(x>>8), shift)
	LSR $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// candidate = int(table[currHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[currHash] = uint16(s)
	ADD $1, R3, R3
	MOVHU R3, 0(R17)(R11<<1)

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVW (R6)(R15*1), R4
	CMPW R4, R14
	BEQ inner1

	// nextHash = hash(uint32(x>>16), shift)
	LSR $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// s++
	ADD $1, R7, R7

	// break out of the inner1 for loop, i.e. continue the outer loop.
	B outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVD src_len+32(FP), R3
	ADD R6, R3, R3
	CMP R3, R10
	BEQ encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
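	// emitLiteral's frame expects the dst slice header at 8(RSP), 16(RSP) and
	// 24(RSP), the lit slice header at 32(RSP), 40(RSP) and 48(RSP), and it
	// returns its int result at 56(RSP), which is read back below.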
	MOVD R8, 8(RSP)
	MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative.
	MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative.
	MOVD R10, 32(RSP)
	SUB R10, R3, R3
	MOVD R3, 40(RSP)
	MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVD R8, 88(RSP)
	CALL ·emitLiteral(SB)
	MOVD 88(RSP), R8

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVD 56(RSP), R1
	ADD R1, R8, R8

encodeBlockEnd:
	MOVD dst_base+0(FP), R3
	SUB R3, R8, R8
	MOVD R8, d+48(FP)
	RET