// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

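// A note on the tag bytes below: this package's tag constants are
// tagLiteral = 0x00, tagCopy1 = 0x01 and tagCopy2 = 0x02, so pre-computed
// constants such as 0xf4 (61<<2 | tagLiteral) and 0xfe (63<<2 | tagCopy2)
// are tag bytes with the length encoded in their high six bits.
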
// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
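//
// For reference, a pure Go sketch of the tag encoding performed below, in the
// style of encode_other.go (see that file for the canonical version):
//
//	n := uint(len(lit) - 1)
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral // 1 tag byte.
//	case n < 1<<8:
//		dst[0], dst[1] = 60<<2|tagLiteral, uint8(n) // 2 tag bytes.
//	default:
//		dst[0], dst[1], dst[2] = 61<<2|tagLiteral, uint8(n), uint8(n>>8) // 3 tag bytes.
//	}
//	// ...then copy(dst[i:], lit), which the asm below does via runtime·memmove.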
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
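//
// For reference, a pure Go sketch of the encoding performed below, in the
// style of encode_other.go (see that file for the canonical version):
//
//	i := 0
//	for length >= 68 {
//		dst[i] = 63<<2 | tagCopy2 // 0xfe: a length-64 copy.
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		i, length = i+3, length-64
//	}
//	if length > 64 {
//		dst[i] = 59<<2 | tagCopy2 // 0xee: a length-60 copy.
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		i, length = i+3, length-60
//	}
//	if length >= 12 || offset >= 2048 {
//		dst[i] = uint8(length-1)<<2 | tagCopy2 // 3-byte copy.
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		return i + 3
//	}
//	dst[i] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 // 2-byte copy.
//	dst[i+1] = uint8(offset)
//	return i + 2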
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
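//
// For reference, the pure Go equivalent, as in encode_other.go:
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}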
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
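	//
	// In Go terms, this step computes roughly the following (a sketch using
	// today's math/bits; the pure Go code instead compares byte by byte):
	//
	//	ret = j + bits.TrailingZeros64(load64(src, i)^load64(src, j))>>3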
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls, gives 32768 + 56 + 64 = 32888.
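//
// The hash function, inlined below as an IMULL and a SHRL wherever the Go
// code calls hash(..., shift), is:
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}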
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
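	// (inputMargin is a constant defined in encode.go; it is the 15
	// subtracted here.)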
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
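	//
	// For example, if len(lit) is 5, the 16-byte store below writes 11 bytes
	// of garbage past &dst[d+5], but DI advances by only AX = len(lit), so
	// the next tag byte emitted will overwrite that garbage.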
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET