// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
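//
// For reference, the Snappy element tag values behind the magic constants
// below are (see snappy.go in this package): tagLiteral = 0x00,
// tagCopy1 = 0x01 and tagCopy2 = 0x02.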

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- R3	len(lit)
//	- R4	n
//	- R6	return value
//	- R8	&dst[i]
//	- R10	&lit[0]
//
// The 32 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
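//
// Paraphrasing the Go code in encode_other.go, the tag bytes written by the
// oneByte/twoBytes/threeBytes blocks below are roughly:
//
//	n := len(lit) - 1
//	switch {
//	case n < 60:
//		dst[0] = uint8(n)<<2 | tagLiteral
//	case n < 1<<8:
//		dst[0] = 60<<2 | tagLiteral // 0xf0
//		dst[1] = uint8(n)
//	default:
//		dst[0] = 61<<2 | tagLiteral // 0xf4
//		dst[1] = uint8(n)
//		dst[2] = uint8(n >> 8)
//	}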
TEXT ·emitLiteral(SB), NOSPLIT, $32-56
	MOVD dst_base+0(FP), R8
	MOVD lit_base+24(FP), R10
	MOVD lit_len+32(FP), R3
	MOVD R3, R6
	MOVW R3, R4
	SUBW $1, R4, R4

	CMPW $60, R4
	BLT oneByte
	CMPW $256, R4
	BLT twoBytes

threeBytes:
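	// !!! 0xf4 is 61<<2 | tagLiteral, i.e. "two length bytes follow". The MOVW
	// below stores 4 bytes, but only dst[1] and dst[2] carry the length; the
	// two bytes of overrun are harmless because the memmove that follows
	// starts at &dst[3] and overwrites them.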
	MOVD $0xf4, R2
	MOVB R2, 0(R8)
	MOVW R4, 1(R8)
	ADD $3, R8, R8
	ADD $3, R6, R6
	B memmove

twoBytes:
	MOVD $0xf0, R2
	MOVB R2, 0(R8)
	MOVB R4, 1(R8)
	ADD $2, R8, R8
	ADD $2, R6, R6
	B memmove

oneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD $1, R8, R8
	ADD $1, R6, R6

memmove:
	MOVD R6, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- R3	length
//	- R7	&dst[0]
//	- R8	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
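//
// Paraphrasing encode_other.go, the two copy element encodings emitted below
// are roughly:
//
//	// 3 bytes (tagCopy2): 1 <= length <= 64, offset < 65536.
//	dst[0] = uint8(length-1)<<2 | tagCopy2
//	dst[1] = uint8(offset)
//	dst[2] = uint8(offset >> 8)
//
//	// 2 bytes (tagCopy1): 4 <= length <= 11, offset < 2048.
//	dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[1] = uint8(offset)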
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVD dst_base+0(FP), R8
	MOVD R8, R7
	MOVD offset+24(FP), R11
	MOVD length+32(FP), R3

loop0:
	// for length >= 68 { etc }
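	//
	// As the comment in encode_other.go explains, the maximum length for a
	// single tagCopy1 or tagCopy2 op is 64 bytes; the loop threshold is a
	// little higher (68) and the length emitted below it a little lower (60),
	// because a length 67 copy is shorter as a length 60 op plus a length 7 op
	// than as a length 64 op plus a length 3 op.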
	CMPW $68, R3
	BLT step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUB $64, R3, R3
	B loop0

step1:
	// if length > 64 { etc }
	CMP $64, R3
	BLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUB $60, R3, R3

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMP $12, R3
	BGE step3
	CMPW $2048, R11
	BGE step3

	// Emit the remaining copy, encoded as 2 bytes.
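	// The tag byte assembled in R11 below is
	// uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1.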
	MOVB R11, 1(R8)
	LSRW $3, R11, R11
	AND $0xe0, R11, R11
	SUB $4, R3, R3
	LSLW $2, R3, R3
	AND $0xff, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD $2, R8, R8

	// Return the number of bytes written.
	SUB R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUB $1, R3, R3
	AND $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8

	// Return the number of bytes written.
	SUB R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- R6	&src[0]
//	- R7	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
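//
// In Go (encode_other.go) this is roughly:
//
//	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//	}
//	return j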
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVD src_base+0(FP), R6
	MOVD src_len+8(FP), R14
	MOVD i+24(FP), R15
	MOVD j+32(FP), R7
	ADD R6, R14, R14
	ADD R6, R15, R15
	ADD R6, R7, R7
	MOVD R14, R13
	SUB $8, R13, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP R13, R7
	BHI cmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP R4, R3
	BNE bsf
	ADD $8, R15, R15
	ADD $8, R7, R7
	B cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
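	//
	// For example, if the first difference is in byte 2 of the two 8-byte
	// words, the XOR result has zeros in bits 0-15 and at least one set bit in
	// bits 16-23, so RBIT+CLZ yields a value in [16, 24) and the >>3 in the
	// ADD below recovers the byte index 2.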
	EOR R3, R4, R4
	RBIT R4, R4
	CLZ R4, R4
	ADD R4>>3, R7, R7

	// Convert from &src[ret] to ret.
	SUB R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMP R7, R14
	BLS extendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP R4, R3
	BNE extendMatchEnd
	ADD $1, R15, R15
	ADD $1, R7, R7
	B cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUB R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- R3	.	.
//	- R4	.	.
//	- R5	64	shift
//	- R6	72	&src[0], tableSize
//	- R7	80	&src[s]
//	- R8	88	&dst[d]
//	- R9	96	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	104	prevHash, currHash, nextHash, offset
//	- R12	112	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	120	candidate
//	- R16	.	hash constant, 0x1e35a7bd
//	- R17	.	&table
//	- .	128	table
//
// The second column (64, 72, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
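//
// Sketching the algorithm (it follows encodeBlock in encode_other.go): scan
// src, hashing 4 bytes at a time into a tableSize-entry table of candidate
// positions; when a 4-byte match is found, emit the pending bytes as a
// literal, extend the match as far as possible, emit it as a copy, and repeat;
// whatever remains at the end is emitted as a final literal (emitRemainder).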
TEXT ·encodeBlock(SB), 0, $32896-56
	MOVD dst_base+0(FP), R8
	MOVD src_base+24(FP), R7
	MOVD src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVD $24, R5
	MOVD $256, R6
	MOVW $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
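	// R16 now holds the hash multiplier 0x1e35a7bd. As in encode_other.go,
	// hash(u, shift) = (u * 0x1e35a7bd) >> shift; below it is computed with a
	// MULW by R16 followed by an LSRW by R5 (shift).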

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	MOVD $16384, R2
	CMP R2, R6
	BGE varTable
	CMP R14, R6
	BGE varTable
	SUB $1, R5, R5
	LSL $1, R6, R6
	B calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each
	// iteration of the memclr loop below writes 64 bytes, so we can do only
	// tableSize/32 iterations instead of the 512 iterations that would
	// zero-initialize all of table's 32768 bytes. This clear could overrun the
	// first tableSize elements, but it won't overrun the allocated stack size.
	ADD $128, RSP, R17
	MOVD R17, R4

	// !!! R6 = &table[tableSize], i.e. the end of the region to zero.
	ADD R6<<1, R17, R6

memclr:
	STP.P (ZR, ZR), 64(R4)
	STP (ZR, ZR), -48(R4)
	STP (ZR, ZR), -32(R4)
	STP (ZR, ZR), -16(R4)
	CMP R4, R6
	BHI memclr

	// !!! R6 = &src[0]
	MOVD R7, R6

	// sLimit := len(src) - inputMargin
	MOVD R14, R9
	SUB $15, R9, R9

	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVD R5, 64(RSP)
	MOVD R6, 72(RSP)
	MOVD R9, 96(RSP)

	// nextEmit := 0
	MOVD R6, R10

	// s := 1
	ADD $1, R7, R7

	// nextHash := hash(load32(src, s), shift)
	MOVW 0(R7), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

outer:
	// for { etc }

	// skip := 32
	MOVD $32, R12

	// nextS := s
	MOVD R7, R13

	// candidate := 0
	MOVD $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVD R13, R7

	// bytesBetweenHashLookups := skip >> 5
	MOVD R12, R14
	LSR $5, R14, R14
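	// (As the comment in encode_other.go explains, skip tracks how many bytes
	// have been scanned since the last match; dividing it by 32, i.e. the >>5
	// above, gives the number of bytes to step ahead, so the scan accelerates
	// over incompressible input.)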

	// nextS = s + bytesBetweenHashLookups
	ADD R14, R13, R13

	// skip += bytesBetweenHashLookups
	ADD R14, R12, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVD R13, R3
	SUB R6, R3, R3
	CMP R9, R3
	BHI emitRemainder

	// candidate = int(table[nextHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[nextHash] = uint16(s)
	MOVD R7, R3
	SUB R6, R3, R3

	MOVH R3, 0(R17)(R11<<1)

	// nextHash = hash(load32(src, nextS), shift)
	MOVW 0(R13), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVW 0(R7), R3
	MOVW (R6)(R15*1), R4
	CMPW R4, R3
	BNE inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVD R7, R3
	SUB R10, R3, R3
	CMP $16, R3
	BLE emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVW R3, R4
	SUBW $1, R4, R4

	MOVW $60, R2
	CMPW R2, R4
	BLT inlineEmitLiteralOneByte
	MOVW $256, R2
	CMPW R2, R4
	BLT inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVD $0xf4, R1
	MOVB R1, 0(R8)
	MOVW R4, 1(R8)
	ADD $3, R8, R8
	B inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVD $0xf0, R1
	MOVB R1, 0(R8)
	MOVB R4, 1(R8)
	ADD $2, R8, R8
	B inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD $1, R8, R8

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADD R3, R8, R8
	MOVD R7, 80(RSP)
	MOVD R8, 88(RSP)
	MOVD R15, 120(RSP)
	CALL runtime·memmove(SB)
	MOVD 64(RSP), R5
	MOVD 72(RSP), R6
	MOVD 80(RSP), R7
	MOVD 88(RSP), R8
	MOVD 96(RSP), R9
	MOVD 120(RSP), R15
	ADD $128, RSP, R17
	MOVW $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
	B inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB R3, R4
	SUBW $1, R4, R4
	AND $0xff, R4, R4
	LSLW $2, R4, R4
	MOVB R4, (R8)
	ADD $1, R8, R8

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	LDP 0(R10), (R0, R1)
	STP (R0, R1), 0(R8)
	ADD R3, R8, R8

inner1:
	// for { etc }

	// base := s
	MOVD R7, R12

	// !!! offset := base - candidate
	MOVD R12, R11
	SUB R15, R11, R11
	SUB R6, R11, R11
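	// (R12 is &src[base] and R15 is the candidate index, so subtracting both
	// R15 and &src[0] leaves base - candidate in R11.)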

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVD src_len+32(FP), R14
	ADD R6, R14, R14

	// !!! R13 = &src[len(src) - 8]
	MOVD R14, R13
	SUB $8, R13, R13

	// !!! R15 = &src[candidate + 4]
	ADD $4, R15, R15
	ADD R6, R15, R15

	// !!! s += 4
	ADD $4, R7, R7

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP R13, R7
	BHI inlineExtendMatchCmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP R4, R3
	BNE inlineExtendMatchBSF
	ADD $8, R15, R15
	ADD $8, R7, R7
	B inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR R3, R4, R4
	RBIT R4, R4
	CLZ R4, R4
	ADD R4>>3, R7, R7
	B inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMP R7, R14
	BLS inlineExtendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP R4, R3
	BNE inlineExtendMatchEnd
	ADD $1, R15, R15
	ADD $1, R7, R7
	B inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVD R7, R3
	SUB R12, R3, R3

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	MOVW $68, R2
	CMPW R2, R3
	BLT inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUBW $64, R3, R3
	B inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	MOVW $64, R2
	CMPW R2, R3
	BLE inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8
	SUBW $60, R3, R3

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	MOVW $12, R2
	CMPW R2, R3
	BGE inlineEmitCopyStep3
	MOVW $2048, R2
	CMPW R2, R11
	BGE inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(R8)
	LSRW $8, R11, R11
	LSLW $5, R11, R11
	SUBW $4, R3, R3
	AND $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD $2, R8, R8
	B inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBW $1, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD $3, R8, R8

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVD R7, R10

	// if s >= sLimit { goto emitRemainder }
	MOVD R7, R3
	SUB R6, R3, R3
	CMP R3, R9
	BLS emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.
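	//
	// (Paraphrasing that comment: we could loop straight back to inner0, but
	// to improve compression we first update the hash table at s-1 and at s,
	// and the three hash inputs at s-1, s and s+1 are cheaper to obtain from
	// one load64 plus shifts than from three separate load32 calls.)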

	// x := load64(src, s-1)
	MOVD -1(R7), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// table[prevHash] = uint16(s-1)
	MOVD R7, R3
	SUB R6, R3, R3
	SUB $1, R3, R3

	MOVHU R3, 0(R17)(R11<<1)

	// currHash := hash(uint32(x>>8), shift)
	LSR $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// candidate = int(table[currHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[currHash] = uint16(s)
	ADD $1, R3, R3
	MOVHU R3, 0(R17)(R11<<1)

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVW (R6)(R15*1), R4
	CMPW R4, R14
	BEQ inner1

	// nextHash = hash(uint32(x>>16), shift)
	LSR $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// s++
	ADD $1, R7, R7

	// break out of the inner1 for loop, i.e. continue the outer loop.
	B outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVD src_len+32(FP), R3
	ADD R6, R3, R3
	CMP R3, R10
	BEQ encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
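	// emitLiteral's frame expects the dst slice header at 8(RSP), 16(RSP) and
	// 24(RSP), the lit slice header at 32(RSP), 40(RSP) and 48(RSP), and it
	// returns its int result at 56(RSP), which is read back below.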
	MOVD R8, 8(RSP)
	MOVD $0, 16(RSP) // Unnecessary, as the callee ignores it, but conservative.
	MOVD $0, 24(RSP) // Unnecessary, as the callee ignores it, but conservative.
	MOVD R10, 32(RSP)
	SUB R10, R3, R3
	MOVD R3, 40(RSP)
	MOVD R3, 48(RSP) // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVD R8, 88(RSP)
	CALL ·emitLiteral(SB)
	MOVD 88(RSP), R8

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVD 56(RSP), R1
	ADD R1, R8, R8

encodeBlockEnd:
	MOVD dst_base+0(FP), R3
	SUB R3, R8, R8
	MOVD R8, d+48(FP)
	RET