Blame - vendor/github.com/golang/snappy/decode_amd64.s - voltctl

blob: e6179f65e3511d6da76e25c749c6d781c5e337a7 [file] [log] [blame]

Scott Baker	ed4efab	2020-01-13 19:12:25 -0800	[diff] [blame^]	1	// Copyright 2016 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// +build !appengine
				6	// +build gc
				7	// +build !noasm
				8
				9	#include "textflag.h"
				10
				11	// The asm code generally follows the pure Go code in decode_other.go, except
				12	// where marked with a "!!!".
				13
				14	// func decode(dst, src []byte) int
				15	//
				16	// All local variables fit into registers. The non-zero stack size is only to
				17	// spill registers and push args when issuing a CALL. The register allocation:
				18	// - AX scratch
				19	// - BX scratch
				20	// - CX length or x
				21	// - DX offset
				22	// - SI &src[s]
				23	// - DI &dst[d]
				24	// + R8 dst_base
				25	// + R9 dst_len
				26	// + R10 dst_base + dst_len
				27	// + R11 src_base
				28	// + R12 src_len
				29	// + R13 src_base + src_len
				30	// - R14 used by doCopy
				31	// - R15 used by doCopy
				32	//
				33	// The registers R8-R13 (marked with a "+") are set at the start of the
				34	// function, and after a CALL returns, and are not otherwise modified.
				35	//
				36	// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
				37	// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
				38	TEXT ·decode(SB), NOSPLIT, $48-56
				39	// Initialize SI, DI and R8-R13.
				40	MOVQ dst_base+0(FP), R8
				41	MOVQ dst_len+8(FP), R9
				42	MOVQ R8, DI
				43	MOVQ R8, R10
				44	ADDQ R9, R10
				45	MOVQ src_base+24(FP), R11
				46	MOVQ src_len+32(FP), R12
				47	MOVQ R11, SI
				48	MOVQ R11, R13
				49	ADDQ R12, R13
				50
				51	loop:
				52	// for s < len(src)
				53	CMPQ SI, R13
				54	JEQ end
				55
				56	// CX = uint32(src[s])
				57	//
				58	// switch src[s] & 0x03
				59	MOVBLZX (SI), CX
				60	MOVL CX, BX
				61	ANDL $3, BX
				62	CMPL BX, $1
				63	JAE tagCopy
				64
				65	// ----------------------------------------
				66	// The code below handles literal tags.
				67
				68	// case tagLiteral:
				69	// x := uint32(src[s] >> 2)
				70	// switch
				71	SHRL $2, CX
				72	CMPL CX, $60
				73	JAE tagLit60Plus
				74
				75	// case x < 60:
				76	// s++
				77	INCQ SI
				78
				79	doLit:
				80	// This is the end of the inner "switch", when we have a literal tag.
				81	//
				82	// We assume that CX == x and x fits in a uint32, where x is the variable
				83	// used in the pure Go decode_other.go code.
				84
				85	// length = int(x) + 1
				86	//
				87	// Unlike the pure Go code, we don't need to check if length <= 0 because
				88	// CX can hold 64 bits, so the increment cannot overflow.
				89	INCQ CX
				90
				91	// Prepare to check if copying length bytes will run past the end of dst or
				92	// src.
				93	//
				94	// AX = len(dst) - d
				95	// BX = len(src) - s
				96	MOVQ R10, AX
				97	SUBQ DI, AX
				98	MOVQ R13, BX
				99	SUBQ SI, BX
				100
				101	// !!! Try a faster technique for short (16 or fewer bytes) copies.
				102	//
				103	// if length > 16 \|\| len(dst)-d < 16 \|\| len(src)-s < 16 {
				104	// goto callMemmove // Fall back on calling runtime·memmove.
				105	// }
				106	//
				107	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
				108	// against 21 instead of 16, because it cannot assume that all of its input
				109	// is contiguous in memory and so it needs to leave enough source bytes to
				110	// read the next tag without refilling buffers, but Go's Decode assumes
				111	// contiguousness (the src argument is a []byte).
				112	CMPQ CX, $16
				113	JGT callMemmove
				114	CMPQ AX, $16
				115	JLT callMemmove
				116	CMPQ BX, $16
				117	JLT callMemmove
				118
				119	// !!! Implement the copy from src to dst as a 16-byte load and store.
				120	// (Decode's documentation says that dst and src must not overlap.)
				121	//
				122	// This always copies 16 bytes, instead of only length bytes, but that's
				123	// OK. If the input is a valid Snappy encoding then subsequent iterations
				124	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
				125	// non-nil error), so the overrun will be ignored.
				126	//
				127	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
				128	// 16-byte loads and stores. This technique probably wouldn't be as
				129	// effective on architectures that are fussier about alignment.
				130	MOVOU 0(SI), X0
				131	MOVOU X0, 0(DI)
				132
				133	// d += length
				134	// s += length
				135	ADDQ CX, DI
				136	ADDQ CX, SI
				137	JMP loop
				138
				139	callMemmove:
				140	// if length > len(dst)-d \|\| length > len(src)-s { etc }
				141	CMPQ CX, AX
				142	JGT errCorrupt
				143	CMPQ CX, BX
				144	JGT errCorrupt
				145
				146	// copy(dst[d:], src[s:s+length])
				147	//
				148	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
				149	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
				150	// three registers to the stack, to save local variables across the CALL.
				151	MOVQ DI, 0(SP)
				152	MOVQ SI, 8(SP)
				153	MOVQ CX, 16(SP)
				154	MOVQ DI, 24(SP)
				155	MOVQ SI, 32(SP)
				156	MOVQ CX, 40(SP)
				157	CALL runtime·memmove(SB)
				158
				159	// Restore local variables: unspill registers from the stack and
				160	// re-calculate R8-R13.
				161	MOVQ 24(SP), DI
				162	MOVQ 32(SP), SI
				163	MOVQ 40(SP), CX
				164	MOVQ dst_base+0(FP), R8
				165	MOVQ dst_len+8(FP), R9
				166	MOVQ R8, R10
				167	ADDQ R9, R10
				168	MOVQ src_base+24(FP), R11
				169	MOVQ src_len+32(FP), R12
				170	MOVQ R11, R13
				171	ADDQ R12, R13
				172
				173	// d += length
				174	// s += length
				175	ADDQ CX, DI
				176	ADDQ CX, SI
				177	JMP loop
				178
				179	tagLit60Plus:
				180	// !!! This fragment does the
				181	//
				182	// s += x - 58; if uint(s) > uint(len(src)) { etc }
				183	//
				184	// checks. In the asm version, we code it once instead of once per switch case.
				185	ADDQ CX, SI
				186	SUBQ $58, SI
				187	MOVQ SI, BX
				188	SUBQ R11, BX
				189	CMPQ BX, R12
				190	JA errCorrupt
				191
				192	// case x == 60:
				193	CMPL CX, $61
				194	JEQ tagLit61
				195	JA tagLit62Plus
				196
				197	// x = uint32(src[s-1])
				198	MOVBLZX -1(SI), CX
				199	JMP doLit
				200
				201	tagLit61:
				202	// case x == 61:
				203	// x = uint32(src[s-2]) \| uint32(src[s-1])<<8
				204	MOVWLZX -2(SI), CX
				205	JMP doLit
				206
				207	tagLit62Plus:
				208	CMPL CX, $62
				209	JA tagLit63
				210
				211	// case x == 62:
				212	// x = uint32(src[s-3]) \| uint32(src[s-2])<<8 \| uint32(src[s-1])<<16
				213	MOVWLZX -3(SI), CX
				214	MOVBLZX -1(SI), BX
				215	SHLL $16, BX
				216	ORL BX, CX
				217	JMP doLit
				218
				219	tagLit63:
				220	// case x == 63:
				221	// x = uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24
				222	MOVL -4(SI), CX
				223	JMP doLit
				224
				225	// The code above handles literal tags.
				226	// ----------------------------------------
				227	// The code below handles copy tags.
				228
				229	tagCopy4:
				230	// case tagCopy4:
				231	// s += 5
				232	ADDQ $5, SI
				233
				234	// if uint(s) > uint(len(src)) { etc }
				235	MOVQ SI, BX
				236	SUBQ R11, BX
				237	CMPQ BX, R12
				238	JA errCorrupt
				239
				240	// length = 1 + int(src[s-5])>>2
				241	SHRQ $2, CX
				242	INCQ CX
				243
				244	// offset = int(uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24)
				245	MOVLQZX -4(SI), DX
				246	JMP doCopy
				247
				248	tagCopy2:
				249	// case tagCopy2:
				250	// s += 3
				251	ADDQ $3, SI
				252
				253	// if uint(s) > uint(len(src)) { etc }
				254	MOVQ SI, BX
				255	SUBQ R11, BX
				256	CMPQ BX, R12
				257	JA errCorrupt
				258
				259	// length = 1 + int(src[s-3])>>2
				260	SHRQ $2, CX
				261	INCQ CX
				262
				263	// offset = int(uint32(src[s-2]) \| uint32(src[s-1])<<8)
				264	MOVWQZX -2(SI), DX
				265	JMP doCopy
				266
				267	tagCopy:
				268	// We have a copy tag. We assume that:
				269	// - BX == src[s] & 0x03
				270	// - CX == src[s]
				271	CMPQ BX, $2
				272	JEQ tagCopy2
				273	JA tagCopy4
				274
				275	// case tagCopy1:
				276	// s += 2
				277	ADDQ $2, SI
				278
				279	// if uint(s) > uint(len(src)) { etc }
				280	MOVQ SI, BX
				281	SUBQ R11, BX
				282	CMPQ BX, R12
				283	JA errCorrupt
				284
				285	// offset = int(uint32(src[s-2])&0xe0<<3 \| uint32(src[s-1]))
				286	MOVQ CX, DX
				287	ANDQ $0xe0, DX
				288	SHLQ $3, DX
				289	MOVBQZX -1(SI), BX
				290	ORQ BX, DX
				291
				292	// length = 4 + int(src[s-2])>>2&0x7
				293	SHRQ $2, CX
				294	ANDQ $7, CX
				295	ADDQ $4, CX
				296
				297	doCopy:
				298	// This is the end of the outer "switch", when we have a copy tag.
				299	//
				300	// We assume that:
				301	// - CX == length && CX > 0
				302	// - DX == offset
				303
				304	// if offset <= 0 { etc }
				305	CMPQ DX, $0
				306	JLE errCorrupt
				307
				308	// if d < offset { etc }
				309	MOVQ DI, BX
				310	SUBQ R8, BX
				311	CMPQ BX, DX
				312	JLT errCorrupt
				313
				314	// if length > len(dst)-d { etc }
				315	MOVQ R10, BX
				316	SUBQ DI, BX
				317	CMPQ CX, BX
				318	JGT errCorrupt
				319
				320	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
				321	//
				322	// Set:
				323	// - R14 = len(dst)-d
				324	// - R15 = &dst[d-offset]
				325	MOVQ R10, R14
				326	SUBQ DI, R14
				327	MOVQ DI, R15
				328	SUBQ DX, R15
				329
				330	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
				331	//
				332	// First, try using two 8-byte load/stores, similar to the doLit technique
				333	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
				334	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
				335	// and not one 16-byte load/store, and the first store has to be before the
				336	// second load, due to the overlap if offset is in the range [8, 16).
				337	//
				338	// if length > 16 \|\| offset < 8 \|\| len(dst)-d < 16 {
				339	// goto slowForwardCopy
				340	// }
				341	// copy 16 bytes
				342	// d += length
				343	CMPQ CX, $16
				344	JGT slowForwardCopy
				345	CMPQ DX, $8
				346	JLT slowForwardCopy
				347	CMPQ R14, $16
				348	JLT slowForwardCopy
				349	MOVQ 0(R15), AX
				350	MOVQ AX, 0(DI)
				351	MOVQ 8(R15), BX
				352	MOVQ BX, 8(DI)
				353	ADDQ CX, DI
				354	JMP loop
				355
				356	slowForwardCopy:
				357	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
				358	// can still try 8-byte load stores, provided we can overrun up to 10 extra
				359	// bytes. As above, the overrun will be fixed up by subsequent iterations
				360	// of the outermost loop.
				361	//
				362	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
				363	// commentary says:
				364	//
				365	// ----
				366	//
				367	// The main part of this loop is a simple copy of eight bytes at a time
				368	// until we've copied (at least) the requested amount of bytes. However,
				369	// if d and d-offset are less than eight bytes apart (indicating a
				370	// repeating pattern of length < 8), we first need to expand the pattern in
				371	// order to get the correct results. For instance, if the buffer looks like
				372	// this, with the eight-byte <d-offset> and <d> patterns marked as
				373	// intervals:
				374	//
				375	// abxxxxxxxxxxxx
				376	// [------] d-offset
				377	// [------] d
				378	//
				379	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
				380	// once, after which we can move <d> two bytes without moving <d-offset>:
				381	//
				382	// ababxxxxxxxxxx
				383	// [------] d-offset
				384	// [------] d
				385	//
				386	// and repeat the exercise until the two no longer overlap.
				387	//
				388	// This allows us to do very well in the special case of one single byte
				389	// repeated many times, without taking a big hit for more general cases.
				390	//
				391	// The worst case of extra writing past the end of the match occurs when
				392	// offset == 1 and length == 1; the last copy will read from byte positions
				393	// [0..7] and write to [4..11], whereas it was only supposed to write to
				394	// position 1. Thus, ten excess bytes.
				395	//
				396	// ----
				397	//
				398	// That "10 byte overrun" worst case is confirmed by Go's
				399	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
				400	// and finishSlowForwardCopy algorithm.
				401	//
				402	// if length > len(dst)-d-10 {
				403	// goto verySlowForwardCopy
				404	// }
				405	SUBQ $10, R14
				406	CMPQ CX, R14
				407	JGT verySlowForwardCopy
				408
				409	makeOffsetAtLeast8:
				410	// !!! As above, expand the pattern so that offset >= 8 and we can use
				411	// 8-byte load/stores.
				412	//
				413	// for offset < 8 {
				414	// copy 8 bytes from dst[d-offset:] to dst[d:]
				415	// length -= offset
				416	// d += offset
				417	// offset += offset
				418	// // The two previous lines together means that d-offset, and therefore
				419	// // R15, is unchanged.
				420	// }
				421	CMPQ DX, $8
				422	JGE fixUpSlowForwardCopy
				423	MOVQ (R15), BX
				424	MOVQ BX, (DI)
				425	SUBQ DX, CX
				426	ADDQ DX, DI
				427	ADDQ DX, DX
				428	JMP makeOffsetAtLeast8
				429
				430	fixUpSlowForwardCopy:
				431	// !!! Add length (which might be negative now) to d (implied by DI being
				432	// &dst[d]) so that d ends up at the right place when we jump back to the
				433	// top of the loop. Before we do that, though, we save DI to AX so that, if
				434	// length is positive, copying the remaining length bytes will write to the
				435	// right place.
				436	MOVQ DI, AX
				437	ADDQ CX, DI
				438
				439	finishSlowForwardCopy:
				440	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
				441	// length means that we overrun, but as above, that will be fixed up by
				442	// subsequent iterations of the outermost loop.
				443	CMPQ CX, $0
				444	JLE loop
				445	MOVQ (R15), BX
				446	MOVQ BX, (AX)
				447	ADDQ $8, R15
				448	ADDQ $8, AX
				449	SUBQ $8, CX
				450	JMP finishSlowForwardCopy
				451
				452	verySlowForwardCopy:
				453	// verySlowForwardCopy is a simple implementation of forward copy. In C
				454	// parlance, this is a do/while loop instead of a while loop, since we know
				455	// that length > 0. In Go syntax:
				456	//
				457	// for {
				458	// dst[d] = dst[d - offset]
				459	// d++
				460	// length--
				461	// if length == 0 {
				462	// break
				463	// }
				464	// }
				465	MOVB (R15), BX
				466	MOVB BX, (DI)
				467	INCQ R15
				468	INCQ DI
				469	DECQ CX
				470	JNZ verySlowForwardCopy
				471	JMP loop
				472
				473	// The code above handles copy tags.
				474	// ----------------------------------------
				475
				476	end:
				477	// This is the end of the "for s < len(src)".
				478	//
				479	// if d != len(dst) { etc }
				480	CMPQ DI, R10
				481	JNE errCorrupt
				482
				483	// return 0
				484	MOVQ $0, ret+48(FP)
				485	RET
				486
				487	errCorrupt:
				488	// return decodeErrCodeCorrupt
				489	MOVQ $1, ret+48(FP)
				490	RET