Blame - vendor/github.com/klauspost/compress/snappy/decode_amd64.s - voltctl

blob: 1c66e37234dc73a4d1fb07ea9974acd8fc734326 [file] [log] [blame]

Scott Baker	ed4efab	2020-01-13 19:12:25 -0800	[diff] [blame^]	1	// Copyright 2016 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// +build !appengine
				6	// +build gc
				7	// +build !noasm
				8
				9	#include "textflag.h"
				10
				11	// The asm code generally follows the pure Go code in decode_other.go, except
				12	// where marked with a "!!!".
				13
				14	// func decode(dst, src []byte) int
				15	//
				16	// All local variables fit into registers. The non-zero stack size is only to
				17	// spill registers and push args when issuing a CALL. The register allocation:
				18	// - AX scratch
				19	// - BX scratch
				20	// - CX length or x
				21	// - DX offset
				22	// - SI &src[s]
				23	// - DI &dst[d]
				24	// + R8 dst_base
				25	// + R9 dst_len
				26	// + R10 dst_base + dst_len
				27	// + R11 src_base
				28	// + R12 src_len
				29	// + R13 src_base + src_len
				30	// - R14 used by doCopy
				31	// - R15 used by doCopy
				32	//
				33	// The registers R8-R13 (marked with a "+") are set at the start of the
				34	// function, and after a CALL returns, and are not otherwise modified.
				35	//
				36	// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
				37	// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
				38	TEXT ·decode(SB), NOSPLIT, $48-56
				39	// Initialize SI, DI and R8-R13.
				40	MOVQ dst_base+0(FP), R8
				41	MOVQ dst_len+8(FP), R9
				42	MOVQ R8, DI
				43	MOVQ R8, R10
				44	ADDQ R9, R10
				45	MOVQ src_base+24(FP), R11
				46	MOVQ src_len+32(FP), R12
				47	MOVQ R11, SI
				48	MOVQ R11, R13
				49	ADDQ R12, R13
				50
				51	loop:
				52	// for s < len(src)
				53	CMPQ SI, R13
				54	JEQ end
				55
				56	// CX = uint32(src[s])
				57	//
				58	// switch src[s] & 0x03
				59	MOVBLZX (SI), CX
				60	MOVL CX, BX
				61	ANDL $3, BX
				62	CMPL BX, $1
				63	JAE tagCopy
				64
				65	// ----------------------------------------
				66	// The code below handles literal tags.
				67
				68	// case tagLiteral:
				69	// x := uint32(src[s] >> 2)
				70	// switch
				71	SHRL $2, CX
				72	CMPL CX, $60
				73	JAE tagLit60Plus
				74
				75	// case x < 60:
				76	// s++
				77	INCQ SI
				78
				79	doLit:
				80	// This is the end of the inner "switch", when we have a literal tag.
				81	//
				82	// We assume that CX == x and x fits in a uint32, where x is the variable
				83	// used in the pure Go decode_other.go code.
				84
				85	// length = int(x) + 1
				86	//
				87	// Unlike the pure Go code, we don't need to check if length <= 0 because
				88	// CX can hold 64 bits, so the increment cannot overflow.
				89	INCQ CX
				90
				91	// Prepare to check if copying length bytes will run past the end of dst or
				92	// src.
				93	//
				94	// AX = len(dst) - d
				95	// BX = len(src) - s
				96	MOVQ R10, AX
				97	SUBQ DI, AX
				98	MOVQ R13, BX
				99	SUBQ SI, BX
				100
				101	// !!! Try a faster technique for short (16 or fewer bytes) copies.
				102	//
				103	// if length > 16 \|\| len(dst)-d < 16 \|\| len(src)-s < 16 {
				104	// goto callMemmove // Fall back on calling runtime·memmove.
				105	// }
				106	//
				107	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
				108	// against 21 instead of 16, because it cannot assume that all of its input
				109	// is contiguous in memory and so it needs to leave enough source bytes to
				110	// read the next tag without refilling buffers, but Go's Decode assumes
				111	// contiguousness (the src argument is a []byte).
				112	CMPQ CX, $16
				113	JGT callMemmove
				114	CMPQ AX, $16
				115	JLT callMemmove
				116	CMPQ BX, $16
				117	JLT callMemmove
				118
				119	// !!! Implement the copy from src to dst as a 16-byte load and store.
				120	// (Decode's documentation says that dst and src must not overlap.)
				121	//
				122	// This always copies 16 bytes, instead of only length bytes, but that's
				123	// OK. If the input is a valid Snappy encoding then subsequent iterations
				124	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
				125	// non-nil error), so the overrun will be ignored.
				126	//
				127	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
				128	// 16-byte loads and stores. This technique probably wouldn't be as
				129	// effective on architectures that are fussier about alignment.
				130	MOVOU 0(SI), X0
				131	MOVOU X0, 0(DI)
				132
				133	// d += length
				134	// s += length
				135	ADDQ CX, DI
				136	ADDQ CX, SI
				137	JMP loop
				138
				139	callMemmove:
				140	// if length > len(dst)-d \|\| length > len(src)-s { etc }
				141	CMPQ CX, AX
				142	JGT errCorrupt
				143	CMPQ CX, BX
				144	JGT errCorrupt
				145
				146	// copy(dst[d:], src[s:s+length])
				147	//
				148	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
				149	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
				150	// three registers to the stack, to save local variables across the CALL.
				151	MOVQ DI, 0(SP)
				152	MOVQ SI, 8(SP)
				153	MOVQ CX, 16(SP)
				154	MOVQ DI, 24(SP)
				155	MOVQ SI, 32(SP)
				156	MOVQ CX, 40(SP)
				157	CALL runtime·memmove(SB)
				158
				159	// Restore local variables: unspill registers from the stack and
				160	// re-calculate R8-R13.
				161	MOVQ 24(SP), DI
				162	MOVQ 32(SP), SI
				163	MOVQ 40(SP), CX
				164	MOVQ dst_base+0(FP), R8
				165	MOVQ dst_len+8(FP), R9
				166	MOVQ R8, R10
				167	ADDQ R9, R10
				168	MOVQ src_base+24(FP), R11
				169	MOVQ src_len+32(FP), R12
				170	MOVQ R11, R13
				171	ADDQ R12, R13
				172
				173	// d += length
				174	// s += length
				175	ADDQ CX, DI
				176	ADDQ CX, SI
				177	JMP loop
				178
				179	tagLit60Plus:
				180	// !!! This fragment does the
				181	//
				182	// s += x - 58; if uint(s) > uint(len(src)) { etc }
				183	//
				184	// checks. In the asm version, we code it once instead of once per switch case.
				185	ADDQ CX, SI
				186	SUBQ $58, SI
				187	CMPQ SI, R13
				188	JA errCorrupt
				189
				190	// case x == 60:
				191	CMPL CX, $61
				192	JEQ tagLit61
				193	JA tagLit62Plus
				194
				195	// x = uint32(src[s-1])
				196	MOVBLZX -1(SI), CX
				197	JMP doLit
				198
				199	tagLit61:
				200	// case x == 61:
				201	// x = uint32(src[s-2]) \| uint32(src[s-1])<<8
				202	MOVWLZX -2(SI), CX
				203	JMP doLit
				204
				205	tagLit62Plus:
				206	CMPL CX, $62
				207	JA tagLit63
				208
				209	// case x == 62:
				210	// x = uint32(src[s-3]) \| uint32(src[s-2])<<8 \| uint32(src[s-1])<<16
				211	MOVWLZX -3(SI), CX
				212	MOVBLZX -1(SI), BX
				213	SHLL $16, BX
				214	ORL BX, CX
				215	JMP doLit
				216
				217	tagLit63:
				218	// case x == 63:
				219	// x = uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24
				220	MOVL -4(SI), CX
				221	JMP doLit
				222
				223	// The code above handles literal tags.
				224	// ----------------------------------------
				225	// The code below handles copy tags.
				226
				227	tagCopy4:
				228	// case tagCopy4:
				229	// s += 5
				230	ADDQ $5, SI
				231
				232	// if uint(s) > uint(len(src)) { etc }
				233	CMPQ SI, R13
				234	JA errCorrupt
				235
				236	// length = 1 + int(src[s-5])>>2
				237	SHRQ $2, CX
				238	INCQ CX
				239
				240	// offset = int(uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24)
				241	MOVLQZX -4(SI), DX
				242	JMP doCopy
				243
				244	tagCopy2:
				245	// case tagCopy2:
				246	// s += 3
				247	ADDQ $3, SI
				248
				249	// if uint(s) > uint(len(src)) { etc }
				250	CMPQ SI, R13
				251	JA errCorrupt
				252
				253	// length = 1 + int(src[s-3])>>2
				254	SHRQ $2, CX
				255	INCQ CX
				256
				257	// offset = int(uint32(src[s-2]) \| uint32(src[s-1])<<8)
				258	MOVWQZX -2(SI), DX
				259	JMP doCopy
				260
				261	tagCopy:
				262	// We have a copy tag. We assume that:
				263	// - BX == src[s] & 0x03
				264	// - CX == src[s]
				265	CMPQ BX, $2
				266	JEQ tagCopy2
				267	JA tagCopy4
				268
				269	// case tagCopy1:
				270	// s += 2
				271	ADDQ $2, SI
				272
				273	// if uint(s) > uint(len(src)) { etc }
				274	CMPQ SI, R13
				275	JA errCorrupt
				276
				277	// offset = int(uint32(src[s-2])&0xe0<<3 \| uint32(src[s-1]))
				278	MOVQ CX, DX
				279	ANDQ $0xe0, DX
				280	SHLQ $3, DX
				281	MOVBQZX -1(SI), BX
				282	ORQ BX, DX
				283
				284	// length = 4 + int(src[s-2])>>2&0x7
				285	SHRQ $2, CX
				286	ANDQ $7, CX
				287	ADDQ $4, CX
				288
				289	doCopy:
				290	// This is the end of the outer "switch", when we have a copy tag.
				291	//
				292	// We assume that:
				293	// - CX == length && CX > 0
				294	// - DX == offset
				295
				296	// if offset <= 0 { etc }
				297	CMPQ DX, $0
				298	JLE errCorrupt
				299
				300	// if d < offset { etc }
				301	MOVQ DI, BX
				302	SUBQ R8, BX
				303	CMPQ BX, DX
				304	JLT errCorrupt
				305
				306	// if length > len(dst)-d { etc }
				307	MOVQ R10, BX
				308	SUBQ DI, BX
				309	CMPQ CX, BX
				310	JGT errCorrupt
				311
				312	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
				313	//
				314	// Set:
				315	// - R14 = len(dst)-d
				316	// - R15 = &dst[d-offset]
				317	MOVQ R10, R14
				318	SUBQ DI, R14
				319	MOVQ DI, R15
				320	SUBQ DX, R15
				321
				322	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
				323	//
				324	// First, try using two 8-byte load/stores, similar to the doLit technique
				325	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
				326	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
				327	// and not one 16-byte load/store, and the first store has to be before the
				328	// second load, due to the overlap if offset is in the range [8, 16).
				329	//
				330	// if length > 16 \|\| offset < 8 \|\| len(dst)-d < 16 {
				331	// goto slowForwardCopy
				332	// }
				333	// copy 16 bytes
				334	// d += length
				335	CMPQ CX, $16
				336	JGT slowForwardCopy
				337	CMPQ DX, $8
				338	JLT slowForwardCopy
				339	CMPQ R14, $16
				340	JLT slowForwardCopy
				341	MOVQ 0(R15), AX
				342	MOVQ AX, 0(DI)
				343	MOVQ 8(R15), BX
				344	MOVQ BX, 8(DI)
				345	ADDQ CX, DI
				346	JMP loop
				347
				348	slowForwardCopy:
				349	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
				350	// can still try 8-byte load stores, provided we can overrun up to 10 extra
				351	// bytes. As above, the overrun will be fixed up by subsequent iterations
				352	// of the outermost loop.
				353	//
				354	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
				355	// commentary says:
				356	//
				357	// ----
				358	//
				359	// The main part of this loop is a simple copy of eight bytes at a time
				360	// until we've copied (at least) the requested amount of bytes. However,
				361	// if d and d-offset are less than eight bytes apart (indicating a
				362	// repeating pattern of length < 8), we first need to expand the pattern in
				363	// order to get the correct results. For instance, if the buffer looks like
				364	// this, with the eight-byte <d-offset> and <d> patterns marked as
				365	// intervals:
				366	//
				367	// abxxxxxxxxxxxx
				368	// [------] d-offset
				369	// [------] d
				370	//
				371	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
				372	// once, after which we can move <d> two bytes without moving <d-offset>:
				373	//
				374	// ababxxxxxxxxxx
				375	// [------] d-offset
				376	// [------] d
				377	//
				378	// and repeat the exercise until the two no longer overlap.
				379	//
				380	// This allows us to do very well in the special case of one single byte
				381	// repeated many times, without taking a big hit for more general cases.
				382	//
				383	// The worst case of extra writing past the end of the match occurs when
				384	// offset == 1 and length == 1; the last copy will read from byte positions
				385	// [0..7] and write to [4..11], whereas it was only supposed to write to
				386	// position 1. Thus, ten excess bytes.
				387	//
				388	// ----
				389	//
				390	// That "10 byte overrun" worst case is confirmed by Go's
				391	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
				392	// and finishSlowForwardCopy algorithm.
				393	//
				394	// if length > len(dst)-d-10 {
				395	// goto verySlowForwardCopy
				396	// }
				397	SUBQ $10, R14
				398	CMPQ CX, R14
				399	JGT verySlowForwardCopy
				400
				401	makeOffsetAtLeast8:
				402	// !!! As above, expand the pattern so that offset >= 8 and we can use
				403	// 8-byte load/stores.
				404	//
				405	// for offset < 8 {
				406	// copy 8 bytes from dst[d-offset:] to dst[d:]
				407	// length -= offset
				408	// d += offset
				409	// offset += offset
				410	// // The two previous lines together means that d-offset, and therefore
				411	// // R15, is unchanged.
				412	// }
				413	CMPQ DX, $8
				414	JGE fixUpSlowForwardCopy
				415	MOVQ (R15), BX
				416	MOVQ BX, (DI)
				417	SUBQ DX, CX
				418	ADDQ DX, DI
				419	ADDQ DX, DX
				420	JMP makeOffsetAtLeast8
				421
				422	fixUpSlowForwardCopy:
				423	// !!! Add length (which might be negative now) to d (implied by DI being
				424	// &dst[d]) so that d ends up at the right place when we jump back to the
				425	// top of the loop. Before we do that, though, we save DI to AX so that, if
				426	// length is positive, copying the remaining length bytes will write to the
				427	// right place.
				428	MOVQ DI, AX
				429	ADDQ CX, DI
				430
				431	finishSlowForwardCopy:
				432	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
				433	// length means that we overrun, but as above, that will be fixed up by
				434	// subsequent iterations of the outermost loop.
				435	CMPQ CX, $0
				436	JLE loop
				437	MOVQ (R15), BX
				438	MOVQ BX, (AX)
				439	ADDQ $8, R15
				440	ADDQ $8, AX
				441	SUBQ $8, CX
				442	JMP finishSlowForwardCopy
				443
				444	verySlowForwardCopy:
				445	// verySlowForwardCopy is a simple implementation of forward copy. In C
				446	// parlance, this is a do/while loop instead of a while loop, since we know
				447	// that length > 0. In Go syntax:
				448	//
				449	// for {
				450	// dst[d] = dst[d - offset]
				451	// d++
				452	// length--
				453	// if length == 0 {
				454	// break
				455	// }
				456	// }
				457	MOVB (R15), BX
				458	MOVB BX, (DI)
				459	INCQ R15
				460	INCQ DI
				461	DECQ CX
				462	JNZ verySlowForwardCopy
				463	JMP loop
				464
				465	// The code above handles copy tags.
				466	// ----------------------------------------
				467
				468	end:
				469	// This is the end of the "for s < len(src)".
				470	//
				471	// if d != len(dst) { etc }
				472	CMPQ DI, R10
				473	JNE errCorrupt
				474
				475	// return 0
				476	MOVQ $0, ret+48(FP)
				477	RET
				478
				479	errCorrupt:
				480	// return decodeErrCodeCorrupt
				481	MOVQ $1, ret+48(FP)
				482	RET