Blame - vendor/github.com/golang/snappy/decode_arm64.s - voltha-go

blob: 7a3ead17eacfe3add2fb2387c40e3682bda4641f [file] [log] [blame]

khenaidoo	d948f77	2021-08-11 17:49:24 -0400	[diff] [blame^]	1	// Copyright 2020 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// +build !appengine
				6	// +build gc
				7	// +build !noasm
				8
				9	#include "textflag.h"
				10
				11	// The asm code generally follows the pure Go code in decode_other.go, except
				12	// where marked with a "!!!".
				13
				14	// func decode(dst, src []byte) int
				15	//
				16	// All local variables fit into registers. The non-zero stack size is only to
				17	// spill registers and push args when issuing a CALL. The register allocation:
				18	// - R2 scratch
				19	// - R3 scratch
				20	// - R4 length or x
				21	// - R5 offset
				22	// - R6 &src[s]
				23	// - R7 &dst[d]
				24	// + R8 dst_base
				25	// + R9 dst_len
				26	// + R10 dst_base + dst_len
				27	// + R11 src_base
				28	// + R12 src_len
				29	// + R13 src_base + src_len
				30	// - R14 used by doCopy
				31	// - R15 used by doCopy
				32	//
				33	// The registers R8-R13 (marked with a "+") are set at the start of the
				34	// function, and after a CALL returns, and are not otherwise modified.
				35	//
				36	// The d variable is implicitly R7 - R8, and len(dst)-d is R10 - R7.
				37	// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6.
				38	TEXT ·decode(SB), NOSPLIT, $56-56
				39	// Initialize R6, R7 and R8-R13.
				40	MOVD dst_base+0(FP), R8
				41	MOVD dst_len+8(FP), R9
				42	MOVD R8, R7
				43	MOVD R8, R10
				44	ADD R9, R10, R10
				45	MOVD src_base+24(FP), R11
				46	MOVD src_len+32(FP), R12
				47	MOVD R11, R6
				48	MOVD R11, R13
				49	ADD R12, R13, R13
				50
				51	loop:
				52	// for s < len(src)
				53	CMP R13, R6
				54	BEQ end
				55
				56	// R4 = uint32(src[s])
				57	//
				58	// switch src[s] & 0x03
				59	MOVBU (R6), R4
				60	MOVW R4, R3
				61	ANDW $3, R3
				62	MOVW $1, R1
				63	CMPW R1, R3
				64	BGE tagCopy
				65
				66	// ----------------------------------------
				67	// The code below handles literal tags.
				68
				69	// case tagLiteral:
				70	// x := uint32(src[s] >> 2)
				71	// switch
				72	MOVW $60, R1
				73	LSRW $2, R4, R4
				74	CMPW R4, R1
				75	BLS tagLit60Plus
				76
				77	// case x < 60:
				78	// s++
				79	ADD $1, R6, R6
				80
				81	doLit:
				82	// This is the end of the inner "switch", when we have a literal tag.
				83	//
				84	// We assume that R4 == x and x fits in a uint32, where x is the variable
				85	// used in the pure Go decode_other.go code.
				86
				87	// length = int(x) + 1
				88	//
				89	// Unlike the pure Go code, we don't need to check if length <= 0 because
				90	// R4 can hold 64 bits, so the increment cannot overflow.
				91	ADD $1, R4, R4
				92
				93	// Prepare to check if copying length bytes will run past the end of dst or
				94	// src.
				95	//
				96	// R2 = len(dst) - d
				97	// R3 = len(src) - s
				98	MOVD R10, R2
				99	SUB R7, R2, R2
				100	MOVD R13, R3
				101	SUB R6, R3, R3
				102
				103	// !!! Try a faster technique for short (16 or fewer bytes) copies.
				104	//
				105	// if length > 16 \|\| len(dst)-d < 16 \|\| len(src)-s < 16 {
				106	// goto callMemmove // Fall back on calling runtime·memmove.
				107	// }
				108	//
				109	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
				110	// against 21 instead of 16, because it cannot assume that all of its input
				111	// is contiguous in memory and so it needs to leave enough source bytes to
				112	// read the next tag without refilling buffers, but Go's Decode assumes
				113	// contiguousness (the src argument is a []byte).
				114	CMP $16, R4
				115	BGT callMemmove
				116	CMP $16, R2
				117	BLT callMemmove
				118	CMP $16, R3
				119	BLT callMemmove
				120
				121	// !!! Implement the copy from src to dst as a 16-byte load and store.
				122	// (Decode's documentation says that dst and src must not overlap.)
				123	//
				124	// This always copies 16 bytes, instead of only length bytes, but that's
				125	// OK. If the input is a valid Snappy encoding then subsequent iterations
				126	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
				127	// non-nil error), so the overrun will be ignored.
				128	//
				129	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
				130	// 16-byte loads and stores. This technique probably wouldn't be as
				131	// effective on architectures that are fussier about alignment.
				132	LDP 0(R6), (R14, R15)
				133	STP (R14, R15), 0(R7)
				134
				135	// d += length
				136	// s += length
				137	ADD R4, R7, R7
				138	ADD R4, R6, R6
				139	B loop
				140
				141	callMemmove:
				142	// if length > len(dst)-d \|\| length > len(src)-s { etc }
				143	CMP R2, R4
				144	BGT errCorrupt
				145	CMP R3, R4
				146	BGT errCorrupt
				147
				148	// copy(dst[d:], src[s:s+length])
				149	//
				150	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
				151	// R7, R6 and R4 as arguments. Coincidentally, we also need to spill those
				152	// three registers to the stack, to save local variables across the CALL.
				153	MOVD R7, 8(RSP)
				154	MOVD R6, 16(RSP)
				155	MOVD R4, 24(RSP)
				156	MOVD R7, 32(RSP)
				157	MOVD R6, 40(RSP)
				158	MOVD R4, 48(RSP)
				159	CALL runtime·memmove(SB)
				160
				161	// Restore local variables: unspill registers from the stack and
				162	// re-calculate R8-R13.
				163	MOVD 32(RSP), R7
				164	MOVD 40(RSP), R6
				165	MOVD 48(RSP), R4
				166	MOVD dst_base+0(FP), R8
				167	MOVD dst_len+8(FP), R9
				168	MOVD R8, R10
				169	ADD R9, R10, R10
				170	MOVD src_base+24(FP), R11
				171	MOVD src_len+32(FP), R12
				172	MOVD R11, R13
				173	ADD R12, R13, R13
				174
				175	// d += length
				176	// s += length
				177	ADD R4, R7, R7
				178	ADD R4, R6, R6
				179	B loop
				180
				181	tagLit60Plus:
				182	// !!! This fragment does the
				183	//
				184	// s += x - 58; if uint(s) > uint(len(src)) { etc }
				185	//
				186	// checks. In the asm version, we code it once instead of once per switch case.
				187	ADD R4, R6, R6
				188	SUB $58, R6, R6
				189	MOVD R6, R3
				190	SUB R11, R3, R3
				191	CMP R12, R3
				192	BGT errCorrupt
				193
				194	// case x == 60:
				195	MOVW $61, R1
				196	CMPW R1, R4
				197	BEQ tagLit61
				198	BGT tagLit62Plus
				199
				200	// x = uint32(src[s-1])
				201	MOVBU -1(R6), R4
				202	B doLit
				203
				204	tagLit61:
				205	// case x == 61:
				206	// x = uint32(src[s-2]) \| uint32(src[s-1])<<8
				207	MOVHU -2(R6), R4
				208	B doLit
				209
				210	tagLit62Plus:
				211	CMPW $62, R4
				212	BHI tagLit63
				213
				214	// case x == 62:
				215	// x = uint32(src[s-3]) \| uint32(src[s-2])<<8 \| uint32(src[s-1])<<16
				216	MOVHU -3(R6), R4
				217	MOVBU -1(R6), R3
				218	ORR R3<<16, R4
				219	B doLit
				220
				221	tagLit63:
				222	// case x == 63:
				223	// x = uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24
				224	MOVWU -4(R6), R4
				225	B doLit
				226
				227	// The code above handles literal tags.
				228	// ----------------------------------------
				229	// The code below handles copy tags.
				230
				231	tagCopy4:
				232	// case tagCopy4:
				233	// s += 5
				234	ADD $5, R6, R6
				235
				236	// if uint(s) > uint(len(src)) { etc }
				237	MOVD R6, R3
				238	SUB R11, R3, R3
				239	CMP R12, R3
				240	BGT errCorrupt
				241
				242	// length = 1 + int(src[s-5])>>2
				243	MOVD $1, R1
				244	ADD R4>>2, R1, R4
				245
				246	// offset = int(uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24)
				247	MOVWU -4(R6), R5
				248	B doCopy
				249
				250	tagCopy2:
				251	// case tagCopy2:
				252	// s += 3
				253	ADD $3, R6, R6
				254
				255	// if uint(s) > uint(len(src)) { etc }
				256	MOVD R6, R3
				257	SUB R11, R3, R3
				258	CMP R12, R3
				259	BGT errCorrupt
				260
				261	// length = 1 + int(src[s-3])>>2
				262	MOVD $1, R1
				263	ADD R4>>2, R1, R4
				264
				265	// offset = int(uint32(src[s-2]) \| uint32(src[s-1])<<8)
				266	MOVHU -2(R6), R5
				267	B doCopy
				268
				269	tagCopy:
				270	// We have a copy tag. We assume that:
				271	// - R3 == src[s] & 0x03
				272	// - R4 == src[s]
				273	CMP $2, R3
				274	BEQ tagCopy2
				275	BGT tagCopy4
				276
				277	// case tagCopy1:
				278	// s += 2
				279	ADD $2, R6, R6
				280
				281	// if uint(s) > uint(len(src)) { etc }
				282	MOVD R6, R3
				283	SUB R11, R3, R3
				284	CMP R12, R3
				285	BGT errCorrupt
				286
				287	// offset = int(uint32(src[s-2])&0xe0<<3 \| uint32(src[s-1]))
				288	MOVD R4, R5
				289	AND $0xe0, R5
				290	MOVBU -1(R6), R3
				291	ORR R5<<3, R3, R5
				292
				293	// length = 4 + int(src[s-2])>>2&0x7
				294	MOVD $7, R1
				295	AND R4>>2, R1, R4
				296	ADD $4, R4, R4
				297
				298	doCopy:
				299	// This is the end of the outer "switch", when we have a copy tag.
				300	//
				301	// We assume that:
				302	// - R4 == length && R4 > 0
				303	// - R5 == offset
				304
				305	// if offset <= 0 { etc }
				306	MOVD $0, R1
				307	CMP R1, R5
				308	BLE errCorrupt
				309
				310	// if d < offset { etc }
				311	MOVD R7, R3
				312	SUB R8, R3, R3
				313	CMP R5, R3
				314	BLT errCorrupt
				315
				316	// if length > len(dst)-d { etc }
				317	MOVD R10, R3
				318	SUB R7, R3, R3
				319	CMP R3, R4
				320	BGT errCorrupt
				321
				322	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
				323	//
				324	// Set:
				325	// - R14 = len(dst)-d
				326	// - R15 = &dst[d-offset]
				327	MOVD R10, R14
				328	SUB R7, R14, R14
				329	MOVD R7, R15
				330	SUB R5, R15, R15
				331
				332	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
				333	//
				334	// First, try using two 8-byte load/stores, similar to the doLit technique
				335	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
				336	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
				337	// and not one 16-byte load/store, and the first store has to be before the
				338	// second load, due to the overlap if offset is in the range [8, 16).
				339	//
				340	// if length > 16 \|\| offset < 8 \|\| len(dst)-d < 16 {
				341	// goto slowForwardCopy
				342	// }
				343	// copy 16 bytes
				344	// d += length
				345	CMP $16, R4
				346	BGT slowForwardCopy
				347	CMP $8, R5
				348	BLT slowForwardCopy
				349	CMP $16, R14
				350	BLT slowForwardCopy
				351	MOVD 0(R15), R2
				352	MOVD R2, 0(R7)
				353	MOVD 8(R15), R3
				354	MOVD R3, 8(R7)
				355	ADD R4, R7, R7
				356	B loop
				357
				358	slowForwardCopy:
				359	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
				360	// can still try 8-byte load stores, provided we can overrun up to 10 extra
				361	// bytes. As above, the overrun will be fixed up by subsequent iterations
				362	// of the outermost loop.
				363	//
				364	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
				365	// commentary says:
				366	//
				367	// ----
				368	//
				369	// The main part of this loop is a simple copy of eight bytes at a time
				370	// until we've copied (at least) the requested amount of bytes. However,
				371	// if d and d-offset are less than eight bytes apart (indicating a
				372	// repeating pattern of length < 8), we first need to expand the pattern in
				373	// order to get the correct results. For instance, if the buffer looks like
				374	// this, with the eight-byte <d-offset> and <d> patterns marked as
				375	// intervals:
				376	//
				377	// abxxxxxxxxxxxx
				378	// [------] d-offset
				379	// [------] d
				380	//
				381	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
				382	// once, after which we can move <d> two bytes without moving <d-offset>:
				383	//
				384	// ababxxxxxxxxxx
				385	// [------] d-offset
				386	// [------] d
				387	//
				388	// and repeat the exercise until the two no longer overlap.
				389	//
				390	// This allows us to do very well in the special case of one single byte
				391	// repeated many times, without taking a big hit for more general cases.
				392	//
				393	// The worst case of extra writing past the end of the match occurs when
				394	// offset == 1 and length == 1; the last copy will read from byte positions
				395	// [0..7] and write to [4..11], whereas it was only supposed to write to
				396	// position 1. Thus, ten excess bytes.
				397	//
				398	// ----
				399	//
				400	// That "10 byte overrun" worst case is confirmed by Go's
				401	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
				402	// and finishSlowForwardCopy algorithm.
				403	//
				404	// if length > len(dst)-d-10 {
				405	// goto verySlowForwardCopy
				406	// }
				407	SUB $10, R14, R14
				408	CMP R14, R4
				409	BGT verySlowForwardCopy
				410
				411	makeOffsetAtLeast8:
				412	// !!! As above, expand the pattern so that offset >= 8 and we can use
				413	// 8-byte load/stores.
				414	//
				415	// for offset < 8 {
				416	// copy 8 bytes from dst[d-offset:] to dst[d:]
				417	// length -= offset
				418	// d += offset
				419	// offset += offset
				420	// // The two previous lines together means that d-offset, and therefore
				421	// // R15, is unchanged.
				422	// }
				423	CMP $8, R5
				424	BGE fixUpSlowForwardCopy
				425	MOVD (R15), R3
				426	MOVD R3, (R7)
				427	SUB R5, R4, R4
				428	ADD R5, R7, R7
				429	ADD R5, R5, R5
				430	B makeOffsetAtLeast8
				431
				432	fixUpSlowForwardCopy:
				433	// !!! Add length (which might be negative now) to d (implied by R7 being
				434	// &dst[d]) so that d ends up at the right place when we jump back to the
				435	// top of the loop. Before we do that, though, we save R7 to R2 so that, if
				436	// length is positive, copying the remaining length bytes will write to the
				437	// right place.
				438	MOVD R7, R2
				439	ADD R4, R7, R7
				440
				441	finishSlowForwardCopy:
				442	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
				443	// length means that we overrun, but as above, that will be fixed up by
				444	// subsequent iterations of the outermost loop.
				445	MOVD $0, R1
				446	CMP R1, R4
				447	BLE loop
				448	MOVD (R15), R3
				449	MOVD R3, (R2)
				450	ADD $8, R15, R15
				451	ADD $8, R2, R2
				452	SUB $8, R4, R4
				453	B finishSlowForwardCopy
				454
				455	verySlowForwardCopy:
				456	// verySlowForwardCopy is a simple implementation of forward copy. In C
				457	// parlance, this is a do/while loop instead of a while loop, since we know
				458	// that length > 0. In Go syntax:
				459	//
				460	// for {
				461	// dst[d] = dst[d - offset]
				462	// d++
				463	// length--
				464	// if length == 0 {
				465	// break
				466	// }
				467	// }
				468	MOVB (R15), R3
				469	MOVB R3, (R7)
				470	ADD $1, R15, R15
				471	ADD $1, R7, R7
				472	SUB $1, R4, R4
				473	CBNZ R4, verySlowForwardCopy
				474	B loop
				475
				476	// The code above handles copy tags.
				477	// ----------------------------------------
				478
				479	end:
				480	// This is the end of the "for s < len(src)".
				481	//
				482	// if d != len(dst) { etc }
				483	CMP R10, R7
				484	BNE errCorrupt
				485
				486	// return 0
				487	MOVD $0, ret+48(FP)
				488	RET
				489
				490	errCorrupt:
				491	// return decodeErrCodeCorrupt
				492	MOVD $1, R2
				493	MOVD R2, ret+48(FP)
				494	RET