// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// AX scratch
// BX scratch
// CX scratch
// DX token
//
// DI &dst (current output position)
// SI &src (current input position)
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst (start of dst, used to compute the return value)
// R12 short output end (R8 - 32)
// R13 short input end (R9 - 16)
// func decodeBlock(dst, src []byte) int
// Uses 49 bytes of the 64-byte frame: 0(SP)-16(SP) hold the memmove
// arguments, 24(SP)-48(SP) spill DI, SI, CX and the token byte.
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13
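	// These margins guard the blind copies in the shortcut below: with
	// di <= len(dst)-32, the 16-byte literal store, an advance of up to
	// 14, and the 18-byte match store all stay within dst; with
	// si <= len(src)-16, the 16-byte literal load, the same advance, and
	// the 2-byte offset load all stay within src.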

loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si])
	MOVBQZX (SI), DX
	INCQ SI
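	// LZ4 token layout: the high nibble is the literal length, the low
	// nibble is the match length minus minMatch (4); a nibble of 0xF
	// means the length continues in the following bytes.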

	// lit_len := token >> 4
	MOVQ DX, CX
	SHRQ $4, CX

	// the shortcut below requires lit_len != 0xF and room for the blind
	// copies in both buffers; otherwise take the generic path
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	// if di >= len(dst)-32
	CMPQ DI, R12
	JGE lit_len_loop_pre
	// if si >= len(src)-16
	CMPQ SI, R13
	JGE lit_len_loop_pre

	// copy shortcut

	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further, if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).
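	//
	// A rough Go sketch of the shortcut (illustrative only; copy16/copy8/
	// copy2 stand for single unaligned load+store pairs, executed in
	// order, so a later load may observe an earlier store):
	//
	//	litLen := int(token >> 4)   // 0..14 on this path
	//	copy16(dst[di:], src[si:])  // blind 16-byte literal copy
	//	di, si = di+litLen, si+litLen
	//	offset := int(src[si]) | int(src[si+1])<<8
	//	si += 2
	//	match := di - offset        // offset >= 8 on this path
	//	copy8(dst[di:], dst[match:])
	//	copy8(dst[di+8:], dst[match+8:])
	//	copy2(dst[di+16:], dst[match+16:])
	//	di += int(token&0xF) + 4    // match_len is 4..18 on this path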

	// copy literal: one blind 16-byte copy covers lit_len <= 14 bytes
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	// CX := match_len nibble (token & 0xF)
	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// AX := &dst[di-offset], the start of the match
	MOVQ DI, AX
	SUBQ DX, AX
	// if the match starts past di (pointer wrapped), fail
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length; we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	// offsets smaller than 8 can't use the blind 8-byte copies
	CMPQ DX, $8
	JLT match_len_loop_pre
	// the match must not start before dst
	CMPQ AX, R11
	JLT err_short_buf

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	// di += match_len (minMatch of 4 plus the token nibble)
	ADDQ $4, DI
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop

lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

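	// Lengths of 15 and over are encoded as a 0xF nibble plus extra
	// bytes: each 0xFF byte adds 255 and the first non-0xFF byte
	// terminates, e.g. lit_len 284 = 15 (nibble) + 255 (0xFF) + 14 (0x0E).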
lit_len_loop:
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	// lit_len += 0xFF
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// lit_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf

	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// what's a good cutoff to call memmove?
	CMPQ CX, $16
	JGT memmove_lit

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// copy the literal with one blind 16-byte load/store
	MOVOU (SI), X0
	MOVOU X0, (DI)

	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVB DX, 48(SP)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	// zero-extend the token: a plain MOVB would keep whatever memmove
	// left in the high bits of DX, corrupting the match length later
	MOVBQZX 48(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	CMPQ SI, R9
	JGE end

offset:
	// CX := token, freeing DX to hold the offset;
	// the match_len nibble is extracted below
	MOVQ DX, CX

	// bounds check the two offset bytes
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// 0 offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	// match_len := token & 0xF
	ANDB $0xF, CX

match_len_loop_pre:
	// if match_len != 0xF
	CMPB CX, $0xF
	JNE copy_match

match_len_loop:
	// for src[si] == 0xFF
	// match_len += 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// match_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI

copy_match:
	// match_len += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if di-offset+match_len < di, the match ends before di, the regions
	// don't overlap, and the faster bulk copy is safe
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

	// AX := len(dst[:di])
	// MOVQ DI, AX
	// SUBQ R11, AX

	// copy 16 bytes at a time
	// if di-offset < 16 copy 16-(di-offset) bytes to di
	// then do the remaining

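	// Overlapping matches (offset < match_len) must re-read bytes just
	// written, e.g. offset 1 repeats one byte match_len times, so they
	// are copied a byte at a time.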
copy_match_loop:
	// for match_len > 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX

	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop

copy_interior_match:
	// long copies go through memmove
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	// short non-overlapping match: one blind 16-byte copy
	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	JMP loop

memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// spill; the token in DX is not needed again before the next load
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop

err_corrupt:
	// zero offset or malformed stream: return -1; the caller is expected
	// to map negative results to errors
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	// not enough room in dst (or truncated src): return -2
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return the number of bytes written, di - &dst
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET
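
// The Go side is assumed to declare a matching stub in a sibling file,
// e.g. (name and build tags mirroring this one):
//
//	//go:noescape
//	func decodeBlock(dst, src []byte) int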