SEBA-949 support for publishing bbsim events on kafka
Change-Id: I4354cd026bbadc801e4d6d08b2f9cd3462917b4c
diff --git a/vendor/github.com/pierrec/lz4/decode_amd64.s b/vendor/github.com/pierrec/lz4/decode_amd64.s
new file mode 100644
index 0000000..20fef39
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/decode_amd64.s
@@ -0,0 +1,375 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// AX scratch
+// BX scratch
+// CX scratch
+// DX token
+//
+// DI &dst
+// SI &src
+// R8 &dst + len(dst)
+// R9 &src + len(src)
+// R11 &dst
+// R12 short output end
+// R13 short input end
+// func decodeBlock(dst, src []byte) int
+// using 50 bytes of stack currently
+TEXT ·decodeBlock(SB), NOSPLIT, $64-56
+ MOVQ dst_base+0(FP), DI
+ MOVQ DI, R11
+ MOVQ dst_len+8(FP), R8
+ ADDQ DI, R8
+
+ MOVQ src_base+24(FP), SI
+ MOVQ src_len+32(FP), R9
+ ADDQ SI, R9
+
+ // shortcut ends
+ // short output end
+ MOVQ R8, R12
+ SUBQ $32, R12
+ // short input end
+ MOVQ R9, R13
+ SUBQ $16, R13
+
+loop:
+ // for si < len(src)
+ CMPQ SI, R9
+ JGE end
+
+ // token := uint32(src[si])
+ MOVBQZX (SI), DX
+ INCQ SI
+
+ // lit_len = token >> 4
+ // if lit_len > 0
+ // CX = lit_len
+ MOVQ DX, CX
+ SHRQ $4, CX
+
+ // if lit_len != 0xF
+ CMPQ CX, $0xF
+ JEQ lit_len_loop_pre
+ CMPQ DI, R12
+ JGE lit_len_loop_pre
+ CMPQ SI, R13
+ JGE lit_len_loop_pre
+
+ // copy shortcut
+
+ // A two-stage shortcut for the most common case:
+ // 1) If the literal length is 0..14, and there is enough space,
+ // enter the shortcut and copy 16 bytes on behalf of the literals
+ // (in the fast mode, only 8 bytes can be safely copied this way).
+ // 2) Further if the match length is 4..18, copy 18 bytes in a similar
+ // manner; but we ensure that there's enough space in the output for
+ // those 18 bytes earlier, upon entering the shortcut (in other words,
+ // there is a combined check for both stages).
+
+ // copy literal
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ ADDQ CX, DI
+ ADDQ CX, SI
+
+ MOVQ DX, CX
+ ANDQ $0xF, CX
+
+ // The second stage: prepare for match copying, decode full info.
+ // If it doesn't work out, the info won't be wasted.
+ // offset := uint16(data[:2])
+ MOVWQZX (SI), DX
+ ADDQ $2, SI
+
+ MOVQ DI, AX
+ SUBQ DX, AX
+ CMPQ AX, DI
+ JGT err_short_buf
+
+ // if we can't do the second stage then jump straight to read the
+ // match length, we already have the offset.
+ CMPQ CX, $0xF
+ JEQ match_len_loop_pre
+ CMPQ DX, $8
+ JLT match_len_loop_pre
+ CMPQ AX, R11
+ JLT err_short_buf
+
+ // memcpy(op + 0, match + 0, 8);
+ MOVQ (AX), BX
+ MOVQ BX, (DI)
+ // memcpy(op + 8, match + 8, 8);
+ MOVQ 8(AX), BX
+ MOVQ BX, 8(DI)
+ // memcpy(op +16, match +16, 2);
+ MOVW 16(AX), BX
+ MOVW BX, 16(DI)
+
+ ADDQ $4, DI // minmatch
+ ADDQ CX, DI
+
+ // shortcut complete, load next token
+ JMP loop
+
+lit_len_loop_pre:
+ // if lit_len > 0
+ CMPQ CX, $0
+ JEQ offset
+ CMPQ CX, $0xF
+ JNE copy_literal
+
+lit_len_loop:
+ // for src[si] == 0xFF
+ CMPB (SI), $0xFF
+ JNE lit_len_finalise
+
+ // bounds check src[si+1]
+ MOVQ SI, AX
+ ADDQ $1, AX
+ CMPQ AX, R9
+ JGT err_short_buf
+
+ // lit_len += 0xFF
+ ADDQ $0xFF, CX
+ INCQ SI
+ JMP lit_len_loop
+
+lit_len_finalise:
+ // lit_len += int(src[si])
+ // si++
+ MOVBQZX (SI), AX
+ ADDQ AX, CX
+ INCQ SI
+
+copy_literal:
+ // bounds check src and dst
+ MOVQ SI, AX
+ ADDQ CX, AX
+ CMPQ AX, R9
+ JGT err_short_buf
+
+ MOVQ DI, AX
+ ADDQ CX, AX
+ CMPQ AX, R8
+ JGT err_short_buf
+
+ // whats a good cut off to call memmove?
+ CMPQ CX, $16
+ JGT memmove_lit
+
+ // if len(dst[di:]) < 16
+ MOVQ R8, AX
+ SUBQ DI, AX
+ CMPQ AX, $16
+ JLT memmove_lit
+
+ // if len(src[si:]) < 16
+ MOVQ R9, AX
+ SUBQ SI, AX
+ CMPQ AX, $16
+ JLT memmove_lit
+
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+
+ JMP finish_lit_copy
+
+memmove_lit:
+ // memmove(to, from, len)
+ MOVQ DI, 0(SP)
+ MOVQ SI, 8(SP)
+ MOVQ CX, 16(SP)
+ // spill
+ MOVQ DI, 24(SP)
+ MOVQ SI, 32(SP)
+ MOVQ CX, 40(SP) // need len to inc SI, DI after
+ MOVB DX, 48(SP)
+ CALL runtime·memmove(SB)
+
+ // restore registers
+ MOVQ 24(SP), DI
+ MOVQ 32(SP), SI
+ MOVQ 40(SP), CX
+ MOVB 48(SP), DX
+
+ // recalc initial values
+ MOVQ dst_base+0(FP), R8
+ MOVQ R8, R11
+ ADDQ dst_len+8(FP), R8
+ MOVQ src_base+24(FP), R9
+ ADDQ src_len+32(FP), R9
+ MOVQ R8, R12
+ SUBQ $32, R12
+ MOVQ R9, R13
+ SUBQ $16, R13
+
+finish_lit_copy:
+ ADDQ CX, SI
+ ADDQ CX, DI
+
+ CMPQ SI, R9
+ JGE end
+
+offset:
+ // CX := mLen
+ // free up DX to use for offset
+ MOVQ DX, CX
+
+ MOVQ SI, AX
+ ADDQ $2, AX
+ CMPQ AX, R9
+ JGT err_short_buf
+
+ // offset
+ // DX := int(src[si]) | int(src[si+1])<<8
+ MOVWQZX (SI), DX
+ ADDQ $2, SI
+
+ // 0 offset is invalid
+ CMPQ DX, $0
+ JEQ err_corrupt
+
+ ANDB $0xF, CX
+
+match_len_loop_pre:
+ // if mlen != 0xF
+ CMPB CX, $0xF
+ JNE copy_match
+
+match_len_loop:
+ // for src[si] == 0xFF
+ // lit_len += 0xFF
+ CMPB (SI), $0xFF
+ JNE match_len_finalise
+
+ // bounds check src[si+1]
+ MOVQ SI, AX
+ ADDQ $1, AX
+ CMPQ AX, R9
+ JGT err_short_buf
+
+ ADDQ $0xFF, CX
+ INCQ SI
+ JMP match_len_loop
+
+match_len_finalise:
+ // lit_len += int(src[si])
+ // si++
+ MOVBQZX (SI), AX
+ ADDQ AX, CX
+ INCQ SI
+
+copy_match:
+ // mLen += minMatch
+ ADDQ $4, CX
+
+ // check we have match_len bytes left in dst
+ // di+match_len < len(dst)
+ MOVQ DI, AX
+ ADDQ CX, AX
+ CMPQ AX, R8
+ JGT err_short_buf
+
+ // DX = offset
+ // CX = match_len
+ // BX = &dst + (di - offset)
+ MOVQ DI, BX
+ SUBQ DX, BX
+
+ // check BX is within dst
+ // if BX < &dst
+ CMPQ BX, R11
+ JLT err_short_buf
+
+ // if offset + match_len < di
+ MOVQ BX, AX
+ ADDQ CX, AX
+ CMPQ DI, AX
+ JGT copy_interior_match
+
+ // AX := len(dst[:di])
+ // MOVQ DI, AX
+ // SUBQ R11, AX
+
+ // copy 16 bytes at a time
+ // if di-offset < 16 copy 16-(di-offset) bytes to di
+ // then do the remaining
+
+copy_match_loop:
+ // for match_len >= 0
+ // dst[di] = dst[i]
+ // di++
+ // i++
+ MOVB (BX), AX
+ MOVB AX, (DI)
+ INCQ DI
+ INCQ BX
+ DECQ CX
+
+ CMPQ CX, $0
+ JGT copy_match_loop
+
+ JMP loop
+
+copy_interior_match:
+ CMPQ CX, $16
+ JGT memmove_match
+
+ // if len(dst[di:]) < 16
+ MOVQ R8, AX
+ SUBQ DI, AX
+ CMPQ AX, $16
+ JLT memmove_match
+
+ MOVOU (BX), X0
+ MOVOU X0, (DI)
+
+ ADDQ CX, DI
+ JMP loop
+
+memmove_match:
+ // memmove(to, from, len)
+ MOVQ DI, 0(SP)
+ MOVQ BX, 8(SP)
+ MOVQ CX, 16(SP)
+ // spill
+ MOVQ DI, 24(SP)
+ MOVQ SI, 32(SP)
+ MOVQ CX, 40(SP) // need len to inc SI, DI after
+ CALL runtime·memmove(SB)
+
+ // restore registers
+ MOVQ 24(SP), DI
+ MOVQ 32(SP), SI
+ MOVQ 40(SP), CX
+
+ // recalc initial values
+ MOVQ dst_base+0(FP), R8
+ MOVQ R8, R11 // TODO: make these sensible numbers
+ ADDQ dst_len+8(FP), R8
+ MOVQ src_base+24(FP), R9
+ ADDQ src_len+32(FP), R9
+ MOVQ R8, R12
+ SUBQ $32, R12
+ MOVQ R9, R13
+ SUBQ $16, R13
+
+ ADDQ CX, DI
+ JMP loop
+
+err_corrupt:
+ MOVQ $-1, ret+48(FP)
+ RET
+
+err_short_buf:
+ MOVQ $-2, ret+48(FP)
+ RET
+
+end:
+ SUBQ R11, DI
+ MOVQ DI, ret+48(FP)
+ RET