[VOL-5291] - pon & nni stats changes, onu stats from OLT, onu stats from onu
Change-Id: I4f23cb1b1276d27ca6f2c183875b8b227f772edd
Signed-off-by: Akash Reddy Kankanala <akash.kankanala@radisys.com>
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 0000000..dd1a5ae
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,847 @@
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext): each iteration decodes 2 bytes from each of the 4 bit readers; on exit ctx+40 receives the total byte count.
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
+ XORQ DX, DX // DL is the loop-exit flag ("exhausted"); start cleared
+
+ // Preload values from the context struct
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 8(AX), DI // DI = peekBits, the right-shift count used by every peekTopBits below
+ MOVQ 16(AX), SI // SI = output cursor of stream 0; advances by 2 per iteration
+ MOVQ 48(AX), BX // BX = bound the cursor is compared against at the top of each iteration
+ MOVQ 24(AX), R9 // R9 = dstEvery, the byte distance between the outputs of adjacent streams
+ MOVQ 32(AX), R10 // R10 = decode table base (2-byte entries)
+ MOVQ (AX), R11 // R11 = first bit reader; the other three are accessed at +48, +96 and +144
+
+ // Main loop
+main_loop:
+ MOVQ SI, R8 // R8 = write pointer for the stream currently being decoded
+ CMPQ R8, BX
+ SETGE DL // DL = 1 when cursor >= bound (signed); the iteration still completes before DL is tested
+
+ // br0.fillFast32(): refill only when more than 32 bits have been consumed
+ MOVQ 32(R11), R12 // R12 = br0.value
+ MOVBQZX 40(R11), R13 // R13 = br0.bitsRead
+ CMPQ R13, $0x20
+ JBE skip_fill0
+ MOVQ 24(R11), AX // AX = br0.off
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ (R11), R14 // R14 = base address of br0's input bytes
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R14*1), R14 // 4 input bytes at base+off, zero-extended
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 24(R11) // store the decremented off back
+ ORQ R14, R12
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ AX, $0x04
+ SETLT AL // AL is free to clobber: off is already stored and AL is rewritten by the decode below
+ ORB AL, DL
+
+skip_fill0:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX // CL = entry low byte (bits to consume), CH = entry high byte (decoded symbol)
+
+ // br0.advance(uint8(v0.entry))
+ MOVB CH, AL // first output byte
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v1.entry))
+ MOVB CH, AH // second output byte
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8 // advance to stream 1's output position
+
+ // br1.fillFast32() (same sequence as br0, reader fields at R11+48)
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill1
+ MOVQ 72(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 48(R11), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 72(R11)
+ ORQ R14, R12
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8 // advance to stream 2's output position
+
+ // br2.fillFast32() (same sequence as br0, reader fields at R11+96)
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill2
+ MOVQ 120(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 96(R11), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 120(R11)
+ ORQ R14, R12
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill2:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8 // advance to stream 3's output position
+
+ // br3.fillFast32() (same sequence as br0, reader fields at R11+144)
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill3
+ MOVQ 168(R11), AX
+ SUBQ $0x20, R13
+ SUBQ $0x04, AX
+ MOVQ 144(R11), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R14*1), R14
+ MOVQ R13, CX
+ SHLQ CL, R14
+ MOVQ AX, 168(R11)
+ ORQ R14, R12
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ AX, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill3:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R12, R14
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x02, SI // every stream produced two bytes this iteration
+ TESTB DL, DL
+ JZ main_loop // repeat while no exit condition was flagged
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), SI // SI = cursor advance since entry (ctx+16 is never written here)
+ SHLQ $0x02, SI // times 4 streams
+ MOVQ SI, 40(AX) // ctx+40 = total bytes decoded by this call
+ RET
+
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext): each iteration decodes 4 bytes from each of the 4 bit readers; on exit ctx+40 receives the total byte count.
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
+ XORQ DX, DX // DL is the loop-exit flag ("exhausted"); start cleared
+
+ // Preload values from the context struct
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 8(CX), DI // DI = peekBits, the right-shift count used by every peekTopBits below
+ MOVQ 16(CX), BX // BX = output cursor of stream 0; advances by 4 per iteration
+ MOVQ 48(CX), SI // SI = bound the cursor is compared against at the top of each iteration
+ MOVQ 24(CX), R9 // R9 = dstEvery, the byte distance between the outputs of adjacent streams
+ MOVQ 32(CX), R10 // R10 = decode table base (2-byte entries)
+ MOVQ (CX), R11 // R11 = first bit reader; the other three are accessed at +48, +96 and +144
+
+ // Main loop
+main_loop:
+ MOVQ BX, R8 // R8 = write pointer for the stream currently being decoded
+ CMPQ R8, SI
+ SETGE DL // DL = 1 when cursor >= bound (signed); the iteration still completes before DL is tested
+
+ // br0.fillFast32(): refill only when more than 32 bits have been consumed
+ MOVQ 32(R11), R12 // R12 = br0.value
+ MOVBQZX 40(R11), R13 // R13 = br0.bitsRead
+ CMPQ R13, $0x20
+ JBE skip_fill0
+ MOVQ 24(R11), R14 // R14 = br0.off
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ (R11), R15 // R15 = base address of br0's input bytes
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15 // 4 input bytes at base+off, zero-extended
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 24(R11) // store the decremented off back
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br0.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL // AL is scratch here; it is rewritten by the decode below
+ ORB AL, DL
+
+skip_fill0:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX // CL = entry low byte (bits to consume), CH = entry high byte (decoded symbol)
+
+ // br0.advance(uint8(v0.entry))
+ MOVB CH, AL // v0 -> byte 0 of AX
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v1.entry))
+ MOVB CH, AH // v1 -> byte 1 of AX
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX // v0/v1 move to bytes 3/2, freeing AL/AH for v3/v2
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val2&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v2.entry))
+ MOVB CH, AH // v2 -> byte 1 of AX
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val3&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br0.advance(uint8(v3.entry))
+ MOVB CH, AL // v3 -> byte 0 of AX
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX // second swap: bytes 0..3 of AX are now v0, v1, v2, v3
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 32(R11)
+ MOVB R13, 40(R11)
+ ADDQ R9, R8 // advance to stream 1's output position
+
+ // br1.fillFast32() (same sequence as br0, reader fields at R11+48)
+ MOVQ 80(R11), R12
+ MOVBQZX 88(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill1
+ MOVQ 72(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 48(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 72(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br1.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill1:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val2&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v2.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val3&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br1.advance(uint8(v3.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 80(R11)
+ MOVB R13, 88(R11)
+ ADDQ R9, R8 // advance to stream 2's output position
+
+ // br2.fillFast32() (same sequence as br0, reader fields at R11+96)
+ MOVQ 128(R11), R12
+ MOVBQZX 136(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill2
+ MOVQ 120(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 96(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 120(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br2.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill2:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val2&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v2.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val3&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br2.advance(uint8(v3.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 128(R11)
+ MOVB R13, 136(R11)
+ ADDQ R9, R8 // advance to stream 3's output position
+
+ // br3.fillFast32() (same sequence as br0, reader fields at R11+144)
+ MOVQ 176(R11), R12
+ MOVBQZX 184(R11), R13
+ CMPQ R13, $0x20
+ JBE skip_fill3
+ MOVQ 168(R11), R14
+ SUBQ $0x20, R13
+ SUBQ $0x04, R14
+ MOVQ 144(R11), R15
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R14)(R15*1), R15
+ MOVQ R13, CX
+ SHLQ CL, R15
+ MOVQ R14, 168(R11)
+ ORQ R15, R12
+
+ // exhausted = exhausted || (br3.off < 4)
+ CMPQ R14, $0x04
+ SETLT AL
+ ORB AL, DL
+
+skip_fill3:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v0 := table[val0&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v0.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v1 := table[val1&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v2 := table[val2&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v2.entry))
+ MOVB CH, AH
+ SHLQ CL, R12
+ ADDB CL, R13
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R12, R14
+ MOVQ DI, CX
+ SHRQ CL, R14
+
+ // v3 := table[val3&mask]
+ MOVW (R10)(R14*2), CX
+
+ // br3.advance(uint8(v3.entry))
+ MOVB CH, AL
+ SHLQ CL, R12
+ ADDB CL, R13
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+ MOVL AX, (R8)
+
+ // update the bitreader structure
+ MOVQ R12, 176(R11)
+ MOVB R13, 184(R11)
+ ADDQ $0x04, BX // every stream produced four bytes this iteration
+ TESTB DL, DL
+ JZ main_loop // repeat while no exit condition was flagged
+ MOVQ ctx+0(FP), AX // reload ctx: CX was reused as scratch inside the loop
+ SUBQ 16(AX), BX // BX = cursor advance since entry (ctx+16 is never written here)
+ SHLQ $0x02, BX // times 4 streams
+ MOVQ BX, 40(AX) // ctx+40 = total bytes decoded by this call
+ RET
+
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext): decodes 4 bytes per iteration from a single bit reader; ctx+40 receives the byte count, or -1 on the error path.
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX // DX = output write pointer
+ MOVQ 24(CX), BX // BX = value at ctx+24; presumably the output capacity in bytes - confirm against the Go struct
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded // fewer than 4 (unsigned): take the error path immediately
+ LEAQ (DX)(BX*1), BX // BX = DX + BX, the bound checked before every 4-byte store
+ MOVQ (CX), SI // SI = bit reader struct (temporarily)
+ MOVQ (SI), R8 // R8 = base address of the input bytes
+ MOVQ 24(SI), R9 // R9 = input offset; decremented by 4 on each refill
+ MOVQ 32(SI), R10 // R10 = bit buffer
+ MOVBQZX 40(SI), R11 // R11 = count of bits already consumed from the buffer
+ MOVQ 32(CX), SI // SI = decode table base (2-byte entries); replaces the reader pointer
+ MOVBQZX 8(CX), DI // DI = right-shift count used to peek the top bits of R10
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded // NOTE(review): signed >=, so DX+4 == BX (exactly 4 bytes left) is also rejected - confirm intended
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end // refill only once at least 32 bits have been consumed
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12 // next 4 input bytes, zero-extended
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12 // R12 = table index taken from the top bits of the buffer
+ MOVW (SI)(R12*2), CX // CL = entry low byte (bits to consume), CH = entry high byte (decoded symbol)
+ MOVB CH, AL // value 0 -> byte 0 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH // value 1 -> byte 1 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX // values 0/1 move to bytes 3/2, freeing AL/AH for values 3/2
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH // value 2 -> byte 1 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL // value 3 -> byte 0 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX // second swap: bytes 0..3 of AX are now values 0, 1, 2, 3
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop // keep decoding while the input offset is at least 8 (signed)
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX // DX = bytes written since entry
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX // AX = bit reader struct; write its state back
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error: ctx+40 = -1; the bit reader state is not written back on this path
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext): decodes 4 bytes per iteration from a single bit reader; ctx+40 receives the byte count, or -1 on the error path.
+// Requires: BMI2 (SHLXQ/SHRXQ take the shift count from any register instead of CL)
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX // DX = output write pointer
+ MOVQ 24(CX), BX // BX = value at ctx+24; presumably the output capacity in bytes - confirm against the Go struct
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exeeded // fewer than 4 (unsigned): take the error path immediately
+ LEAQ (DX)(BX*1), BX // BX = DX + BX, the bound checked before every 4-byte store
+ MOVQ (CX), SI // SI = bit reader struct (temporarily)
+ MOVQ (SI), R8 // R8 = base address of the input bytes
+ MOVQ 24(SI), R9 // R9 = input offset; decremented by 4 on each refill
+ MOVQ 32(SI), R10 // R10 = bit buffer
+ MOVBQZX 40(SI), R11 // R11 = count of bits already consumed from the buffer
+ MOVQ 32(CX), SI // SI = decode table base (2-byte entries); replaces the reader pointer
+ MOVBQZX 8(CX), DI // DI = right-shift count used to peek the top bits of R10
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exeeded // NOTE(review): signed >=, so DX+4 == BX (exactly 4 bytes left) is also rejected - confirm intended
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end // refill only once at least 32 bits have been consumed
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX // next 4 input bytes, zero-extended
+ SHLXQ R11, CX, CX // CX <<= R11
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX // CX = R10 >> DI: table index taken from the top bits of the buffer
+ MOVW (SI)(CX*2), CX // CL = entry low byte (bits to consume), CH = entry high byte (decoded symbol)
+ MOVB CH, AL // value 0 -> byte 0 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10 // drop the consumed bits from the buffer
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH // value 1 -> byte 1 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX // values 0/1 move to bytes 3/2, freeing AL/AH for values 3/2
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH // value 2 -> byte 1 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL // value 3 -> byte 0 of AX
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX // second swap: bytes 0..3 of AX are now values 0, 1, 2, 3
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop // keep decoding while the input offset is at least 8 (signed)
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX // DX = bytes written since entry
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX // AX = bit reader struct; write its state back
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error: ctx+40 = -1; the bit reader state is not written back on this path
+error_max_decoded_size_exeeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET