[VOL-5292] Implementation for fetching the GEM port history Data from the ONT

Change-Id: I4cf22555cbd13bcd5e49e620c8aa8b67cbd2891c
Signed-off-by: Akash Reddy Kankanala <akash.kankanala@radisys.com>
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e8..504a7be 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -8,118 +8,13 @@
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )
 
 // bitReader reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
-type bitReader struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
-	b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
 type bitReaderBytes struct {
 	in       []byte
 	off      uint // next byte to read is at in[off - 1]
@@ -213,10 +108,17 @@
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderBytes) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderBytes) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
@@ -313,15 +215,17 @@
 	}
 }
 
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+func (b *bitReaderShifted) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderShifted) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index 6bce4e8..ec71f7a 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -5,8 +5,6 @@
 
 package huff0
 
-import "fmt"
-
 // bitWriter will write bits.
 // First bit will be LSB of the first byte of output.
 type bitWriter struct {
@@ -23,14 +21,6 @@
 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
 	0xFFFF, 0xFFFF} /* up to 16 bits */
 
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
-	b.nBits += bits
-}
-
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@
 	b.nBits += encA.nBits + encB.nBits
 }
 
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
-	if bits == 0 {
-		return
-	}
-	value <<= (16 - bits) & 15
-	value >>= (16 - bits) & 15
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
-	v := b.nBits >> 3
-	switch v {
-	case 0:
-		return
-	case 1:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-		)
-		b.bitContainer >>= 1 << 3
-	case 2:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-		)
-		b.bitContainer >>= 2 << 3
-	case 3:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-		)
-		b.bitContainer >>= 3 << 3
-	case 4:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-		)
-		b.bitContainer >>= 4 << 3
-	case 5:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-		)
-		b.bitContainer >>= 5 << 3
-	case 6:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-		)
-		b.bitContainer >>= 6 << 3
-	case 7:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-		)
-		b.bitContainer >>= 7 << 3
-	case 8:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-			byte(b.bitContainer>>56),
-		)
-		b.bitContainer = 0
-		b.nBits = 0
-		return
-	default:
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
-	}
-	b.nBits &= 7
-}
-
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
@@ -201,10 +93,3 @@
 	b.flushAlign()
 	return nil
 }
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
-	b.bitContainer = 0
-	b.nBits = 0
-	b.out = out
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
index 50bcdf6..4dcab8d 100644
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@@ -20,11 +20,6 @@
 	b.off = 0
 }
 
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
-	b.off += int(n)
-}
-
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
 	v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@
 	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
 }
 
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
-	return b.b[b.off:]
-}
-
 // remain will return the number of bytes remaining.
 func (b byteReader) remain() int {
 	return len(b.b) - b.off
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 0823c92..4d14542 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@
 
 import (
 	"fmt"
+	"math"
 	"runtime"
 	"sync"
 )
@@ -161,6 +162,70 @@
 	return s.Out, false, nil
 }
 
+// EstimateSizes will estimate the data sizes
+func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
+	s, err = s.prepare(in)
+	if err != nil {
+		return 0, 0, 0, err
+	}
+
+	// Create histogram, if none was provided.
+	tableSz, dataSz, reuseSz = -1, -1, -1
+	maxCount := s.maxCount
+	var canReuse = false
+	if maxCount == 0 {
+		maxCount, canReuse = s.countSimple(in)
+	} else {
+		canReuse = s.canUseTable(s.prevTable)
+	}
+
+	// We want the output size to be less than this:
+	wantSize := len(in)
+	if s.WantLogLess > 0 {
+		wantSize -= wantSize >> s.WantLogLess
+	}
+
+	// Reset for next run.
+	s.clearCount = true
+	s.maxCount = 0
+	if maxCount >= len(in) {
+		if maxCount > len(in) {
+			return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
+		}
+		if len(in) == 1 {
+			return 0, 0, 0, ErrIncompressible
+		}
+		// One symbol, use RLE
+		return 0, 0, 0, ErrUseRLE
+	}
+	if maxCount == 1 || maxCount < (len(in)>>7) {
+		// Each symbol present maximum once or too well distributed.
+		return 0, 0, 0, ErrIncompressible
+	}
+
+	// Calculate new table.
+	err = s.buildCTable()
+	if err != nil {
+		return 0, 0, 0, err
+	}
+
+	if false && !s.canUseTable(s.cTable) {
+		panic("invalid table generated")
+	}
+
+	tableSz, err = s.cTable.estTableSize(s)
+	if err != nil {
+		return 0, 0, 0, err
+	}
+	if canReuse {
+		reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
+	}
+	dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
+
+	// Restore
+	return tableSz, dataSz, reuseSz, nil
+}
+
 func (s *Scratch) compress1X(src []byte) ([]byte, error) {
 	return s.compress1xDo(s.Out, src)
 }
@@ -225,6 +290,10 @@
 		if err != nil {
 			return nil, err
 		}
+		if len(s.Out)-idx > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -268,6 +337,10 @@
 			return nil, errs[i]
 		}
 		o := s.tmpOut[i]
+		if len(o) > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -331,6 +404,7 @@
 	return true
 }
 
+//lint:ignore U1000 used for debugging
 func (s *Scratch) validateTable(c cTable) bool {
 	if len(c) < int(s.symbolLen) {
 		return false
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 41703bb..c0c48bd 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,13 +4,13 @@
 	"errors"
 	"fmt"
 	"io"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
 
 type dTable struct {
 	single []dEntrySingle
-	double []dEntryDouble
 }
 
 // single-symbols decoding
@@ -18,13 +18,6 @@
 	entry uint16
 }
 
-// double-symbols decoding
-type dEntryDouble struct {
-	seq   uint16
-	nBits uint8
-	len   uint8
-}
-
 // Uses special code for all tables that are < 8 bits.
 const use8BitTables = true
 
@@ -34,7 +27,7 @@
 // If no Scratch is provided a new one is allocated.
 // The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
-	s, err = s.prepare(in)
+	s, err = s.prepare(nil)
 	if err != nil {
 		return s, nil, err
 	}
@@ -216,6 +209,7 @@
 	return &Decoder{
 		dt:             s.dt,
 		actualTableLog: s.actualTableLog,
+		bufs:           &s.decPool,
 	}
 }
 
@@ -223,103 +217,15 @@
 type Decoder struct {
 	dt             dTable
 	actualTableLog uint8
+	bufs           *sync.Pool
 }
 
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
+func (d *Decoder) buffer() *[4][256]byte {
+	buf, ok := d.bufs.Get().(*[4][256]byte)
+	if ok {
+		return buf
 	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress1X8Bit(dst, src)
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
-	var off uint8
-
-	for br.off >= 8 {
-		br.fillFast()
-		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		// Refill
-		br.fillFast()
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if false && br.bitsRead >= 32 {
-			if br.off >= 4 {
-				v := br.in[br.off-4:]
-				v = v[:4]
-				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-				br.value = (br.value << 32) | uint64(low)
-				br.bitsRead -= 32
-				br.off -= 4
-			} else {
-				for br.off > 0 {
-					br.value = (br.value << 8) | uint64(br.in[br.off-1])
-					br.bitsRead -= 8
-					br.off--
-				}
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	return dst, br.close()
+	return &[4][256]byte{}
 }
 
 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
@@ -341,41 +247,258 @@
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
-	shift := (8 - d.actualTableLog) & 7
+	switch d.actualTableLog {
+	case 8:
+		const shift = 8 - 8
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
 
-	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
-	for br.off >= 4 {
-		br.fillFast()
-		v := dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					br.close()
+					d.bufs.Put(bufs)
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
 			}
-			dst = append(dst, buf[:]...)
 		}
+	case 7:
+		const shift = 8 - 7
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					br.close()
+					d.bufs.Put(bufs)
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 6:
+		const shift = 8 - 6
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 5:
+		const shift = 8 - 5
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 4:
+		const shift = 8 - 4
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 3:
+		const shift = 8 - 3
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 2:
+		const shift = 8 - 2
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 1:
+		const shift = 8 - 1
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	default:
+		d.bufs.Put(bufs)
+		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -383,6 +506,8 @@
 
 	// br < 4, so uint8 is fine
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	shift := (8 - d.actualTableLog) & 7
+
 	for bitsLeft > 0 {
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
@@ -393,6 +518,7 @@
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
+			d.bufs.Put(bufs)
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
@@ -401,6 +527,7 @@
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -420,33 +547,35 @@
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
-	const shift = 0
+	const shift = 56
 
 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	for br.off >= 4 {
 		br.fillFast()
-		v := dt[br.peekByteFast()>>shift]
+		v := dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
+				d.bufs.Put(bufs)
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
@@ -455,6 +584,7 @@
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -471,15 +601,17 @@
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
-		v := dt[br.peekByteFast()>>shift]
+		v := dt[br.peekByteFast()]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -487,196 +619,6 @@
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256 / 4
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == bufoff {
-			if bufoff > dstEvery {
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
-			out = out[bufoff:]
-			decoded += 256
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	for i := range br {
-		offset := dstEvery * i
-		br := &br[i]
-		bitsLeft := br.off*8 + uint(64-br.bitsRead)
-		for bitsLeft > 0 {
-			br.fill()
-			if false && br.bitsRead >= 32 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value = (br.value << 32) | uint64(low)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value = (br.value << 8) | uint64(br.in[br.off-1])
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= len(out) {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
 func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress4X8bitExactly(dst, src)
@@ -706,19 +648,18 @@
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	shift := (8 - d.actualTableLog) & 7
+	shift := (56 + (8 - d.actualTableLog)) & 63
 
 	const tlSize = 1 << 8
-	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -728,96 +669,109 @@
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
 
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
 
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -825,23 +779,31 @@
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -861,24 +823,31 @@
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[uint8(br.value>>shift)].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
@@ -914,18 +883,17 @@
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	const shift = 0
+	const shift = 56
 	const tlSize = 1 << 8
-	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -935,96 +903,109 @@
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
 
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
 
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
 
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
 
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
 
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1034,21 +1015,27 @@
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1068,24 +1055,32 @@
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[br.peekByteFast()].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 0000000..9f3e9f7
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,222 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
+)
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog > 8.
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+
+// decompress4x_8b_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog <= 8 which decodes 4 entries
+// per loop.
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+
+// fallback8BitSize is the size where using Go version is faster.
+const fallback8BitSize = 800
+
+type decompress4xContext struct {
+	pbr      *[4]bitReaderShifted
+	peekBits uint8
+	out      *byte
+	dstEvery int
+	tbl      *dEntrySingle
+	decoded  int
+	limit    *byte
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	var decoded int
+
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr:      &br,
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
+		}
+		if use8BitTables {
+			decompress4x_8b_main_loop_amd64(&ctx)
+		} else {
+			decompress4x_main_loop_amd64(&ctx)
+		}
+
+		decoded = ctx.decoded
+		out = out[decoded/4:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+	pbr      *bitReaderShifted
+	peekBits uint8
+	out      *byte
+	outCap   int
+	tbl      *dEntrySingle
+	decoded  int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:maxDecodedSize]
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+
+	if maxDecodedSize >= 4 {
+		ctx := decompress1xContext{
+			pbr:      &br,
+			out:      &dst[0],
+			outCap:   maxDecodedSize,
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			tbl:      &d.dt.single[0],
+		}
+
+		if cpuinfo.HasBMI2() {
+			decompress1x_main_loop_bmi2(&ctx)
+		} else {
+			decompress1x_main_loop_amd64(&ctx)
+		}
+		if ctx.decoded == error_max_decoded_size_exeeded {
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+
+		dst = dst[:ctx.decoded]
+	}
+
+	// br < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 0000000..dd1a5ae
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,847 @@
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 8(AX), DI
+	MOVQ    16(AX), SI
+	MOVQ    48(AX), BX
+	MOVQ    24(AX), R9
+	MOVQ    32(AX), R10
+	MOVQ    (AX), R11
+
+	// Main loop
+main_loop:
+	MOVQ  SI, R8
+	CMPQ  R8, BX
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R11), R12
+	MOVBQZX 40(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    (R11), R14
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 24(R11)
+	ORQ  R14, R12
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill0:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 32(R11)
+	MOVB R13, 40(R11)
+	ADDQ R9, R8
+
+	// br1.fillFast32()
+	MOVQ    80(R11), R12
+	MOVBQZX 88(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill1
+	MOVQ    72(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    48(R11), R14
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 72(R11)
+	ORQ  R14, R12
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 80(R11)
+	MOVB R13, 88(R11)
+	ADDQ R9, R8
+
+	// br2.fillFast32()
+	MOVQ    128(R11), R12
+	MOVBQZX 136(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill2
+	MOVQ    120(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    96(R11), R14
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 120(R11)
+	ORQ  R14, R12
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill2:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 128(R11)
+	MOVB R13, 136(R11)
+	ADDQ R9, R8
+
+	// br3.fillFast32()
+	MOVQ    176(R11), R12
+	MOVBQZX 184(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill3
+	MOVQ    168(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    144(R11), R14
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 168(R11)
+	ORQ  R14, R12
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ  R12, 176(R11)
+	MOVB  R13, 184(R11)
+	ADDQ  $0x02, SI
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	SUBQ  16(AX), SI
+	SHLQ  $0x02, SI
+	MOVQ  SI, 40(AX)
+	RET
+
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 8(CX), DI
+	MOVQ    16(CX), BX
+	MOVQ    48(CX), SI
+	MOVQ    24(CX), R9
+	MOVQ    32(CX), R10
+	MOVQ    (CX), R11
+
+	// Main loop
+main_loop:
+	MOVQ  BX, R8
+	CMPQ  R8, SI
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R11), R12
+	MOVBQZX 40(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    (R11), R15
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 24(R11)
+	ORQ  R15, R12
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill0:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 32(R11)
+	MOVB R13, 40(R11)
+	ADDQ R9, R8
+
+	// br1.fillFast32()
+	MOVQ    80(R11), R12
+	MOVBQZX 88(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill1
+	MOVQ    72(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    48(R11), R15
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 72(R11)
+	ORQ  R15, R12
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 80(R11)
+	MOVB R13, 88(R11)
+	ADDQ R9, R8
+
+	// br2.fillFast32()
+	MOVQ    128(R11), R12
+	MOVBQZX 136(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill2
+	MOVQ    120(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    96(R11), R15
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 120(R11)
+	ORQ  R15, R12
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill2:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 128(R11)
+	MOVB R13, 136(R11)
+	ADDQ R9, R8
+
+	// br3.fillFast32()
+	MOVQ    176(R11), R12
+	MOVBQZX 184(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill3
+	MOVQ    168(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    144(R11), R15
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 168(R11)
+	ORQ  R15, R12
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ  R12, 176(R11)
+	MOVB  R13, 184(R11)
+	ADDQ  $0x04, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	SUBQ  16(AX), BX
+	SHLQ  $0x02, BX
+	MOVQ  BX, 40(AX)
+	RET
+
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+	MOVQ    ctx+0(FP), CX
+	MOVQ    16(CX), DX
+	MOVQ    24(CX), BX
+	CMPQ    BX, $0x04
+	JB      error_max_decoded_size_exeeded
+	LEAQ    (DX)(BX*1), BX
+	MOVQ    (CX), SI
+	MOVQ    (SI), R8
+	MOVQ    24(SI), R9
+	MOVQ    32(SI), R10
+	MOVBQZX 40(SI), R11
+	MOVQ    32(CX), SI
+	MOVBQZX 8(CX), DI
+	JMP     loop_condition
+
+main_loop:
+	// Check if we have room for 4 bytes in the output buffer
+	LEAQ 4(DX), CX
+	CMPQ CX, BX
+	JGE  error_max_decoded_size_exeeded
+
+	// Decode 4 values
+	CMPQ R11, $0x20
+	JL   bitReader_fillFast_1_end
+	SUBQ $0x20, R11
+	SUBQ $0x04, R9
+	MOVL (R8)(R9*1), R12
+	MOVQ R11, CX
+	SHLQ CL, R12
+	ORQ  R12, R10
+
+bitReader_fillFast_1_end:
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	BSWAPL  AX
+	CMPQ    R11, $0x20
+	JL      bitReader_fillFast_2_end
+	SUBQ    $0x20, R11
+	SUBQ    $0x04, R9
+	MOVL    (R8)(R9*1), R12
+	MOVQ    R11, CX
+	SHLQ    CL, R12
+	ORQ     R12, R10
+
+bitReader_fillFast_2_end:
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	BSWAPL  AX
+
+	// Store the decoded values
+	MOVL AX, (DX)
+	ADDQ $0x04, DX
+
+loop_condition:
+	CMPQ R9, $0x08
+	JGE  main_loop
+
+	// Update ctx structure
+	MOVQ ctx+0(FP), AX
+	SUBQ 16(AX), DX
+	MOVQ DX, 40(AX)
+	MOVQ (AX), AX
+	MOVQ R9, 24(AX)
+	MOVQ R10, 32(AX)
+	MOVB R11, 40(AX)
+	RET
+
+	// Report error
+error_max_decoded_size_exeeded:
+	MOVQ ctx+0(FP), AX
+	MOVQ $-1, CX
+	MOVQ CX, 40(AX)
+	RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+	MOVQ    ctx+0(FP), CX
+	MOVQ    16(CX), DX
+	MOVQ    24(CX), BX
+	CMPQ    BX, $0x04
+	JB      error_max_decoded_size_exeeded
+	LEAQ    (DX)(BX*1), BX
+	MOVQ    (CX), SI
+	MOVQ    (SI), R8
+	MOVQ    24(SI), R9
+	MOVQ    32(SI), R10
+	MOVBQZX 40(SI), R11
+	MOVQ    32(CX), SI
+	MOVBQZX 8(CX), DI
+	JMP     loop_condition
+
+main_loop:
+	// Check if we have room for 4 bytes in the output buffer
+	LEAQ 4(DX), CX
+	CMPQ CX, BX
+	JGE  error_max_decoded_size_exeeded
+
+	// Decode 4 values
+	CMPQ  R11, $0x20
+	JL    bitReader_fillFast_1_end
+	SUBQ  $0x20, R11
+	SUBQ  $0x04, R9
+	MOVL  (R8)(R9*1), CX
+	SHLXQ R11, CX, CX
+	ORQ   CX, R10
+
+bitReader_fillFast_1_end:
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	BSWAPL  AX
+	CMPQ    R11, $0x20
+	JL      bitReader_fillFast_2_end
+	SUBQ    $0x20, R11
+	SUBQ    $0x04, R9
+	MOVL    (R8)(R9*1), CX
+	SHLXQ   R11, CX, CX
+	ORQ     CX, R10
+
+bitReader_fillFast_2_end:
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	BSWAPL  AX
+
+	// Store the decoded values
+	MOVL AX, (DX)
+	ADDQ $0x04, DX
+
+loop_condition:
+	CMPQ R9, $0x08
+	JGE  main_loop
+
+	// Update ctx structure
+	MOVQ ctx+0(FP), AX
+	SUBQ 16(AX), DX
+	MOVQ DX, 40(AX)
+	MOVQ (AX), AX
+	MOVQ R9, 24(AX)
+	MOVQ R10, 32(AX)
+	MOVB R11, 40(AX)
+	RET
+
+	// Report error
+error_max_decoded_size_exeeded:
+	MOVQ ctx+0(FP), AX
+	MOVQ $-1, CX
+	MOVQ CX, 40(AX)
+	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 0000000..4f6f37c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,295 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 {
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress1X8Bit(dst, src)
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	dt := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	bufs := d.buffer()
+	buf := &bufs[0]
+	var off uint8
+
+	for br.off >= 8 {
+		br.fillFast()
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		// Refill
+		br.fillFast()
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				d.bufs.Put(bufs)
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if false && br.bitsRead >= 32 {
+			if br.off >= 4 {
+				v := br.in[br.off-4:]
+				v = v[:4]
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+				br.value = (br.value << 32) | uint64(low)
+				br.bitsRead -= 32
+				br.off -= 4
+			} else {
+				for br.off > 0 {
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
+					br.bitsRead -= 8
+					br.off--
+				}
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	d.bufs.Put(bufs)
+	return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 7ec2022..e8ad17a 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@
 	"fmt"
 	"math"
 	"math/bits"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -116,6 +117,7 @@
 	nodes          []nodeElt
 	tmpOut         [4][]byte
 	fse            *fse.Scratch
+	decPool        sync.Pool // *[4][256]byte buffers.
 	huffWeight     [maxSymbolValue + 1]byte
 }
 
@@ -245,6 +247,68 @@
 	return nil
 }
 
+func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
+	var (
+		// precomputed conversion table
+		bitsToWeight [tableLogMax + 1]byte
+		huffLog      = s.actualTableLog
+		// last weight is not saved.
+		maxSymbolValue = uint8(s.symbolLen - 1)
+		huffWeight     = s.huffWeight[:256]
+	)
+	const (
+		maxFSETableLog = 6
+	)
+	// convert to weight
+	bitsToWeight[0] = 0
+	for n := uint8(1); n < huffLog+1; n++ {
+		bitsToWeight[n] = huffLog + 1 - n
+	}
+
+	// Acquire histogram for FSE.
+	hist := s.fse.Histogram()
+	hist = hist[:256]
+	for i := range hist[:16] {
+		hist[i] = 0
+	}
+	for n := uint8(0); n < maxSymbolValue; n++ {
+		v := bitsToWeight[c[n].nBits] & 15
+		huffWeight[n] = v
+		hist[v]++
+	}
+
+	// FSE compress if feasible.
+	if maxSymbolValue >= 2 {
+		huffMaxCnt := uint32(0)
+		huffMax := uint8(0)
+		for i, v := range hist[:16] {
+			if v == 0 {
+				continue
+			}
+			huffMax = byte(i)
+			if v > huffMaxCnt {
+				huffMaxCnt = v
+			}
+		}
+		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
+		s.fse.TableLog = maxFSETableLog
+		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
+		if err == nil && len(b) < int(s.symbolLen>>1) {
+			sz += 1 + len(b)
+			return sz, nil
+		}
+		// Unable to compress (RLE/uncompressible)
+	}
+	// write raw values as 4-bits (max : 15)
+	if maxSymbolValue > (256 - 128) {
+		// should not happen : likely means source cannot be compressed
+		return 0, ErrIncompressible
+	}
+	// special case, pack weights 4 bits/weight.
+	sz += 1 + int(maxSymbolValue/2)
+	return sz, nil
+}
+
 // estimateSize returns the estimated size in bytes of the input represented in the
 // histogram supplied.
 func (c cTable) estimateSize(hist []uint32) int {