vendor/github.com/klauspost/compress/huff0/decompress.go - voltha-go - Gitiles

 package huff0

 import (
 	"errors"
 	"fmt"
 	"io"

 	"github.com/klauspost/compress/fse"
 )

 type dTable struct {
 	single []dEntrySingle
 	double []dEntryDouble
 }

 // single-symbols decoding
 type dEntrySingle struct {
 	entry uint16
 }

 // double-symbols decoding
 type dEntryDouble struct {
 	seq   uint16
 	nBits uint8
 	len   uint8
 }

 // Uses special code for all tables that are < 8 bits.
 const use8BitTables = true

 // ReadTable will read a table from the input.
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
 // If no Scratch is provided a new one is allocated.
 // The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	s, err = s.prepare(in)
 	if err != nil {
 		return s, nil, err
 	}
 	if len(in) <= 1 {
 		return s, nil, errors.New("input too small for table")
 	}
 	iSize := in[0]
 	in = in[1:]
 	if iSize >= 128 {
 		// Uncompressed
 		oSize := iSize - 127
 		iSize = (oSize + 1) / 2
 		if int(iSize) > len(in) {
 			return s, nil, errors.New("input too small for table")
 		}
 		for n := uint8(0); n < oSize; n += 2 {
 			v := in[n/2]
 			s.huffWeight[n] = v >> 4
 			s.huffWeight[n+1] = v & 15
 		}
 		s.symbolLen = uint16(oSize)
 		in = in[iSize:]
 	} else {
 		if len(in) < int(iSize) {
 			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
 		}
 		// FSE compressed weights
 		s.fse.DecompressLimit = 255
 		hw := s.huffWeight[:]
 		s.fse.Out = hw
 		b, err := fse.Decompress(in[:iSize], s.fse)
 		s.fse.Out = nil
 		if err != nil {
 			return s, nil, err
 		}
 		if len(b) > 255 {
 			return s, nil, errors.New("corrupt input: output table too large")
 		}
 		s.symbolLen = uint16(len(b))
 		in = in[iSize:]
 	}

 	// collect weight stats
 	var rankStats [16]uint32
 	weightTotal := uint32(0)
 	for _, v := range s.huffWeight[:s.symbolLen] {
 		if v > tableLogMax {
 			return s, nil, errors.New("corrupt input: weight too large")
 		}
 		v2 := v & 15
 		rankStats[v2]++
 		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
 		weightTotal += (1 << v2) >> 1
 	}
 	if weightTotal == 0 {
 		return s, nil, errors.New("corrupt input: weights zero")
 	}

 	// get last non-null symbol weight (implied, total must be 2^n)
 	{
 		tableLog := highBit32(weightTotal) + 1
 		if tableLog > tableLogMax {
 			return s, nil, errors.New("corrupt input: tableLog too big")
 		}
 		s.actualTableLog = uint8(tableLog)
 		// determine last weight
 		{
 			total := uint32(1) << tableLog
 			rest := total - weightTotal
 			verif := uint32(1) << highBit32(rest)
 			lastWeight := highBit32(rest) + 1
 			if verif != rest {
 				// last value must be a clean power of 2
 				return s, nil, errors.New("corrupt input: last value not power of two")
 			}
 			s.huffWeight[s.symbolLen] = uint8(lastWeight)
 			s.symbolLen++
 			rankStats[lastWeight]++
 		}
 	}

 	if (rankStats[1] < 2) || (rankStats[1]&1 != 0) {
 		// by construction : at least 2 elts of rank 1, must be even
 		return s, nil, errors.New("corrupt input: min elt size, even check failed ")
 	}

 	// TODO: Choose between single/double symbol decoding

 	// Calculate starting value for each rank
 	{
 		var nextRankStart uint32
 		for n := uint8(1); n < s.actualTableLog+1; n++ {
 			current := nextRankStart
 			nextRankStart += rankStats[n] << (n - 1)
 			rankStats[n] = current
 		}
 	}

 	// fill DTable (always full size)
 	tSize := 1 << tableLogMax
 	if len(s.dt.single) != tSize {
 		s.dt.single = make([]dEntrySingle, tSize)
 	}
 	cTable := s.prevTable
 	if cap(cTable) < maxSymbolValue+1 {
 		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
 	}
 	cTable = cTable[:maxSymbolValue+1]
 	s.prevTable = cTable[:s.symbolLen]
 	s.prevTableLog = s.actualTableLog

 	for n, w := range s.huffWeight[:s.symbolLen] {
 		if w == 0 {
 			cTable[n] = cTableEntry{
 				val:   0,
 				nBits: 0,
 			}
 			continue
 		}
 		length := (uint32(1) << w) >> 1
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
 		}

 		rank := &rankStats[w]
 		cTable[n] = cTableEntry{
 			val:   uint16(*rank >> (w - 1)),
 			nBits: uint8(d.entry),
 		}

 		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
 		}
 		*rank += length
 	}

 	return s, in, nil
 }

 // Decompress1X will decompress a 1X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // Before this is called, the table must be initialized with ReadTable unless
 // the encoder re-used the table.
 // deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
 	if cap(s.Out) < s.MaxDecodedSize {
 		s.Out = make([]byte, s.MaxDecodedSize)
 	}
 	s.Out = s.Out[:0:s.MaxDecodedSize]
 	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
 	return s.Out, err
 }

 // Decompress4X will decompress a 4X encoded stream.
 // Before this is called, the table must be initialized with ReadTable unless
 // the encoder re-used the table.
 // The length of the supplied input must match the end of a block exactly.
 // The destination size of the uncompressed data must be known and provided.
 // deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
 	if dstSize > s.MaxDecodedSize {
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	if cap(s.Out) < dstSize {
 		s.Out = make([]byte, s.MaxDecodedSize)
 	}
 	s.Out = s.Out[:0:dstSize]
 	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
 	return s.Out, err
 }

 // Decoder will return a stateless decoder that can be used by multiple
 // decompressors concurrently.
 // Before this is called, the table must be initialized with ReadTable.
 // The Decoder is still linked to the scratch buffer so that cannot be reused.
 // However, it is safe to discard the scratch.
 func (s *Scratch) Decoder() *Decoder {
 	return &Decoder{
 		dt:             s.dt,
 		actualTableLog: s.actualTableLog,
 	}
 }

 // Decoder provides stateless decoding.
 type Decoder struct {
 	dt             dTable
 	actualTableLog uint8
 }

 // Decompress1X will decompress a 1X encoded stream.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress1X8Bit(dst, src)
 	}
 	var br bitReaderShifted
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]

 	// Avoid bounds check by always having full sized table.
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
 	dt := d.dt.single[:tlSize]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8

 	for br.off >= 8 {
 		br.fillFast()
 		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)

 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)

 		// Refill
 		br.fillFast()

 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)

 		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)

 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}

 	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)

 	// br < 8, so uint8 is fine
 	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
 	for bitsLeft > 0 {
 		br.fill()
 		if false && br.bitsRead >= 32 {
 			if br.off >= 4 {
 				v := br.in[br.off-4:]
 				v = v[:4]
 				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 				br.value = (br.value << 32) | uint64(low)
 				br.bitsRead -= 32
 				br.off -= 4
 			} else {
 				for br.off > 0 {
 					br.value = (br.value << 8) | uint64(br.in[br.off-1])
 					br.bitsRead -= 8
 					br.off--
 				}
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }

 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress1X8BitExactly(dst, src)
 	}
 	var br bitReaderBytes
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]

 	// Avoid bounds check by always having full sized table.
 	dt := d.dt.single[:256]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8

 	shift := (8 - d.actualTableLog) & 7

 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	for br.off >= 4 {
 		br.fillFast()
 		v := dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)

 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}

 	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)

 	// br < 4, so uint8 is fine
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
 	for bitsLeft > 0 {
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
 				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 				br.bitsRead -= 8
 				br.off--
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }

 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
 func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	var br bitReaderBytes
 	err := br.init(src)
 	if err != nil {
 		return dst, err
 	}
 	maxDecodedSize := cap(dst)
 	dst = dst[:0]

 	// Avoid bounds check by always having full sized table.
 	dt := d.dt.single[:256]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8

 	const shift = 0

 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	for br.off >= 4 {
 		br.fillFast()
 		v := dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)

 		v = dt[br.peekByteFast()>>shift]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)

 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
 		}
 	}

 	if len(dst)+int(off) > maxDecodedSize {
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
 	dst = append(dst, buf[:off]...)

 	// br < 4, so uint8 is fine
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
 	for bitsLeft > 0 {
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
 				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 				br.bitsRead -= 8
 				br.off--
 			}
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
 	return dst, br.close()
 }

 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
 func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if len(d.dt.single) == 0 {
 		return nil, errors.New("no table loaded")
 	}
 	if len(src) < 6+(4*1) {
 		return nil, errors.New("input too small")
 	}
 	if use8BitTables && d.actualTableLog <= 8 {
 		return d.decompress4X8bit(dst, src)
 	}

 	var br [4]bitReaderShifted
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
 		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
 		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
 	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}

 	// destination, offset to match first output
 	dstSize := cap(dst)
 	dst = dst[:dstSize]
 	out := dst
 	dstEvery := (dstSize + 3) / 4

 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8
 	var decoded int

 	// Decode 2 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}

 		{
 			const stream = 0
 			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream] = uint8(v.entry >> 8)

 			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)

 			val = br[stream].peekBitsFast(d.actualTableLog)
 			v = single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)

 			val2 = br[stream2].peekBitsFast(d.actualTableLog)
 			v2 = single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}

 		{
 			const stream = 2
 			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream] = uint8(v.entry >> 8)

 			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)

 			val = br[stream].peekBitsFast(d.actualTableLog)
 			v = single[val&tlMask]
 			br[stream].advance(uint8(v.entry))
 			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)

 			val2 = br[stream2].peekBitsFast(d.actualTableLog)
 			v2 = single[val2&tlMask]
 			br[stream2].advance(uint8(v2.entry))
 			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}

 		off += 2

 		if off == bufoff {
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(out, buf[:bufoff])
 			copy(out[dstEvery:], buf[bufoff:bufoff*2])
 			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
 			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
 			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[:off])
 		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
 		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
 		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
 		out = out[off:]
 	}

 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
 		bitsLeft := br.off*8 + uint(64-br.bitsRead)
 		for bitsLeft > 0 {
 			br.fill()
 			if false && br.bitsRead >= 32 {
 				if br.off >= 4 {
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value = (br.value << 32) | uint64(low)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					for br.off > 0 {
 						br.value = (br.value << 8) | uint64(br.in[br.off-1])
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}

 			// Read value and increment offset.
 			val := br.peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
 			return nil, err
 		}
 	}
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }

 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
 func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if d.actualTableLog == 8 {
 		return d.decompress4X8bitExactly(dst, src)
 	}

 	var br [4]bitReaderBytes
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
 		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
 		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
 	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}

 	// destination, offset to match first output
 	dstSize := cap(dst)
 	dst = dst[:dstSize]
 	out := dst
 	dstEvery := (dstSize + 3) / 4

 	shift := (8 - d.actualTableLog) & 7

 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8
 	var decoded int

 	// Decode 4 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}

 		{
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}

 		{
 			const stream = 2
 			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}

 		off += 4

 		if off == bufoff {
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(out, buf[:bufoff])
 			copy(out[dstEvery:], buf[bufoff:bufoff*2])
 			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
 			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
 			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[:off])
 		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
 		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
 		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
 		out = out[off:]
 	}

 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
 		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
 		for bitsLeft > 0 {
 			if br.finished() {
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
 				if br.off >= 4 {
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value |= uint64(low) << (br.bitsRead - 32)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					for br.off > 0 {
 						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}

 			// Read value and increment offset.
 			v := single[br.peekByteFast()>>shift].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= int(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
 			return nil, err
 		}
 	}
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }

 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
 // the uncompressed data exactly.
 func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	var br [4]bitReaderBytes
 	start := 6
 	for i := 0; i < 3; i++ {
 		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
 		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
 		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
 	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}

 	// destination, offset to match first output
 	dstSize := cap(dst)
 	dst = dst[:dstSize]
 	out := dst
 	dstEvery := (dstSize + 3) / 4

 	const shift = 0
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]

 	// Use temp table to avoid bound checks/append penalty.
 	var buf [256]byte
 	var off uint8
 	var decoded int

 	// Decode 4 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
 		}

 		{
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}

 		{
 			const stream = 2
 			const stream2 = 3
 			br[stream].fillFast()
 			br[stream2].fillFast()

 			v := single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 := single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+1] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+2] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))

 			v = single[br[stream].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream+3] = uint8(v >> 8)
 			br[stream].advance(uint8(v))

 			v2 = single[br[stream2].peekByteFast()>>shift].entry
 			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
 			br[stream2].advance(uint8(v2))
 		}

 		off += 4

 		if off == bufoff {
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
 			copy(out, buf[:bufoff])
 			copy(out[dstEvery:], buf[bufoff:bufoff*2])
 			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
 			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
 			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
 		copy(out, buf[:off])
 		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
 		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
 		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
 		out = out[off:]
 	}

 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
 		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
 		for bitsLeft > 0 {
 			if br.finished() {
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
 				if br.off >= 4 {
 					v := br.in[br.off-4:]
 					v = v[:4]
 					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 					br.value |= uint64(low) << (br.bitsRead - 32)
 					br.bitsRead -= 32
 					br.off -= 4
 				} else {
 					for br.off > 0 {
 						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
 						br.bitsRead -= 8
 						br.off--
 					}
 				}
 			}
 			// end inline...
 			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}

 			// Read value and increment offset.
 			v := single[br.peekByteFast()>>shift].entry
 			nBits := uint8(v)
 			br.advance(nBits)
 			bitsLeft -= int(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
 			return nil, err
 		}
 	}
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }

 // matches will compare a decoding table to a coding table.
 // Errors are written to the writer.
 // Nothing will be written if table is ok.
 func (s *Scratch) matches(ct cTable, w io.Writer) {
 	if s == nil || len(s.dt.single) == 0 {
 		return
 	}
 	dt := s.dt.single[:1<<s.actualTableLog]
 	tablelog := s.actualTableLog
 	ok := 0
 	broken := 0
 	for sym, enc := range ct {
 		errs := 0
 		broken++
 		if enc.nBits == 0 {
 			for _, dec := range dt {
 				if uint8(dec.entry>>8) == byte(sym) {
 					fmt.Fprintf(w, "symbol %x has decoder, but no encoder\n", sym)
 					errs++
 					break
 				}
 			}
 			if errs == 0 {
 				broken--
 			}
 			continue
 		}
 		// Unused bits in input
 		ub := tablelog - enc.nBits
 		top := enc.val << ub
 		// decoder looks at top bits.
 		dec := dt[top]
 		if uint8(dec.entry) != enc.nBits {
 			fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", sym, enc.nBits, uint8(dec.entry))
 			errs++
 		}
 		if uint8(dec.entry>>8) != uint8(sym) {
 			fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", sym, sym, uint8(dec.entry>>8))
 			errs++
 		}
 		if errs > 0 {
 			fmt.Fprintf(w, "%d errros in base, stopping\n", errs)
 			continue
 		}
 		// Ensure that all combinations are covered.
 		for i := uint16(0); i < (1 << ub); i++ {
 			vval := top | i
 			dec := dt[vval]
 			if uint8(dec.entry) != enc.nBits {
 				fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", vval, enc.nBits, uint8(dec.entry))
 				errs++
 			}
 			if uint8(dec.entry>>8) != uint8(sym) {
 				fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", vval, sym, uint8(dec.entry>>8))
 				errs++
 			}
 			if errs > 20 {
 				fmt.Fprintf(w, "%d errros, stopping\n", errs)
 				break
 			}
 		}
 		if errs == 0 {
 			ok++
 			broken--
 		}
 	}
 	if broken > 0 {
 		fmt.Fprintf(w, "%d broken, %d ok\n", broken, ok)
 	}
 }