gRPC migration update
Change-Id: Icdd1a824948fa994cd36bd121c962f5ecf74e3cf
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 97ae66a..41703bb 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -25,11 +25,14 @@
len uint8
}
+// Uses special code for all tables that are < 8 bits.
+const use8BitTables = true
+
// ReadTable will read a table from the input.
// The size of the input may be larger than the table definition.
// Any content remaining after the table definition will be returned.
// If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for decoding input using this table.
+// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
s, err = s.prepare(in)
if err != nil {
@@ -55,8 +58,8 @@
s.symbolLen = uint16(oSize)
in = in[iSize:]
} else {
- if len(in) <= int(iSize) {
- return s, nil, errors.New("input too small for table")
+ if len(in) < int(iSize) {
+ return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
}
// FSE compressed weights
s.fse.DecompressLimit = 255
@@ -83,6 +86,7 @@
}
v2 := v & 15
rankStats[v2]++
+ // (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
weightTotal += (1 << v2) >> 1
}
if weightTotal == 0 {
@@ -134,20 +138,40 @@
if len(s.dt.single) != tSize {
s.dt.single = make([]dEntrySingle, tSize)
}
+ cTable := s.prevTable
+ if cap(cTable) < maxSymbolValue+1 {
+ cTable = make([]cTableEntry, 0, maxSymbolValue+1)
+ }
+ cTable = cTable[:maxSymbolValue+1]
+ s.prevTable = cTable[:s.symbolLen]
+ s.prevTableLog = s.actualTableLog
+
for n, w := range s.huffWeight[:s.symbolLen] {
if w == 0 {
+ cTable[n] = cTableEntry{
+ val: 0,
+ nBits: 0,
+ }
continue
}
length := (uint32(1) << w) >> 1
d := dEntrySingle{
entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
}
- single := s.dt.single[rankStats[w] : rankStats[w]+length]
+
+ rank := &rankStats[w]
+ cTable[n] = cTableEntry{
+ val: uint16(*rank >> (w - 1)),
+ nBits: uint8(d.entry),
+ }
+
+ single := s.dt.single[*rank : *rank+length]
for i := range single {
single[i] = d
}
- rankStats[w] += length
+ *rank += length
}
+
return s, in, nil
}
@@ -155,69 +179,14 @@
// The length of the supplied input must match the end of a block exactly.
// Before this is called, the table must be initialized with ReadTable unless
// the encoder re-used the table.
+// deprecated: Use the stateless Decoder() to get a concurrent version.
func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
- if len(s.dt.single) == 0 {
- return nil, errors.New("no table loaded")
+ if cap(s.Out) < s.MaxDecodedSize {
+ s.Out = make([]byte, s.MaxDecodedSize)
}
- var br bitReader
- err = br.init(in)
- if err != nil {
- return nil, err
- }
- s.Out = s.Out[:0]
-
- decode := func() byte {
- val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
- v := s.dt.single[val]
- br.bitsRead += uint8(v.entry)
- return uint8(v.entry >> 8)
- }
- hasDec := func(v dEntrySingle) byte {
- br.bitsRead += uint8(v.entry)
- return uint8(v.entry >> 8)
- }
-
- // Avoid bounds check by always having full sized table.
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- dt := s.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- var tmp = s.huffWeight[:256]
- var off uint8
-
- for br.off >= 8 {
- br.fillFast()
- tmp[off+0] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
- tmp[off+1] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
- br.fillFast()
- tmp[off+2] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
- tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
- off += 4
- if off == 0 {
- if len(s.Out)+256 > s.MaxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- s.Out = append(s.Out, tmp...)
- }
- }
-
- if len(s.Out)+int(off) > s.MaxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- s.Out = append(s.Out, tmp[:off]...)
-
- for !br.finished() {
- br.fill()
- if len(s.Out) >= s.MaxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- s.Out = append(s.Out, decode())
- }
- return s.Out, br.close()
+ s.Out = s.Out[:0:s.MaxDecodedSize]
+ s.Out, err = s.Decoder().Decompress1X(s.Out, in)
+ return s.Out, err
}
// Decompress4X will decompress a 4X encoded stream.
@@ -225,123 +194,402 @@
// the encoder re-used the table.
// The length of the supplied input must match the end of a block exactly.
// The destination size of the uncompressed data must be known and provided.
+// deprecated: Use the stateless Decoder() to get a concurrent version.
func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
- if len(s.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if len(in) < 6+(4*1) {
- return nil, errors.New("input too small")
- }
if dstSize > s.MaxDecodedSize {
return nil, ErrMaxDecodedSizeExceeded
}
- // TODO: We do not detect when we overrun a buffer, except if the last one does.
+ if cap(s.Out) < dstSize {
+ s.Out = make([]byte, s.MaxDecodedSize)
+ }
+ s.Out = s.Out[:0:dstSize]
+ s.Out, err = s.Decoder().Decompress4X(s.Out, in)
+ return s.Out, err
+}
- var br [4]bitReader
+// Decoder will return a stateless decoder that can be used by multiple
+// decompressors concurrently.
+// Before this is called, the table must be initialized with ReadTable.
+// The Decoder is still linked to the scratch buffer so that cannot be reused.
+// However, it is safe to discard the scratch.
+func (s *Scratch) Decoder() *Decoder {
+ return &Decoder{
+ dt: s.dt,
+ actualTableLog: s.actualTableLog,
+ }
+}
+
+// Decoder provides stateless decoding.
+type Decoder struct {
+ dt dTable
+ actualTableLog uint8
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress1X8Bit(dst, src)
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ dt := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ var buf [256]byte
+ var off uint8
+
+ for br.off >= 8 {
+ br.fillFast()
+ v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ // Refill
+ br.fillFast()
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
+
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
+ if d.actualTableLog == 8 {
+ return d.decompress1X8BitExactly(dst, src)
+ }
+ var br bitReaderBytes
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ dt := d.dt.single[:256]
+
+ // Use temp table to avoid bound checks/append penalty.
+ var buf [256]byte
+ var off uint8
+
+ shift := (8 - d.actualTableLog) & 7
+
+ //fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 4, so uint8 is fine
+ bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+ for bitsLeft > 0 {
+ if br.bitsRead >= 64-8 {
+ for br.off > 0 {
+ br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := dt[br.peekByteFast()>>shift]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= int8(nBits)
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
+
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
+ var br bitReaderBytes
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ dt := d.dt.single[:256]
+
+ // Use temp table to avoid bound checks/append penalty.
+ var buf [256]byte
+ var off uint8
+
+ const shift = 0
+
+ //fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekByteFast()>>shift]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 4, so uint8 is fine
+ bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+ for bitsLeft > 0 {
+ if br.bitsRead >= 64-8 {
+ for br.off > 0 {
+ br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := dt[br.peekByteFast()>>shift]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= int8(nBits)
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if len(src) < 6+(4*1) {
+ return nil, errors.New("input too small")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress4X8bit(dst, src)
+ }
+
+ var br [4]bitReaderShifted
start := 6
for i := 0; i < 3; i++ {
- length := int(in[i*2]) | (int(in[i*2+1]) << 8)
- if start+length >= len(in) {
+ length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+ if start+length >= len(src) {
return nil, errors.New("truncated input (or invalid offset)")
}
- err = br[i].init(in[start : start+length])
+ err := br[i].init(src[start : start+length])
if err != nil {
return nil, err
}
start += length
}
- err = br[3].init(in[start:])
+ err := br[3].init(src[start:])
if err != nil {
return nil, err
}
- // Prepare output
- if cap(s.Out) < dstSize {
- s.Out = make([]byte, 0, dstSize)
- }
- s.Out = s.Out[:dstSize]
// destination, offset to match first output
- dstOut := s.Out
+ dstSize := cap(dst)
+ dst = dst[:dstSize]
+ out := dst
dstEvery := (dstSize + 3) / 4
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
- single := s.dt.single[:tlSize]
-
- decode := func(br *bitReader) byte {
- val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
- v := single[val&tlMask]
- br.bitsRead += uint8(v.entry)
- return uint8(v.entry >> 8)
- }
+ single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var tmp = s.huffWeight[:256]
+ var buf [256]byte
var off uint8
var decoded int
// Decode 2 values from each decoder/loop.
const bufoff = 256 / 4
-bigloop:
for {
- for i := range br {
- br := &br[i]
- if br.off < 4 {
- break bigloop
- }
- br.fillFast()
+ if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+ break
}
{
const stream = 0
- val := br[stream].peekBitsFast(s.actualTableLog)
+ const stream2 = 1
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ val := br[stream].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
- br[stream].bitsRead += uint8(v.entry)
+ br[stream].advance(uint8(v.entry))
+ buf[off+bufoff*stream] = uint8(v.entry >> 8)
- val2 := br[stream].peekBitsFast(s.actualTableLog)
+ val2 := br[stream2].peekBitsFast(d.actualTableLog)
v2 := single[val2&tlMask]
- tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
- tmp[off+bufoff*stream] = uint8(v.entry >> 8)
- br[stream].bitsRead += uint8(v2.entry)
- }
+ br[stream2].advance(uint8(v2.entry))
+ buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
- {
- const stream = 1
- val := br[stream].peekBitsFast(s.actualTableLog)
- v := single[val&tlMask]
- br[stream].bitsRead += uint8(v.entry)
+ val = br[stream].peekBitsFast(d.actualTableLog)
+ v = single[val&tlMask]
+ br[stream].advance(uint8(v.entry))
+ buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
- val2 := br[stream].peekBitsFast(s.actualTableLog)
- v2 := single[val2&tlMask]
- tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
- tmp[off+bufoff*stream] = uint8(v.entry >> 8)
- br[stream].bitsRead += uint8(v2.entry)
+ val2 = br[stream2].peekBitsFast(d.actualTableLog)
+ v2 = single[val2&tlMask]
+ br[stream2].advance(uint8(v2.entry))
+ buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
}
{
const stream = 2
- val := br[stream].peekBitsFast(s.actualTableLog)
+ const stream2 = 3
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ val := br[stream].peekBitsFast(d.actualTableLog)
v := single[val&tlMask]
- br[stream].bitsRead += uint8(v.entry)
+ br[stream].advance(uint8(v.entry))
+ buf[off+bufoff*stream] = uint8(v.entry >> 8)
- val2 := br[stream].peekBitsFast(s.actualTableLog)
+ val2 := br[stream2].peekBitsFast(d.actualTableLog)
v2 := single[val2&tlMask]
- tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
- tmp[off+bufoff*stream] = uint8(v.entry >> 8)
- br[stream].bitsRead += uint8(v2.entry)
- }
+ br[stream2].advance(uint8(v2.entry))
+ buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
- {
- const stream = 3
- val := br[stream].peekBitsFast(s.actualTableLog)
- v := single[val&tlMask]
- br[stream].bitsRead += uint8(v.entry)
+ val = br[stream].peekBitsFast(d.actualTableLog)
+ v = single[val&tlMask]
+ br[stream].advance(uint8(v.entry))
+ buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
- val2 := br[stream].peekBitsFast(s.actualTableLog)
- v2 := single[val2&tlMask]
- tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
- tmp[off+bufoff*stream] = uint8(v.entry >> 8)
- br[stream].bitsRead += uint8(v2.entry)
+ val2 = br[stream2].peekBitsFast(d.actualTableLog)
+ v2 = single[val2&tlMask]
+ br[stream2].advance(uint8(v2.entry))
+ buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
}
off += 2
@@ -350,42 +598,67 @@
if bufoff > dstEvery {
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(dstOut, tmp[:bufoff])
- copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
- copy(dstOut[dstEvery*2:], tmp[bufoff*2:bufoff*3])
- copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4])
+ copy(out, buf[:bufoff])
+ copy(out[dstEvery:], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
off = 0
- dstOut = dstOut[bufoff:]
+ out = out[bufoff:]
decoded += 256
// There must at least be 3 buffers left.
- if len(dstOut) < dstEvery*3 {
+ if len(out) < dstEvery*3 {
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
- if len(dstOut) < dstEvery*3+ioff {
+ if len(out) < dstEvery*3+ioff {
return nil, errors.New("corruption detected: stream overrun 3")
}
- copy(dstOut, tmp[:off])
- copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
- copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3])
- copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4])
+ copy(out, buf[:off])
+ copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
decoded += int(off) * 4
- dstOut = dstOut[off:]
+ out = out[off:]
}
// Decode remaining.
for i := range br {
offset := dstEvery * i
br := &br[i]
- for !br.finished() {
+ bitsLeft := br.off*8 + uint(64-br.bitsRead)
+ for bitsLeft > 0 {
br.fill()
- if offset >= len(dstOut) {
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ // end inline...
+ if offset >= len(out) {
return nil, errors.New("corruption detected: stream overrun 4")
}
- dstOut[offset] = decode(br)
+
+ // Read value and increment offset.
+ val := br.peekBitsFast(d.actualTableLog)
+ v := single[val&tlMask].entry
+ nBits := uint8(v)
+ br.advance(nBits)
+ bitsLeft -= uint(nBits)
+ out[offset] = uint8(v >> 8)
offset++
}
decoded += offset - dstEvery*i
@@ -397,7 +670,426 @@
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
- return s.Out, nil
+ return dst, nil
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
+ if d.actualTableLog == 8 {
+ return d.decompress4X8bitExactly(dst, src)
+ }
+
+ var br [4]bitReaderBytes
+ start := 6
+ for i := 0; i < 3; i++ {
+ length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+ if start+length >= len(src) {
+ return nil, errors.New("truncated input (or invalid offset)")
+ }
+ err := br[i].init(src[start : start+length])
+ if err != nil {
+ return nil, err
+ }
+ start += length
+ }
+ err := br[3].init(src[start:])
+ if err != nil {
+ return nil, err
+ }
+
+ // destination, offset to match first output
+ dstSize := cap(dst)
+ dst = dst[:dstSize]
+ out := dst
+ dstEvery := (dstSize + 3) / 4
+
+ shift := (8 - d.actualTableLog) & 7
+
+ const tlSize = 1 << 8
+ const tlMask = tlSize - 1
+ single := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ var buf [256]byte
+ var off uint8
+ var decoded int
+
+ // Decode 4 values from each decoder/loop.
+ const bufoff = 256 / 4
+ for {
+ if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+ break
+ }
+
+ {
+ // Interleave 2 decodes.
+ const stream = 0
+ const stream2 = 1
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ v := single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 := single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+1] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+2] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+3] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+ }
+
+ {
+ const stream = 2
+ const stream2 = 3
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ v := single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 := single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+1] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+2] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+3] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+ }
+
+ off += 4
+
+ if off == bufoff {
+ if bufoff > dstEvery {
+ return nil, errors.New("corruption detected: stream overrun 1")
+ }
+ copy(out, buf[:bufoff])
+ copy(out[dstEvery:], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+ off = 0
+ out = out[bufoff:]
+ decoded += 256
+ // There must at least be 3 buffers left.
+ if len(out) < dstEvery*3 {
+ return nil, errors.New("corruption detected: stream overrun 2")
+ }
+ }
+ }
+ if off > 0 {
+ ioff := int(off)
+ if len(out) < dstEvery*3+ioff {
+ return nil, errors.New("corruption detected: stream overrun 3")
+ }
+ copy(out, buf[:off])
+ copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ decoded += int(off) * 4
+ out = out[off:]
+ }
+
+ // Decode remaining.
+ for i := range br {
+ offset := dstEvery * i
+ br := &br[i]
+ bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ for bitsLeft > 0 {
+ if br.finished() {
+ return nil, io.ErrUnexpectedEOF
+ }
+ if br.bitsRead >= 56 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value |= uint64(low) << (br.bitsRead - 32)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ // end inline...
+ if offset >= len(out) {
+ return nil, errors.New("corruption detected: stream overrun 4")
+ }
+
+ // Read value and increment offset.
+ v := single[br.peekByteFast()>>shift].entry
+ nBits := uint8(v)
+ br.advance(nBits)
+ bitsLeft -= int(nBits)
+ out[offset] = uint8(v >> 8)
+ offset++
+ }
+ decoded += offset - dstEvery*i
+ err = br.close()
+ if err != nil {
+ return nil, err
+ }
+ }
+ if dstSize != decoded {
+ return nil, errors.New("corruption detected: short output block")
+ }
+ return dst, nil
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
+ var br [4]bitReaderBytes
+ start := 6
+ for i := 0; i < 3; i++ {
+ length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+ if start+length >= len(src) {
+ return nil, errors.New("truncated input (or invalid offset)")
+ }
+ err := br[i].init(src[start : start+length])
+ if err != nil {
+ return nil, err
+ }
+ start += length
+ }
+ err := br[3].init(src[start:])
+ if err != nil {
+ return nil, err
+ }
+
+ // destination, offset to match first output
+ dstSize := cap(dst)
+ dst = dst[:dstSize]
+ out := dst
+ dstEvery := (dstSize + 3) / 4
+
+ const shift = 0
+ const tlSize = 1 << 8
+ const tlMask = tlSize - 1
+ single := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ var buf [256]byte
+ var off uint8
+ var decoded int
+
+ // Decode 4 values from each decoder/loop.
+ const bufoff = 256 / 4
+ for {
+ if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+ break
+ }
+
+ {
+ // Interleave 2 decodes.
+ const stream = 0
+ const stream2 = 1
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ v := single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 := single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+1] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+2] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+3] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+ }
+
+ {
+ const stream = 2
+ const stream2 = 3
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ v := single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 := single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+1] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+2] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+
+ v = single[br[stream].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream+3] = uint8(v >> 8)
+ br[stream].advance(uint8(v))
+
+ v2 = single[br[stream2].peekByteFast()>>shift].entry
+ buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+ br[stream2].advance(uint8(v2))
+ }
+
+ off += 4
+
+ if off == bufoff {
+ if bufoff > dstEvery {
+ return nil, errors.New("corruption detected: stream overrun 1")
+ }
+ copy(out, buf[:bufoff])
+ copy(out[dstEvery:], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+ off = 0
+ out = out[bufoff:]
+ decoded += 256
+ // There must at least be 3 buffers left.
+ if len(out) < dstEvery*3 {
+ return nil, errors.New("corruption detected: stream overrun 2")
+ }
+ }
+ }
+ if off > 0 {
+ ioff := int(off)
+ if len(out) < dstEvery*3+ioff {
+ return nil, errors.New("corruption detected: stream overrun 3")
+ }
+ copy(out, buf[:off])
+ copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+ copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+ copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ decoded += int(off) * 4
+ out = out[off:]
+ }
+
+ // Decode remaining.
+ for i := range br {
+ offset := dstEvery * i
+ br := &br[i]
+ bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ for bitsLeft > 0 {
+ if br.finished() {
+ return nil, io.ErrUnexpectedEOF
+ }
+ if br.bitsRead >= 56 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value |= uint64(low) << (br.bitsRead - 32)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ // end inline...
+ if offset >= len(out) {
+ return nil, errors.New("corruption detected: stream overrun 4")
+ }
+
+ // Read value and increment offset.
+ v := single[br.peekByteFast()>>shift].entry
+ nBits := uint8(v)
+ br.advance(nBits)
+ bitsLeft -= int(nBits)
+ out[offset] = uint8(v >> 8)
+ offset++
+ }
+ decoded += offset - dstEvery*i
+ err = br.close()
+ if err != nil {
+ return nil, err
+ }
+ }
+ if dstSize != decoded {
+ return nil, errors.New("corruption detected: short output block")
+ }
+ return dst, nil
}
// matches will compare a decoding table to a coding table.