gRPC migration update

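Among other things, this change updates the vendored
github.com/klauspost/compress fse and huff0 packages and removes
vendored files under github.com/klauspost/compress/snappy. The huff0
update adds a stateless Decoder (Decompress1X/Decompress4X), dedicated
decoders for tables of 8 bits or less, a ReusePolicyMust reuse policy
and a TransferCTable helper; both fse and huff0 gain a fillFastStart
fast path in their bit readers.

Illustrative only, not part of this change: a minimal sketch of how the
new stateless huff0 Decoder is used. The sample data and buffer sizes
are assumptions.

    package main

    import (
        "bytes"
        "fmt"

        "github.com/klauspost/compress/huff0"
    )

    func main() {
        // Compress a skewed sample so a Huffman table is produced.
        in := bytes.Repeat([]byte("abbcccddddeeeee"), 100)
        var s huff0.Scratch
        comp, _, err := huff0.Compress1X(in, &s)
        if err != nil {
            panic(err) // e.g. huff0.ErrIncompressible for unsuitable input
        }

        // ReadTable parses the table and returns the remaining stream.
        s2, remain, err := huff0.ReadTable(comp, nil)
        if err != nil {
            panic(err)
        }

        // The stateless Decoder may be shared between goroutines as long
        // as the scratch tables stay unchanged. The capacity of dst
        // indicates the expected decompressed size.
        dec := s2.Decoder()
        out, err := dec.Decompress1X(make([]byte, 0, len(in)), remain)
        if err != nil {
            panic(err)
        }
        fmt.Println(bytes.Equal(out, in)) // true
    }
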
Change-Id: Icdd1a824948fa994cd36bd121c962f5ecf74e3cf
diff --git a/vendor/github.com/klauspost/compress/fse/bitreader.go b/vendor/github.com/klauspost/compress/fse/bitreader.go
index b9db204..f65eb39 100644
--- a/vendor/github.com/klauspost/compress/fse/bitreader.go
+++ b/vendor/github.com/klauspost/compress/fse/bitreader.go
@@ -6,6 +6,7 @@
 package fse
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,8 +35,12 @@
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,8 +68,9 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
@@ -77,7 +83,8 @@
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -91,9 +98,17 @@
 	}
 }
 
+// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.bitsRead >= 64 && b.off == 0
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
diff --git a/vendor/github.com/klauspost/compress/fse/bytereader.go b/vendor/github.com/klauspost/compress/fse/bytereader.go
index f228a46..abade2d 100644
--- a/vendor/github.com/klauspost/compress/fse/bytereader.go
+++ b/vendor/github.com/klauspost/compress/fse/bytereader.go
@@ -25,19 +25,10 @@
 	b.off += int(n)
 }
 
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
-	v3 := int32(b2[3])
-	v2 := int32(b2[2])
-	v1 := int32(b2[1])
-	v0 := int32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
-}
-
 // Uint32 returns a little endian uint32 starting at current offset.
 func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
index b69237c..6f34191 100644
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -92,7 +92,6 @@
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + first.deltaFindState
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
@@ -301,7 +300,7 @@
 	out[outP+1] = byte(bitStream >> 8)
 	outP += (bitCount + 7) / 8
 
-	if uint16(charnum) > s.symbolLen {
+	if charnum > s.symbolLen {
 		return errors.New("internal error: charnum > s.symbolLen")
 	}
 	s.Out = out[:outP]
@@ -331,7 +330,7 @@
 func (s *Scratch) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -565,8 +564,8 @@
 		distributed  uint32
 		total        = uint32(s.br.remain())
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -591,7 +590,7 @@
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
index 413ec3b..926f5f1 100644
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -172,7 +172,7 @@
 // allocDtable will allocate decoding tables if they are not big enough.
 func (s *Scratch) allocDtable() {
 	tableSize := 1 << s.actualTableLog
-	if cap(s.decTable) < int(tableSize) {
+	if cap(s.decTable) < tableSize {
 		s.decTable = make([]decSymbol, tableSize)
 	}
 	s.decTable = s.decTable[:tableSize]
@@ -340,7 +340,7 @@
 func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
 	d.dt = dt
 	d.br = in
-	d.state = uint16(in.getBits(tableLog))
+	d.state = in.getBits(tableLog)
 }
 
 // next returns the next symbol and sets the next state.
diff --git a/vendor/github.com/klauspost/compress/fse/fse.go b/vendor/github.com/klauspost/compress/fse/fse.go
index 075357b..535cbad 100644
--- a/vendor/github.com/klauspost/compress/fse/fse.go
+++ b/vendor/github.com/klauspost/compress/fse/fse.go
@@ -44,18 +44,14 @@
 // Scratch provides temporary storage for compression and decompression.
 type Scratch struct {
 	// Private
-	count          [maxSymbolValue + 1]uint32
-	norm           [maxSymbolValue + 1]int16
-	symbolLen      uint16 // Length of active part of the symbol table.
-	actualTableLog uint8  // Selected tablelog.
-	br             byteReader
-	bits           bitReader
-	bw             bitWriter
-	ct             cTable      // Compression tables.
-	decTable       []decSymbol // Decompression table.
-	zeroBits       bool        // no bits has prob > 50%.
-	clearCount     bool        // clear count
-	maxCount       int         // count of the most probable symbol
+	count    [maxSymbolValue + 1]uint32
+	norm     [maxSymbolValue + 1]int16
+	br       byteReader
+	bits     bitReader
+	bw       bitWriter
+	ct       cTable      // Compression tables.
+	decTable []decSymbol // Decompression table.
+	maxCount int         // count of the most probable symbol
 
 	// Per block parameters.
 	// These can be used to override compression parameters of the block.
@@ -68,17 +64,22 @@
 	// and allocation will be avoided.
 	Out []byte
 
-	// MaxSymbolValue will override the maximum symbol value of the next block.
-	MaxSymbolValue uint8
-
-	// TableLog will attempt to override the tablelog for the next block.
-	TableLog uint8
-
 	// DecompressLimit limits the maximum decoded size acceptable.
 	// If > 0 decompression will stop when approximately this many bytes
 	// has been decoded.
 	// If 0, maximum size will be 2GB.
 	DecompressLimit int
+
+	symbolLen      uint16 // Length of active part of the symbol table.
+	actualTableLog uint8  // Selected tablelog.
+	zeroBits       bool   // no bits has prob > 50%.
+	clearCount     bool   // clear count
+
+	// MaxSymbolValue will override the maximum symbol value of the next block.
+	MaxSymbolValue uint8
+
+	// TableLog will attempt to override the tablelog for the next block.
+	TableLog uint8
 }
 
 // Histogram allows to populate the histogram and skip that step in the compression,
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
index 0a8448c..8b6e5c6 100644
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -12,11 +12,11 @@
 
 * [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
 
-THIS PACKAGE IS NOT CONSIDERED STABLE AND API OR ENCODING MAY CHANGE IN THE FUTURE.
-
 ## News
 
- * Mar 2018: First implementation released. Consider this beta software for now.
+This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package.
+
+This ensures that most functionality is well tested.
 
 # Usage
 
@@ -75,6 +75,8 @@
 Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X)
 or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
 
+For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
+
 You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
 your input was likely corrupted.
 
@@ -84,4 +86,4 @@
 # Contributing
 
 Contributions are always welcome. Be aware that adding public functions will require good justification and breaking
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
\ No newline at end of file
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 7d0903c..a4979e8 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -6,6 +6,7 @@
 package huff0
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,29 +35,16 @@
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
 	return nil
 }
 
-// getBits will return n bits. n can be 0.
-func (b *bitReader) getBits(n uint8) uint16 {
-	if n == 0 || b.bitsRead >= 64 {
-		return 0
-	}
-	return b.getBitsFast(n)
-}
-
-// getBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
-}
-
 // peekBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReader) peekBitsFast(n uint8) uint16 {
@@ -71,21 +59,36 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+
+	// 2 bounds checks.
 	v := b.in[b.off-4 : b.off]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 
+func (b *bitReader) advance(n uint8) {
+	b.bitsRead += n
+}
+
+// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -113,3 +116,214 @@
 	}
 	return nil
 }
+
+// bitReaderBytes reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderBytes struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReaderBytes) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+}
+
+// peekByteFast requires that at least one byte is available in the buffer.
+// There are no checks if the buffer is filled.
+func (b *bitReaderBytes) peekByteFast() uint8 {
+	got := uint8(b.value >> 56)
+	return got
+}
+
+func (b *bitReaderBytes) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderBytes) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << (b.bitsRead - 32)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fillFastStart() assumes the bitReaderBytes is empty and there are at least 8 bytes to read.
+func (b *bitReaderBytes) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderBytes) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << (b.bitsRead - 32)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderBytes) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderBytes) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+// bitReaderShifted reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderShifted struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReaderShifted) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+}
+
+// peekBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
+	return uint16(b.value >> ((64 - n) & 63))
+}
+
+func (b *bitReaderShifted) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderShifted) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fillFastStart() assumes the bitReaderShifted is empty and there are at least 8 bytes to read.
+func (b *bitReaderShifted) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderShifted) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderShifted) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderShifted) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index bda4021..6bce4e8 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -43,6 +43,11 @@
 func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
 	enc := ct[symbol]
 	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
+	if false {
+		if enc.nBits == 0 {
+			panic("nbits 0")
+		}
+	}
 	b.nBits += enc.nBits
 }
 
@@ -54,6 +59,14 @@
 	sh := b.nBits & 63
 	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
 	b.bitContainer |= combined << sh
+	if false {
+		if encA.nBits == 0 {
+			panic("nbitsA 0")
+		}
+		if encB.nBits == 0 {
+			panic("nbitsB 0")
+		}
+	}
 	b.nBits += encA.nBits + encB.nBits
 }
 
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 0843cb0..0823c92 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -77,8 +77,11 @@
 		// Each symbol present maximum once or too well distributed.
 		return nil, false, ErrIncompressible
 	}
-
-	if s.Reuse == ReusePolicyPrefer && canReuse {
+	if s.Reuse == ReusePolicyMust && !canReuse {
+		// We must reuse, but we can't.
+		return nil, false, ErrIncompressible
+	}
+	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
 		keepTable := s.cTable
 		keepTL := s.actualTableLog
 		s.cTable = s.prevTable
@@ -90,6 +93,9 @@
 			s.OutData = s.Out
 			return s.Out, true, nil
 		}
+		if s.Reuse == ReusePolicyMust {
+			return nil, false, ErrIncompressible
+		}
 		// Do not attempt to re-use later.
 		s.prevTable = s.prevTable[:0]
 	}
@@ -397,7 +403,7 @@
 	var startNode = int16(s.symbolLen)
 	nonNullRank := s.symbolLen - 1
 
-	nodeNb := int16(startNode)
+	nodeNb := startNode
 	huffNode := s.nodes[1 : huffNodesLen+1]
 
 	// This overlays the slice above, but allows "-1" index lookups.
@@ -530,7 +536,6 @@
 		}
 		nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
 	}
-	return
 }
 
 func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
@@ -574,7 +579,7 @@
 
 		// Get pos of last (smallest) symbol per rank
 		{
-			currentNbBits := uint8(maxNbBits)
+			currentNbBits := maxNbBits
 			for pos := int(n); pos >= 0; pos-- {
 				if huffNode[pos].nbBits >= currentNbBits {
 					continue
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 97ae66a..41703bb 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -25,11 +25,14 @@
 	len   uint8
 }
 
+// Uses special code for all tables that are <= 8 bits.
+const use8BitTables = true
+
 // ReadTable will read a table from the input.
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
 // If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for decoding input using this table.
+// The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	s, err = s.prepare(in)
 	if err != nil {
@@ -55,8 +58,8 @@
 		s.symbolLen = uint16(oSize)
 		in = in[iSize:]
 	} else {
-		if len(in) <= int(iSize) {
-			return s, nil, errors.New("input too small for table")
+		if len(in) < int(iSize) {
+			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
 		}
 		// FSE compressed weights
 		s.fse.DecompressLimit = 255
@@ -83,6 +86,7 @@
 		}
 		v2 := v & 15
 		rankStats[v2]++
+		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
 		weightTotal += (1 << v2) >> 1
 	}
 	if weightTotal == 0 {
@@ -134,20 +138,40 @@
 	if len(s.dt.single) != tSize {
 		s.dt.single = make([]dEntrySingle, tSize)
 	}
+	cTable := s.prevTable
+	if cap(cTable) < maxSymbolValue+1 {
+		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
+	}
+	cTable = cTable[:maxSymbolValue+1]
+	s.prevTable = cTable[:s.symbolLen]
+	s.prevTableLog = s.actualTableLog
+
 	for n, w := range s.huffWeight[:s.symbolLen] {
 		if w == 0 {
+			cTable[n] = cTableEntry{
+				val:   0,
+				nBits: 0,
+			}
 			continue
 		}
 		length := (uint32(1) << w) >> 1
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
 		}
-		single := s.dt.single[rankStats[w] : rankStats[w]+length]
+
+		rank := &rankStats[w]
+		cTable[n] = cTableEntry{
+			val:   uint16(*rank >> (w - 1)),
+			nBits: uint8(d.entry),
+		}
+
+		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
 		}
-		rankStats[w] += length
+		*rank += length
 	}
+
 	return s, in, nil
 }
 
@@ -155,69 +179,14 @@
 // The length of the supplied input must match the end of a block exactly.
 // Before this is called, the table must be initialized with ReadTable unless
 // the encoder re-used the table.
+// Deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
-	if len(s.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
+	if cap(s.Out) < s.MaxDecodedSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
 	}
-	var br bitReader
-	err = br.init(in)
-	if err != nil {
-		return nil, err
-	}
-	s.Out = s.Out[:0]
-
-	decode := func() byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := s.dt.single[val]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-	hasDec := func(v dEntrySingle) byte {
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := s.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
-	var off uint8
-
-	for br.off >= 8 {
-		br.fillFast()
-		tmp[off+0] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+1] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		br.fillFast()
-		tmp[off+2] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		off += 4
-		if off == 0 {
-			if len(s.Out)+256 > s.MaxDecodedSize {
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			s.Out = append(s.Out, tmp...)
-		}
-	}
-
-	if len(s.Out)+int(off) > s.MaxDecodedSize {
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	s.Out = append(s.Out, tmp[:off]...)
-
-	for !br.finished() {
-		br.fill()
-		if len(s.Out) >= s.MaxDecodedSize {
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		s.Out = append(s.Out, decode())
-	}
-	return s.Out, br.close()
+	s.Out = s.Out[:0:s.MaxDecodedSize]
+	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
+	return s.Out, err
 }
 
 // Decompress4X will decompress a 4X encoded stream.
@@ -225,123 +194,402 @@
 // the encoder re-used the table.
 // The length of the supplied input must match the end of a block exactly.
 // The destination size of the uncompressed data must be known and provided.
+// Deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
-	if len(s.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(in) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
 	if dstSize > s.MaxDecodedSize {
 		return nil, ErrMaxDecodedSizeExceeded
 	}
-	// TODO: We do not detect when we overrun a buffer, except if the last one does.
+	if cap(s.Out) < dstSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
+	}
+	s.Out = s.Out[:0:dstSize]
+	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
+	return s.Out, err
+}
 
-	var br [4]bitReader
+// Decoder will return a stateless decoder that can be used by multiple
+// decompressors concurrently.
+// Before this is called, the table must be initialized with ReadTable.
+// The Decoder is still linked to the scratch buffer, so the scratch cannot be reused.
+// However, it is safe to discard the scratch.
+func (s *Scratch) Decoder() *Decoder {
+	return &Decoder{
+		dt:             s.dt,
+		actualTableLog: s.actualTableLog,
+	}
+}
+
+// Decoder provides stateless decoding.
+type Decoder struct {
+	dt             dTable
+	actualTableLog uint8
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress1X8Bit(dst, src)
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	dt := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+
+	for br.off >= 8 {
+		br.fillFast()
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		// Refill
+		br.fillFast()
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if false && br.bitsRead >= 32 {
+			if br.off >= 4 {
+				v := br.in[br.off-4:]
+				v = v[:4]
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+				br.value = (br.value << 32) | uint64(low)
+				br.bitsRead -= 32
+				br.off -= 4
+			} else {
+				for br.off > 0 {
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
+					br.bitsRead -= 8
+					br.off--
+				}
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
+
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress1X8BitExactly(dst, src)
+	}
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+
+	shift := (8 - d.actualTableLog) & 7
+
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
+
+// decompress1X8BitExactly will decompress a 1X encoded stream with a tablelog of exactly 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+
+	const shift = 0
+
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
 	start := 6
 	for i := 0; i < 3; i++ {
-		length := int(in[i*2]) | (int(in[i*2+1]) << 8)
-		if start+length >= len(in) {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
 		}
-		err = br[i].init(in[start : start+length])
+		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		}
 		start += length
 	}
-	err = br[3].init(in[start:])
+	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
 	}
 
-	// Prepare output
-	if cap(s.Out) < dstSize {
-		s.Out = make([]byte, 0, dstSize)
-	}
-	s.Out = s.Out[:dstSize]
 	// destination, offset to match first output
-	dstOut := s.Out
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
 	dstEvery := (dstSize + 3) / 4
 
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
-	single := s.dt.single[:tlSize]
-
-	decode := func(br *bitReader) byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := single[val&tlMask]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
+	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
+	var buf [256]byte
 	var off uint8
 	var decoded int
 
 	// Decode 2 values from each decoder/loop.
 	const bufoff = 256 / 4
-bigloop:
 	for {
-		for i := range br {
-			br := &br[i]
-			if br.off < 4 {
-				break bigloop
-			}
-			br.fillFast()
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
 		}
 
 		{
 			const stream = 0
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
-		}
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
 
-		{
-			const stream = 1
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}
 
 		{
 			const stream = 2
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
-		}
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
 
-		{
-			const stream = 3
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
 
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		}
 
 		off += 2
@@ -350,42 +598,67 @@
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(dstOut, tmp[:bufoff])
-			copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
-			copy(dstOut[dstEvery*2:], tmp[bufoff*2:bufoff*3])
-			copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4])
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
-			dstOut = dstOut[bufoff:]
+			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
-			if len(dstOut) < dstEvery*3 {
+			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
 	}
 	if off > 0 {
 		ioff := int(off)
-		if len(dstOut) < dstEvery*3+ioff {
+		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(dstOut, tmp[:off])
-		copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
-		copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3])
-		copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4])
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
-		dstOut = dstOut[off:]
+		out = out[off:]
 	}
 
 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
-		for !br.finished() {
+		bitsLeft := br.off*8 + uint(64-br.bitsRead)
+		for bitsLeft > 0 {
 			br.fill()
-			if offset >= len(dstOut) {
+			if false && br.bitsRead >= 32 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value = (br.value << 32) | uint64(low)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value = (br.value << 8) | uint64(br.in[br.off-1])
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
-			dstOut[offset] = decode(br)
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
 			offset++
 		}
 		decoded += offset - dstEvery*i
@@ -397,7 +670,426 @@
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
-	return s.Out, nil
+	return dst, nil
+}
+
+// decompress4X8bit will decompress a 4X encoded stream with tablelog <= 8.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress4X8bitExactly(dst, src)
+	}
+
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	shift := (8 - d.actualTableLog) & 7
+
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		off += 4
+
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
+
+// decompress4X8bitExactly will decompress a 4X encoded stream with a tablelog of exactly 8.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const shift = 0
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+
+		off += 4
+
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
 }
 
 // matches will compare a decoding table to a coding table.
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 53249df..7ec2022 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -55,6 +55,9 @@
 	// ReusePolicyNone will disable re-use of tables.
 	// This is slightly faster than ReusePolicyAllow but may produce larger output.
 	ReusePolicyNone
+
+	// ReusePolicyMust requires that the previous table is reused and produces smaller output; otherwise ErrIncompressible is returned.
+	ReusePolicyMust
 )
 
 type Scratch struct {
@@ -79,6 +82,13 @@
 	// Slice of the returned data.
 	OutData []byte
 
+	// MaxDecodedSize will set the maximum allowed output size.
+	// This value will automatically be set to BlockSizeMax if not set.
+	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
+	MaxDecodedSize int
+
+	br byteReader
+
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
 
@@ -95,12 +105,6 @@
 	// If WantLogLess == 0 any improvement will do.
 	WantLogLess uint8
 
-	// MaxDecodedSize will set the maximum allowed output size.
-	// This value will automatically be set to BlockSizeMax if not set.
-	// Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded.
-	MaxDecodedSize int
-
-	br             byteReader
 	symbolLen      uint16 // Length of active part of the symbol table.
 	maxCount       int    // count of the most probable symbol
 	clearCount     bool   // clear count
@@ -115,6 +119,16 @@
 	huffWeight     [maxSymbolValue + 1]byte
 }
 
+// TransferCTable will transfer the previously used compression table.
+func (s *Scratch) TransferCTable(src *Scratch) {
+	if cap(s.prevTable) < len(src.prevTable) {
+		s.prevTable = make(cTable, 0, maxSymbolValue+1)
+	}
+	s.prevTable = s.prevTable[:len(src.prevTable)]
+	copy(s.prevTable, src.prevTable)
+	s.prevTableLog = src.prevTableLog
+}
+
 func (s *Scratch) prepare(in []byte) (*Scratch, error) {
 	if len(in) > BlockSizeMax {
 		return nil, ErrTooBig
diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/klauspost/compress/snappy/.gitignore
deleted file mode 100644
index 042091d..0000000
--- a/vendor/github.com/klauspost/compress/snappy/.gitignore
+++ /dev/null
@@ -1,16 +0,0 @@
-cmd/snappytool/snappytool
-testdata/bench
-
-# These explicitly listed benchmark data files are for an obsolete version of
-# snappy_test.go.
-testdata/alice29.txt
-testdata/asyoulik.txt
-testdata/fireworks.jpeg
-testdata/geo.protodata
-testdata/html
-testdata/html_x_4
-testdata/kppkn.gtb
-testdata/lcet10.txt
-testdata/paper-100k.pdf
-testdata/plrabn12.txt
-testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/klauspost/compress/snappy/AUTHORS
deleted file mode 100644
index bcfa195..0000000
--- a/vendor/github.com/klauspost/compress/snappy/AUTHORS
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is the official list of Snappy-Go authors for copyright purposes.
-# This file is distinct from the CONTRIBUTORS files.
-# See the latter for an explanation.
-
-# Names should be added to this file as
-#	Name or Organization <email address>
-# The email address is not required for organizations.
-
-# Please keep the list sorted.
-
-Damian Gryski <dgryski@gmail.com>
-Google Inc.
-Jan Mercl <0xjnml@gmail.com>
-Rodolfo Carvalho <rhcarvalho@gmail.com>
-Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
deleted file mode 100644
index 931ae31..0000000
--- a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
+++ /dev/null
@@ -1,37 +0,0 @@
-# This is the official list of people who can contribute
-# (and typically have contributed) code to the Snappy-Go repository.
-# The AUTHORS file lists the copyright holders; this file
-# lists people.  For example, Google employees are listed here
-# but not in AUTHORS, because Google holds the copyright.
-#
-# The submission process automatically checks to make sure
-# that people submitting code are listed in this file (by email address).
-#
-# Names should be added to this file only after verifying that
-# the individual or the individual's organization has agreed to
-# the appropriate Contributor License Agreement, found here:
-#
-#     http://code.google.com/legal/individual-cla-v1.0.html
-#     http://code.google.com/legal/corporate-cla-v1.0.html
-#
-# The agreement for individuals can be filled out on the web.
-#
-# When adding J Random Contributor's name to this file,
-# either J's name or J's organization's name should be
-# added to the AUTHORS file, depending on whether the
-# individual or corporate CLA was used.
-
-# Names should be added to this file like so:
-#     Name <email address>
-
-# Please keep the list sorted.
-
-Damian Gryski <dgryski@gmail.com>
-Jan Mercl <0xjnml@gmail.com>
-Kai Backman <kaib@golang.org>
-Marc-Antoine Ruel <maruel@chromium.org>
-Nigel Tao <nigeltao@golang.org>
-Rob Pike <r@golang.org>
-Rodolfo Carvalho <rhcarvalho@gmail.com>
-Russ Cox <rsc@golang.org>
-Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/klauspost/compress/snappy/LICENSE
deleted file mode 100644
index 6050c10..0000000
--- a/vendor/github.com/klauspost/compress/snappy/LICENSE
+++ /dev/null
@@ -1,27 +0,0 @@
-Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/snappy/README b/vendor/github.com/klauspost/compress/snappy/README
deleted file mode 100644
index cea1287..0000000
--- a/vendor/github.com/klauspost/compress/snappy/README
+++ /dev/null
@@ -1,107 +0,0 @@
-The Snappy compression format in the Go programming language.
-
-To download and install from source:
-$ go get github.com/golang/snappy
-
-Unless otherwise noted, the Snappy-Go source files are distributed
-under the BSD-style license found in the LICENSE file.
-
-
-
-Benchmarks.
-
-The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
-or so files, the same set used by the C++ Snappy code (github.com/google/snappy
-and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
-3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
-
-"go test -test.bench=."
-
-_UFlat0-8         2.19GB/s ± 0%  html
-_UFlat1-8         1.41GB/s ± 0%  urls
-_UFlat2-8         23.5GB/s ± 2%  jpg
-_UFlat3-8         1.91GB/s ± 0%  jpg_200
-_UFlat4-8         14.0GB/s ± 1%  pdf
-_UFlat5-8         1.97GB/s ± 0%  html4
-_UFlat6-8          814MB/s ± 0%  txt1
-_UFlat7-8          785MB/s ± 0%  txt2
-_UFlat8-8          857MB/s ± 0%  txt3
-_UFlat9-8          719MB/s ± 1%  txt4
-_UFlat10-8        2.84GB/s ± 0%  pb
-_UFlat11-8        1.05GB/s ± 0%  gaviota
-
-_ZFlat0-8         1.04GB/s ± 0%  html
-_ZFlat1-8          534MB/s ± 0%  urls
-_ZFlat2-8         15.7GB/s ± 1%  jpg
-_ZFlat3-8          740MB/s ± 3%  jpg_200
-_ZFlat4-8         9.20GB/s ± 1%  pdf
-_ZFlat5-8          991MB/s ± 0%  html4
-_ZFlat6-8          379MB/s ± 0%  txt1
-_ZFlat7-8          352MB/s ± 0%  txt2
-_ZFlat8-8          396MB/s ± 1%  txt3
-_ZFlat9-8          327MB/s ± 1%  txt4
-_ZFlat10-8        1.33GB/s ± 1%  pb
-_ZFlat11-8         605MB/s ± 1%  gaviota
-
-
-
-"go test -test.bench=. -tags=noasm"
-
-_UFlat0-8          621MB/s ± 2%  html
-_UFlat1-8          494MB/s ± 1%  urls
-_UFlat2-8         23.2GB/s ± 1%  jpg
-_UFlat3-8         1.12GB/s ± 1%  jpg_200
-_UFlat4-8         4.35GB/s ± 1%  pdf
-_UFlat5-8          609MB/s ± 0%  html4
-_UFlat6-8          296MB/s ± 0%  txt1
-_UFlat7-8          288MB/s ± 0%  txt2
-_UFlat8-8          309MB/s ± 1%  txt3
-_UFlat9-8          280MB/s ± 1%  txt4
-_UFlat10-8         753MB/s ± 0%  pb
-_UFlat11-8         400MB/s ± 0%  gaviota
-
-_ZFlat0-8          409MB/s ± 1%  html
-_ZFlat1-8          250MB/s ± 1%  urls
-_ZFlat2-8         12.3GB/s ± 1%  jpg
-_ZFlat3-8          132MB/s ± 0%  jpg_200
-_ZFlat4-8         2.92GB/s ± 0%  pdf
-_ZFlat5-8          405MB/s ± 1%  html4
-_ZFlat6-8          179MB/s ± 1%  txt1
-_ZFlat7-8          170MB/s ± 1%  txt2
-_ZFlat8-8          189MB/s ± 1%  txt3
-_ZFlat9-8          164MB/s ± 1%  txt4
-_ZFlat10-8         479MB/s ± 1%  pb
-_ZFlat11-8         270MB/s ± 1%  gaviota
-
-
-
-For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
-are the numbers from C++ Snappy's
-
-make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
-
-BM_UFlat/0     2.4GB/s  html
-BM_UFlat/1     1.4GB/s  urls
-BM_UFlat/2    21.8GB/s  jpg
-BM_UFlat/3     1.5GB/s  jpg_200
-BM_UFlat/4    13.3GB/s  pdf
-BM_UFlat/5     2.1GB/s  html4
-BM_UFlat/6     1.0GB/s  txt1
-BM_UFlat/7   959.4MB/s  txt2
-BM_UFlat/8     1.0GB/s  txt3
-BM_UFlat/9   864.5MB/s  txt4
-BM_UFlat/10    2.9GB/s  pb
-BM_UFlat/11    1.2GB/s  gaviota
-
-BM_ZFlat/0   944.3MB/s  html (22.31 %)
-BM_ZFlat/1   501.6MB/s  urls (47.78 %)
-BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
-BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
-BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
-BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
-BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
-BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
-BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
-BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
-BM_ZFlat/10    1.2GB/s  pb (19.68 %)
-BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/klauspost/compress/snappy/decode.go
deleted file mode 100644
index 72efb03..0000000
--- a/vendor/github.com/klauspost/compress/snappy/decode.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snappy
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-var (
-	// ErrCorrupt reports that the input is invalid.
-	ErrCorrupt = errors.New("snappy: corrupt input")
-	// ErrTooLarge reports that the uncompressed length is too large.
-	ErrTooLarge = errors.New("snappy: decoded block is too large")
-	// ErrUnsupported reports that the input isn't supported.
-	ErrUnsupported = errors.New("snappy: unsupported input")
-
-	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
-)
-
-// DecodedLen returns the length of the decoded block.
-func DecodedLen(src []byte) (int, error) {
-	v, _, err := decodedLen(src)
-	return v, err
-}
-
-// decodedLen returns the length of the decoded block and the number of bytes
-// that the length header occupied.
-func decodedLen(src []byte) (blockLen, headerLen int, err error) {
-	v, n := binary.Uvarint(src)
-	if n <= 0 || v > 0xffffffff {
-		return 0, 0, ErrCorrupt
-	}
-
-	const wordSize = 32 << (^uint(0) >> 32 & 1)
-	if wordSize == 32 && v > 0x7fffffff {
-		return 0, 0, ErrTooLarge
-	}
-	return int(v), n, nil
-}
-
-const (
-	decodeErrCodeCorrupt                  = 1
-	decodeErrCodeUnsupportedLiteralLength = 2
-)
-
-// Decode returns the decoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire decoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-func Decode(dst, src []byte) ([]byte, error) {
-	dLen, s, err := decodedLen(src)
-	if err != nil {
-		return nil, err
-	}
-	if dLen <= len(dst) {
-		dst = dst[:dLen]
-	} else {
-		dst = make([]byte, dLen)
-	}
-	switch decode(dst, src[s:]) {
-	case 0:
-		return dst, nil
-	case decodeErrCodeUnsupportedLiteralLength:
-		return nil, errUnsupportedLiteralLength
-	}
-	return nil, ErrCorrupt
-}
-
-// NewReader returns a new Reader that decompresses from r, using the framing
-// format described at
-// https://github.com/google/snappy/blob/master/framing_format.txt
-func NewReader(r io.Reader) *Reader {
-	return &Reader{
-		r:       r,
-		decoded: make([]byte, maxBlockSize),
-		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
-	}
-}
-
-// Reader is an io.Reader that can read Snappy-compressed bytes.
-type Reader struct {
-	r       io.Reader
-	err     error
-	decoded []byte
-	buf     []byte
-	// decoded[i:j] contains decoded bytes that have not yet been passed on.
-	i, j       int
-	readHeader bool
-}
-
-// Reset discards any buffered data, resets all state, and switches the Snappy
-// reader to read from r. This permits reusing a Reader rather than allocating
-// a new one.
-func (r *Reader) Reset(reader io.Reader) {
-	r.r = reader
-	r.err = nil
-	r.i = 0
-	r.j = 0
-	r.readHeader = false
-}
-
-func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
-	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
-		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
-			r.err = ErrCorrupt
-		}
-		return false
-	}
-	return true
-}
-
-// Read satisfies the io.Reader interface.
-func (r *Reader) Read(p []byte) (int, error) {
-	if r.err != nil {
-		return 0, r.err
-	}
-	for {
-		if r.i < r.j {
-			n := copy(p, r.decoded[r.i:r.j])
-			r.i += n
-			return n, nil
-		}
-		if !r.readFull(r.buf[:4], true) {
-			return 0, r.err
-		}
-		chunkType := r.buf[0]
-		if !r.readHeader {
-			if chunkType != chunkTypeStreamIdentifier {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			r.readHeader = true
-		}
-		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
-		if chunkLen > len(r.buf) {
-			r.err = ErrUnsupported
-			return 0, r.err
-		}
-
-		// The chunk types are specified at
-		// https://github.com/google/snappy/blob/master/framing_format.txt
-		switch chunkType {
-		case chunkTypeCompressedData:
-			// Section 4.2. Compressed data (chunk type 0x00).
-			if chunkLen < checksumSize {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			buf := r.buf[:chunkLen]
-			if !r.readFull(buf, false) {
-				return 0, r.err
-			}
-			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			buf = buf[checksumSize:]
-
-			n, err := DecodedLen(buf)
-			if err != nil {
-				r.err = err
-				return 0, r.err
-			}
-			if n > len(r.decoded) {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			if _, err := Decode(r.decoded, buf); err != nil {
-				r.err = err
-				return 0, r.err
-			}
-			if crc(r.decoded[:n]) != checksum {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			r.i, r.j = 0, n
-			continue
-
-		case chunkTypeUncompressedData:
-			// Section 4.3. Uncompressed data (chunk type 0x01).
-			if chunkLen < checksumSize {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			buf := r.buf[:checksumSize]
-			if !r.readFull(buf, false) {
-				return 0, r.err
-			}
-			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
-			// Read directly into r.decoded instead of via r.buf.
-			n := chunkLen - checksumSize
-			if n > len(r.decoded) {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			if !r.readFull(r.decoded[:n], false) {
-				return 0, r.err
-			}
-			if crc(r.decoded[:n]) != checksum {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			r.i, r.j = 0, n
-			continue
-
-		case chunkTypeStreamIdentifier:
-			// Section 4.1. Stream identifier (chunk type 0xff).
-			if chunkLen != len(magicBody) {
-				r.err = ErrCorrupt
-				return 0, r.err
-			}
-			if !r.readFull(r.buf[:len(magicBody)], false) {
-				return 0, r.err
-			}
-			for i := 0; i < len(magicBody); i++ {
-				if r.buf[i] != magicBody[i] {
-					r.err = ErrCorrupt
-					return 0, r.err
-				}
-			}
-			continue
-		}
-
-		if chunkType <= 0x7f {
-			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
-			r.err = ErrUnsupported
-			return 0, r.err
-		}
-		// Section 4.4 Padding (chunk type 0xfe).
-		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
-		if !r.readFull(r.buf[:chunkLen], false) {
-			return 0, r.err
-		}
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go b/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
deleted file mode 100644
index fcd192b..0000000
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-package snappy
-
-// decode has the same semantics as in decode_other.go.
-//
-//go:noescape
-func decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
deleted file mode 100644
index 1c66e37..0000000
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-
-// The asm code generally follows the pure Go code in decode_other.go, except
-// where marked with a "!!!".
-
-// func decode(dst, src []byte) int
-//
-// All local variables fit into registers. The non-zero stack size is only to
-// spill registers and push args when issuing a CALL. The register allocation:
-//	- AX	scratch
-//	- BX	scratch
-//	- CX	length or x
-//	- DX	offset
-//	- SI	&src[s]
-//	- DI	&dst[d]
-//	+ R8	dst_base
-//	+ R9	dst_len
-//	+ R10	dst_base + dst_len
-//	+ R11	src_base
-//	+ R12	src_len
-//	+ R13	src_base + src_len
-//	- R14	used by doCopy
-//	- R15	used by doCopy
-//
-// The registers R8-R13 (marked with a "+") are set at the start of the
-// function, and after a CALL returns, and are not otherwise modified.
-//
-// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
-// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
-TEXT ·decode(SB), NOSPLIT, $48-56
-	// Initialize SI, DI and R8-R13.
-	MOVQ dst_base+0(FP), R8
-	MOVQ dst_len+8(FP), R9
-	MOVQ R8, DI
-	MOVQ R8, R10
-	ADDQ R9, R10
-	MOVQ src_base+24(FP), R11
-	MOVQ src_len+32(FP), R12
-	MOVQ R11, SI
-	MOVQ R11, R13
-	ADDQ R12, R13
-
-loop:
-	// for s < len(src)
-	CMPQ SI, R13
-	JEQ  end
-
-	// CX = uint32(src[s])
-	//
-	// switch src[s] & 0x03
-	MOVBLZX (SI), CX
-	MOVL    CX, BX
-	ANDL    $3, BX
-	CMPL    BX, $1
-	JAE     tagCopy
-
-	// ----------------------------------------
-	// The code below handles literal tags.
-
-	// case tagLiteral:
-	// x := uint32(src[s] >> 2)
-	// switch
-	SHRL $2, CX
-	CMPL CX, $60
-	JAE  tagLit60Plus
-
-	// case x < 60:
-	// s++
-	INCQ SI
-
-doLit:
-	// This is the end of the inner "switch", when we have a literal tag.
-	//
-	// We assume that CX == x and x fits in a uint32, where x is the variable
-	// used in the pure Go decode_other.go code.
-
-	// length = int(x) + 1
-	//
-	// Unlike the pure Go code, we don't need to check if length <= 0 because
-	// CX can hold 64 bits, so the increment cannot overflow.
-	INCQ CX
-
-	// Prepare to check if copying length bytes will run past the end of dst or
-	// src.
-	//
-	// AX = len(dst) - d
-	// BX = len(src) - s
-	MOVQ R10, AX
-	SUBQ DI, AX
-	MOVQ R13, BX
-	SUBQ SI, BX
-
-	// !!! Try a faster technique for short (16 or fewer bytes) copies.
-	//
-	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
-	//   goto callMemmove // Fall back on calling runtime·memmove.
-	// }
-	//
-	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
-	// against 21 instead of 16, because it cannot assume that all of its input
-	// is contiguous in memory and so it needs to leave enough source bytes to
-	// read the next tag without refilling buffers, but Go's Decode assumes
-	// contiguousness (the src argument is a []byte).
-	CMPQ CX, $16
-	JGT  callMemmove
-	CMPQ AX, $16
-	JLT  callMemmove
-	CMPQ BX, $16
-	JLT  callMemmove
-
-	// !!! Implement the copy from src to dst as a 16-byte load and store.
-	// (Decode's documentation says that dst and src must not overlap.)
-	//
-	// This always copies 16 bytes, instead of only length bytes, but that's
-	// OK. If the input is a valid Snappy encoding then subsequent iterations
-	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
-	// non-nil error), so the overrun will be ignored.
-	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
-	// 16-byte loads and stores. This technique probably wouldn't be as
-	// effective on architectures that are fussier about alignment.
-	MOVOU 0(SI), X0
-	MOVOU X0, 0(DI)
-
-	// d += length
-	// s += length
-	ADDQ CX, DI
-	ADDQ CX, SI
-	JMP  loop
-
-callMemmove:
-	// if length > len(dst)-d || length > len(src)-s { etc }
-	CMPQ CX, AX
-	JGT  errCorrupt
-	CMPQ CX, BX
-	JGT  errCorrupt
-
-	// copy(dst[d:], src[s:s+length])
-	//
-	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
-	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
-	// three registers to the stack, to save local variables across the CALL.
-	MOVQ DI, 0(SP)
-	MOVQ SI, 8(SP)
-	MOVQ CX, 16(SP)
-	MOVQ DI, 24(SP)
-	MOVQ SI, 32(SP)
-	MOVQ CX, 40(SP)
-	CALL runtime·memmove(SB)
-
-	// Restore local variables: unspill registers from the stack and
-	// re-calculate R8-R13.
-	MOVQ 24(SP), DI
-	MOVQ 32(SP), SI
-	MOVQ 40(SP), CX
-	MOVQ dst_base+0(FP), R8
-	MOVQ dst_len+8(FP), R9
-	MOVQ R8, R10
-	ADDQ R9, R10
-	MOVQ src_base+24(FP), R11
-	MOVQ src_len+32(FP), R12
-	MOVQ R11, R13
-	ADDQ R12, R13
-
-	// d += length
-	// s += length
-	ADDQ CX, DI
-	ADDQ CX, SI
-	JMP  loop
-
-tagLit60Plus:
-	// !!! This fragment does the
-	//
-	// s += x - 58; if uint(s) > uint(len(src)) { etc }
-	//
-	// checks. In the asm version, we code it once instead of once per switch case.
-	ADDQ CX, SI
-	SUBQ $58, SI
-	CMPQ SI, R13
-	JA   errCorrupt
-
-	// case x == 60:
-	CMPL CX, $61
-	JEQ  tagLit61
-	JA   tagLit62Plus
-
-	// x = uint32(src[s-1])
-	MOVBLZX -1(SI), CX
-	JMP     doLit
-
-tagLit61:
-	// case x == 61:
-	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
-	MOVWLZX -2(SI), CX
-	JMP     doLit
-
-tagLit62Plus:
-	CMPL CX, $62
-	JA   tagLit63
-
-	// case x == 62:
-	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-	MOVWLZX -3(SI), CX
-	MOVBLZX -1(SI), BX
-	SHLL    $16, BX
-	ORL     BX, CX
-	JMP     doLit
-
-tagLit63:
-	// case x == 63:
-	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-	MOVL -4(SI), CX
-	JMP  doLit
-
-// The code above handles literal tags.
-// ----------------------------------------
-// The code below handles copy tags.
-
-tagCopy4:
-	// case tagCopy4:
-	// s += 5
-	ADDQ $5, SI
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
-
-	// length = 1 + int(src[s-5])>>2
-	SHRQ $2, CX
-	INCQ CX
-
-	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-	MOVLQZX -4(SI), DX
-	JMP     doCopy
-
-tagCopy2:
-	// case tagCopy2:
-	// s += 3
-	ADDQ $3, SI
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
-
-	// length = 1 + int(src[s-3])>>2
-	SHRQ $2, CX
-	INCQ CX
-
-	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-	MOVWQZX -2(SI), DX
-	JMP     doCopy
-
-tagCopy:
-	// We have a copy tag. We assume that:
-	//	- BX == src[s] & 0x03
-	//	- CX == src[s]
-	CMPQ BX, $2
-	JEQ  tagCopy2
-	JA   tagCopy4
-
-	// case tagCopy1:
-	// s += 2
-	ADDQ $2, SI
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
-
-	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-	MOVQ    CX, DX
-	ANDQ    $0xe0, DX
-	SHLQ    $3, DX
-	MOVBQZX -1(SI), BX
-	ORQ     BX, DX
-
-	// length = 4 + int(src[s-2])>>2&0x7
-	SHRQ $2, CX
-	ANDQ $7, CX
-	ADDQ $4, CX
-
-doCopy:
-	// This is the end of the outer "switch", when we have a copy tag.
-	//
-	// We assume that:
-	//	- CX == length && CX > 0
-	//	- DX == offset
-
-	// if offset <= 0 { etc }
-	CMPQ DX, $0
-	JLE  errCorrupt
-
-	// if d < offset { etc }
-	MOVQ DI, BX
-	SUBQ R8, BX
-	CMPQ BX, DX
-	JLT  errCorrupt
-
-	// if length > len(dst)-d { etc }
-	MOVQ R10, BX
-	SUBQ DI, BX
-	CMPQ CX, BX
-	JGT  errCorrupt
-
-	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
-	//
-	// Set:
-	//	- R14 = len(dst)-d
-	//	- R15 = &dst[d-offset]
-	MOVQ R10, R14
-	SUBQ DI, R14
-	MOVQ DI, R15
-	SUBQ DX, R15
-
-	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
-	//
-	// First, try using two 8-byte load/stores, similar to the doLit technique
-	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
-	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
-	// and not one 16-byte load/store, and the first store has to be before the
-	// second load, due to the overlap if offset is in the range [8, 16).
-	//
-	// if length > 16 || offset < 8 || len(dst)-d < 16 {
-	//   goto slowForwardCopy
-	// }
-	// copy 16 bytes
-	// d += length
-	CMPQ CX, $16
-	JGT  slowForwardCopy
-	CMPQ DX, $8
-	JLT  slowForwardCopy
-	CMPQ R14, $16
-	JLT  slowForwardCopy
-	MOVQ 0(R15), AX
-	MOVQ AX, 0(DI)
-	MOVQ 8(R15), BX
-	MOVQ BX, 8(DI)
-	ADDQ CX, DI
-	JMP  loop
-
-slowForwardCopy:
-	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
-	// can still try 8-byte load stores, provided we can overrun up to 10 extra
-	// bytes. As above, the overrun will be fixed up by subsequent iterations
-	// of the outermost loop.
-	//
-	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
-	// commentary says:
-	//
-	// ----
-	//
-	// The main part of this loop is a simple copy of eight bytes at a time
-	// until we've copied (at least) the requested amount of bytes.  However,
-	// if d and d-offset are less than eight bytes apart (indicating a
-	// repeating pattern of length < 8), we first need to expand the pattern in
-	// order to get the correct results. For instance, if the buffer looks like
-	// this, with the eight-byte <d-offset> and <d> patterns marked as
-	// intervals:
-	//
-	//    abxxxxxxxxxxxx
-	//    [------]           d-offset
-	//      [------]         d
-	//
-	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
-	// once, after which we can move <d> two bytes without moving <d-offset>:
-	//
-	//    ababxxxxxxxxxx
-	//    [------]           d-offset
-	//        [------]       d
-	//
-	// and repeat the exercise until the two no longer overlap.
-	//
-	// This allows us to do very well in the special case of one single byte
-	// repeated many times, without taking a big hit for more general cases.
-	//
-	// The worst case of extra writing past the end of the match occurs when
-	// offset == 1 and length == 1; the last copy will read from byte positions
-	// [0..7] and write to [4..11], whereas it was only supposed to write to
-	// position 1. Thus, ten excess bytes.
-	//
-	// ----
-	//
-	// That "10 byte overrun" worst case is confirmed by Go's
-	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
-	// and finishSlowForwardCopy algorithm.
-	//
-	// if length > len(dst)-d-10 {
-	//   goto verySlowForwardCopy
-	// }
-	SUBQ $10, R14
-	CMPQ CX, R14
-	JGT  verySlowForwardCopy
-
-makeOffsetAtLeast8:
-	// !!! As above, expand the pattern so that offset >= 8 and we can use
-	// 8-byte load/stores.
-	//
-	// for offset < 8 {
-	//   copy 8 bytes from dst[d-offset:] to dst[d:]
-	//   length -= offset
-	//   d      += offset
-	//   offset += offset
-	//   // The two previous lines together mean that d-offset, and therefore
-	//   // R15, is unchanged.
-	// }
-	CMPQ DX, $8
-	JGE  fixUpSlowForwardCopy
-	MOVQ (R15), BX
-	MOVQ BX, (DI)
-	SUBQ DX, CX
-	ADDQ DX, DI
-	ADDQ DX, DX
-	JMP  makeOffsetAtLeast8
-
-fixUpSlowForwardCopy:
-	// !!! Add length (which might be negative now) to d (implied by DI being
-	// &dst[d]) so that d ends up at the right place when we jump back to the
-	// top of the loop. Before we do that, though, we save DI to AX so that, if
-	// length is positive, copying the remaining length bytes will write to the
-	// right place.
-	MOVQ DI, AX
-	ADDQ CX, DI
-
-finishSlowForwardCopy:
-	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
-	// length means that we overrun, but as above, that will be fixed up by
-	// subsequent iterations of the outermost loop.
-	CMPQ CX, $0
-	JLE  loop
-	MOVQ (R15), BX
-	MOVQ BX, (AX)
-	ADDQ $8, R15
-	ADDQ $8, AX
-	SUBQ $8, CX
-	JMP  finishSlowForwardCopy
-
-verySlowForwardCopy:
-	// verySlowForwardCopy is a simple implementation of forward copy. In C
-	// parlance, this is a do/while loop instead of a while loop, since we know
-	// that length > 0. In Go syntax:
-	//
-	// for {
-	//   dst[d] = dst[d - offset]
-	//   d++
-	//   length--
-	//   if length == 0 {
-	//     break
-	//   }
-	// }
-	MOVB (R15), BX
-	MOVB BX, (DI)
-	INCQ R15
-	INCQ DI
-	DECQ CX
-	JNZ  verySlowForwardCopy
-	JMP  loop
-
-// The code above handles copy tags.
-// ----------------------------------------
-
-end:
-	// This is the end of the "for s < len(src)".
-	//
-	// if d != len(dst) { etc }
-	CMPQ DI, R10
-	JNE  errCorrupt
-
-	// return 0
-	MOVQ $0, ret+48(FP)
-	RET
-
-errCorrupt:
-	// return decodeErrCodeCorrupt
-	MOVQ $1, ret+48(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/klauspost/compress/snappy/decode_other.go
deleted file mode 100644
index 94a96c5..0000000
--- a/vendor/github.com/klauspost/compress/snappy/decode_other.go
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64 appengine !gc noasm
-
-package snappy
-
-// decode writes the decoding of src to dst. It assumes that the varint-encoded
-// length of the decompressed bytes has already been read, and that len(dst)
-// equals that length.
-//
-// It returns 0 on success or a decodeErrCodeXxx error code on failure.
-func decode(dst, src []byte) int {
-	var d, s, offset, length int
-	for s < len(src) {
-		switch src[s] & 0x03 {
-		case tagLiteral:
-			x := uint32(src[s] >> 2)
-			switch {
-			case x < 60:
-				s++
-			case x == 60:
-				s += 2
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-1])
-			case x == 61:
-				s += 3
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-2]) | uint32(src[s-1])<<8
-			case x == 62:
-				s += 4
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-			case x == 63:
-				s += 5
-				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-					return decodeErrCodeCorrupt
-				}
-				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-			}
-			length = int(x) + 1
-			if length <= 0 {
-				return decodeErrCodeUnsupportedLiteralLength
-			}
-			if length > len(dst)-d || length > len(src)-s {
-				return decodeErrCodeCorrupt
-			}
-			copy(dst[d:], src[s:s+length])
-			d += length
-			s += length
-			continue
-
-		case tagCopy1:
-			s += 2
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 4 + int(src[s-2])>>2&0x7
-			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-
-		case tagCopy2:
-			s += 3
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 1 + int(src[s-3])>>2
-			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-
-		case tagCopy4:
-			s += 5
-			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
-				return decodeErrCodeCorrupt
-			}
-			length = 1 + int(src[s-5])>>2
-			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-		}
-
-		if offset <= 0 || d < offset || length > len(dst)-d {
-			return decodeErrCodeCorrupt
-		}
-		// Copy from an earlier sub-slice of dst to a later sub-slice.
-		// If no overlap, use the built-in copy:
-		if offset > length {
-			copy(dst[d:d+length], dst[d-offset:])
-			d += length
-			continue
-		}
-
-		// Unlike the built-in copy function, this byte-by-byte copy always runs
-		// forwards, even if the slices overlap. Conceptually, this is:
-		//
-		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
-		//
-		// We align the slices into a and b and show the compiler they are the same size.
-		// This allows the loop to run without bounds checks.
-		a := dst[d : d+length]
-		b := dst[d-offset:]
-		b = b[:len(a)]
-		for i := range a {
-			a[i] = b[i]
-		}
-		d += length
-	}
-	if d != len(dst) {
-		return decodeErrCodeCorrupt
-	}
-	return 0
-}
diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/klauspost/compress/snappy/encode.go
deleted file mode 100644
index 8d393e9..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode.go
+++ /dev/null
@@ -1,285 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package snappy
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-)
-
-// Encode returns the encoded form of src. The returned slice may be a sub-
-// slice of dst if dst was large enough to hold the entire encoded block.
-// Otherwise, a newly allocated slice will be returned.
-//
-// The dst and src must not overlap. It is valid to pass a nil dst.
-func Encode(dst, src []byte) []byte {
-	if n := MaxEncodedLen(len(src)); n < 0 {
-		panic(ErrTooLarge)
-	} else if len(dst) < n {
-		dst = make([]byte, n)
-	}
-
-	// The block starts with the varint-encoded length of the decompressed bytes.
-	d := binary.PutUvarint(dst, uint64(len(src)))
-
-	for len(src) > 0 {
-		p := src
-		src = nil
-		if len(p) > maxBlockSize {
-			p, src = p[:maxBlockSize], p[maxBlockSize:]
-		}
-		if len(p) < minNonLiteralBlockSize {
-			d += emitLiteral(dst[d:], p)
-		} else {
-			d += encodeBlock(dst[d:], p)
-		}
-	}
-	return dst[:d]
-}
-
-// inputMargin is the minimum number of extra input bytes to keep, inside
-// encodeBlock's inner loop. On some architectures, this margin lets us
-// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
-// literals can be implemented as a single load to and store from a 16-byte
-// register. That literal's actual length can be as short as 1 byte, so this
-// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
-// the encoding loop will fix up the copy overrun, and this inputMargin ensures
-// that we don't overrun the dst and src buffers.
-const inputMargin = 16 - 1
-
-// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
-// could be encoded with a copy tag. This is the minimum with respect to the
-// algorithm used by encodeBlock, not a minimum enforced by the file format.
-//
-// The encoded output must start with at least a 1 byte literal, as there are
-// no previous bytes to copy. A minimal (1 byte) copy after that, generated
-// from an emitCopy call in encodeBlock's main loop, would require at least
-// another inputMargin bytes, for the reason above: we want any emitLiteral
-// calls inside encodeBlock's main loop to use the fast path if possible, which
-// requires being able to overrun by inputMargin bytes. Thus,
-// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
-//
-// The C++ code doesn't use this exact threshold, but it could, as discussed at
-// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
-// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
-// optimization. It should not affect the encoded form. This is tested by
-// TestSameEncodingAsCppShortCopies.
-const minNonLiteralBlockSize = 1 + 1 + inputMargin
-
-// MaxEncodedLen returns the maximum length of a snappy block, given its
-// uncompressed length.
-//
-// It will return a negative value if srcLen is too large to encode.
-func MaxEncodedLen(srcLen int) int {
-	n := uint64(srcLen)
-	if n > 0xffffffff {
-		return -1
-	}
-	// Compressed data can be defined as:
-	//    compressed := item* literal*
-	//    item       := literal* copy
-	//
-	// The trailing literal sequence has a space blowup of at most 62/60
-	// since a literal of length 60 needs one tag byte + one extra byte
-	// for length information.
-	//
-	// Item blowup is trickier to measure. Suppose the "copy" op copies
-	// 4 bytes of data. Because of a special check in the encoding code,
-	// we produce a 4-byte copy only if the offset is < 65536. Therefore
-	// the copy op takes 3 bytes to encode, and this type of item leads
-	// to at most the 62/60 blowup for representing literals.
-	//
-	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
-	// enough, it will take 5 bytes to encode the copy op. Therefore the
-	// worst case here is a one-byte literal followed by a five-byte copy.
-	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
-	//
-	// This last factor dominates the blowup, so the final estimate is:
-	n = 32 + n + n/6
-	if n > 0xffffffff {
-		return -1
-	}
-	return int(n)
-}
-
-var errClosed = errors.New("snappy: Writer is closed")
-
-// NewWriter returns a new Writer that compresses to w.
-//
-// The Writer returned does not buffer writes. There is no need to Flush or
-// Close such a Writer.
-//
-// Deprecated: the Writer returned is not suitable for many small writes, only
-// for few large writes. Use NewBufferedWriter instead, which is efficient
-// regardless of the frequency and shape of the writes, and remember to Close
-// that Writer when done.
-func NewWriter(w io.Writer) *Writer {
-	return &Writer{
-		w:    w,
-		obuf: make([]byte, obufLen),
-	}
-}
-
-// NewBufferedWriter returns a new Writer that compresses to w, using the
-// framing format described at
-// https://github.com/google/snappy/blob/master/framing_format.txt
-//
-// The Writer returned buffers writes. Users must call Close to guarantee all
-// data has been forwarded to the underlying io.Writer. They may also call
-// Flush zero or more times before calling Close.
-func NewBufferedWriter(w io.Writer) *Writer {
-	return &Writer{
-		w:    w,
-		ibuf: make([]byte, 0, maxBlockSize),
-		obuf: make([]byte, obufLen),
-	}
-}
-
-// Writer is an io.Writer that can write Snappy-compressed bytes.
-type Writer struct {
-	w   io.Writer
-	err error
-
-	// ibuf is a buffer for the incoming (uncompressed) bytes.
-	//
-	// Its use is optional. For backwards compatibility, Writers created by the
-	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
-	// therefore do not need to be Flush'ed or Close'd.
-	ibuf []byte
-
-	// obuf is a buffer for the outgoing (compressed) bytes.
-	obuf []byte
-
-	// wroteStreamHeader is whether we have written the stream header.
-	wroteStreamHeader bool
-}
-
-// Reset discards the writer's state and switches the Snappy writer to write to
-// w. This permits reusing a Writer rather than allocating a new one.
-func (w *Writer) Reset(writer io.Writer) {
-	w.w = writer
-	w.err = nil
-	if w.ibuf != nil {
-		w.ibuf = w.ibuf[:0]
-	}
-	w.wroteStreamHeader = false
-}
-
-// Write satisfies the io.Writer interface.
-func (w *Writer) Write(p []byte) (nRet int, errRet error) {
-	if w.ibuf == nil {
-		// Do not buffer incoming bytes. This does not perform or compress well
-		// if the caller of Writer.Write writes many small slices. This
-		// behavior is therefore deprecated, but still supported for backwards
-		// compatibility with code that doesn't explicitly Flush or Close.
-		return w.write(p)
-	}
-
-	// The remainder of this method is based on bufio.Writer.Write from the
-	// standard library.
-
-	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
-		var n int
-		if len(w.ibuf) == 0 {
-			// Large write, empty buffer.
-			// Write directly from p to avoid copy.
-			n, _ = w.write(p)
-		} else {
-			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
-			w.ibuf = w.ibuf[:len(w.ibuf)+n]
-			w.Flush()
-		}
-		nRet += n
-		p = p[n:]
-	}
-	if w.err != nil {
-		return nRet, w.err
-	}
-	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
-	w.ibuf = w.ibuf[:len(w.ibuf)+n]
-	nRet += n
-	return nRet, nil
-}
-
-func (w *Writer) write(p []byte) (nRet int, errRet error) {
-	if w.err != nil {
-		return 0, w.err
-	}
-	for len(p) > 0 {
-		obufStart := len(magicChunk)
-		if !w.wroteStreamHeader {
-			w.wroteStreamHeader = true
-			copy(w.obuf, magicChunk)
-			obufStart = 0
-		}
-
-		var uncompressed []byte
-		if len(p) > maxBlockSize {
-			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
-		} else {
-			uncompressed, p = p, nil
-		}
-		checksum := crc(uncompressed)
-
-		// Compress the buffer, discarding the result if the improvement
-		// isn't at least 12.5%.
-		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
-		chunkType := uint8(chunkTypeCompressedData)
-		chunkLen := 4 + len(compressed)
-		obufEnd := obufHeaderLen + len(compressed)
-		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
-			chunkType = chunkTypeUncompressedData
-			chunkLen = 4 + len(uncompressed)
-			obufEnd = obufHeaderLen
-		}
-
-		// Fill in the per-chunk header that comes before the body.
-		w.obuf[len(magicChunk)+0] = chunkType
-		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
-		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
-		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
-		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
-		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
-		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
-		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
-
-		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
-			w.err = err
-			return nRet, err
-		}
-		if chunkType == chunkTypeUncompressedData {
-			if _, err := w.w.Write(uncompressed); err != nil {
-				w.err = err
-				return nRet, err
-			}
-		}
-		nRet += len(uncompressed)
-	}
-	return nRet, nil
-}
-
-// Flush flushes the Writer to its underlying io.Writer.
-func (w *Writer) Flush() error {
-	if w.err != nil {
-		return w.err
-	}
-	if len(w.ibuf) == 0 {
-		return nil
-	}
-	w.write(w.ibuf)
-	w.ibuf = w.ibuf[:0]
-	return w.err
-}
-
-// Close calls Flush and then closes the Writer.
-func (w *Writer) Close() error {
-	w.Flush()
-	ret := w.err
-	if w.err == nil {
-		w.err = errClosed
-	}
-	return ret
-}
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go b/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
deleted file mode 100644
index 150d91b..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-package snappy
-
-// emitLiteral has the same semantics as in encode_other.go.
-//
-//go:noescape
-func emitLiteral(dst, lit []byte) int
-
-// emitCopy has the same semantics as in encode_other.go.
-//
-//go:noescape
-func emitCopy(dst []byte, offset, length int) int
-
-// extendMatch has the same semantics as in encode_other.go.
-//
-//go:noescape
-func extendMatch(src []byte, i, j int) int
-
-// encodeBlock has the same semantics as in encode_other.go.
-//
-//go:noescape
-func encodeBlock(dst, src []byte) (d int)
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s b/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
deleted file mode 100644
index adfd979..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
+++ /dev/null
@@ -1,730 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-
-// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
-// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
-// https://github.com/golang/snappy/issues/29
-//
-// As a workaround, the package was built with a known good assembler, and
-// those instructions were disassembled by "objdump -d" to yield the
-//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-// style comments, in AT&T asm syntax. Note that rsp here is a physical
-// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
-// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
-// fine on Go 1.6.
-
-// The asm code generally follows the pure Go code in encode_other.go, except
-// where marked with a "!!!".
-
-// ----------------------------------------------------------------------------
-
-// func emitLiteral(dst, lit []byte) int
-//
-// All local variables fit into registers. The register allocation:
-//	- AX	len(lit)
-//	- BX	n
-//	- DX	return value
-//	- DI	&dst[i]
-//	- R10	&lit[0]
-//
-// The 24 bytes of stack space is to call runtime·memmove.
-//
-// The unusual register allocation of local variables, such as R10 for the
-// source pointer, matches the allocation used at the call site in encodeBlock,
-// which makes it easier to manually inline this function.
-TEXT ·emitLiteral(SB), NOSPLIT, $24-56
-	MOVQ dst_base+0(FP), DI
-	MOVQ lit_base+24(FP), R10
-	MOVQ lit_len+32(FP), AX
-	MOVQ AX, DX
-	MOVL AX, BX
-	SUBL $1, BX
-
-	CMPL BX, $60
-	JLT  oneByte
-	CMPL BX, $256
-	JLT  twoBytes
-
-threeBytes:
-	MOVB $0xf4, 0(DI)
-	MOVW BX, 1(DI)
-	ADDQ $3, DI
-	ADDQ $3, DX
-	JMP  memmove
-
-twoBytes:
-	MOVB $0xf0, 0(DI)
-	MOVB BX, 1(DI)
-	ADDQ $2, DI
-	ADDQ $2, DX
-	JMP  memmove
-
-oneByte:
-	SHLB $2, BX
-	MOVB BX, 0(DI)
-	ADDQ $1, DI
-	ADDQ $1, DX
-
-memmove:
-	MOVQ DX, ret+48(FP)
-
-	// copy(dst[i:], lit)
-	//
-	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, R10 and AX as arguments.
-	MOVQ DI, 0(SP)
-	MOVQ R10, 8(SP)
-	MOVQ AX, 16(SP)
-	CALL runtime·memmove(SB)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func emitCopy(dst []byte, offset, length int) int
-//
-// All local variables fit into registers. The register allocation:
-//	- AX	length
-//	- SI	&dst[0]
-//	- DI	&dst[i]
-//	- R11	offset
-//
-// The unusual register allocation of local variables, such as R11 for the
-// offset, matches the allocation used at the call site in encodeBlock, which
-// makes it easier to manually inline this function.
-TEXT ·emitCopy(SB), NOSPLIT, $0-48
-	MOVQ dst_base+0(FP), DI
-	MOVQ DI, SI
-	MOVQ offset+24(FP), R11
-	MOVQ length+32(FP), AX
-
-loop0:
-	// for length >= 68 { etc }
-	CMPL AX, $68
-	JLT  step1
-
-	// Emit a length 64 copy, encoded as 3 bytes.
-	MOVB $0xfe, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $64, AX
-	JMP  loop0
-
-step1:
-	// if length > 64 { etc }
-	CMPL AX, $64
-	JLE  step2
-
-	// Emit a length 60 copy, encoded as 3 bytes.
-	MOVB $0xee, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $60, AX
-
-step2:
-	// if length >= 12 || offset >= 2048 { goto step3 }
-	CMPL AX, $12
-	JGE  step3
-	CMPL R11, $2048
-	JGE  step3
-
-	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB R11, 1(DI)
-	SHRL $8, R11
-	SHLB $5, R11
-	SUBB $4, AX
-	SHLB $2, AX
-	ORB  AX, R11
-	ORB  $1, R11
-	MOVB R11, 0(DI)
-	ADDQ $2, DI
-
-	// Return the number of bytes written.
-	SUBQ SI, DI
-	MOVQ DI, ret+40(FP)
-	RET
-
-step3:
-	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, AX
-	SHLB $2, AX
-	ORB  $2, AX
-	MOVB AX, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-
-	// Return the number of bytes written.
-	SUBQ SI, DI
-	MOVQ DI, ret+40(FP)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func extendMatch(src []byte, i, j int) int
-//
-// All local variables fit into registers. The register allocation:
-//	- DX	&src[0]
-//	- SI	&src[j]
-//	- R13	&src[len(src) - 8]
-//	- R14	&src[len(src)]
-//	- R15	&src[i]
-//
-// The unusual register allocation of local variables, such as R15 for a source
-// pointer, matches the allocation used at the call site in encodeBlock, which
-// makes it easier to manually inline this function.
-TEXT ·extendMatch(SB), NOSPLIT, $0-48
-	MOVQ src_base+0(FP), DX
-	MOVQ src_len+8(FP), R14
-	MOVQ i+24(FP), R15
-	MOVQ j+32(FP), SI
-	ADDQ DX, R14
-	ADDQ DX, R15
-	ADDQ DX, SI
-	MOVQ R14, R13
-	SUBQ $8, R13
-
-cmp8:
-	// As long as we are 8 or more bytes before the end of src, we can load and
-	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ SI, R13
-	JA   cmp1
-	MOVQ (R15), AX
-	MOVQ (SI), BX
-	CMPQ AX, BX
-	JNE  bsf
-	ADDQ $8, R15
-	ADDQ $8, SI
-	JMP  cmp8
-
-bsf:
-	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
-	// the index of the first byte that differs. The BSF instruction finds the
-	// least significant 1 bit, the amd64 architecture is little-endian, and
-	// the shift by 3 converts a bit index to a byte index.
-	XORQ AX, BX
-	BSFQ BX, BX
-	SHRQ $3, BX
-	ADDQ BX, SI
-
-	// Convert from &src[ret] to ret.
-	SUBQ DX, SI
-	MOVQ SI, ret+40(FP)
-	RET
-
-cmp1:
-	// In src's tail, compare 1 byte at a time.
-	CMPQ SI, R14
-	JAE  extendMatchEnd
-	MOVB (R15), AX
-	MOVB (SI), BX
-	CMPB AX, BX
-	JNE  extendMatchEnd
-	ADDQ $1, R15
-	ADDQ $1, SI
-	JMP  cmp1
-
-extendMatchEnd:
-	// Convert from &src[ret] to ret.
-	SUBQ DX, SI
-	MOVQ SI, ret+40(FP)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func encodeBlock(dst, src []byte) (d int)
-//
-// All local variables fit into registers, other than "var table". The register
-// allocation:
-//	- AX	.	.
-//	- BX	.	.
-//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
-//	- DX	64	&src[0], tableSize
-//	- SI	72	&src[s]
-//	- DI	80	&dst[d]
-//	- R9	88	sLimit
-//	- R10	.	&src[nextEmit]
-//	- R11	96	prevHash, currHash, nextHash, offset
-//	- R12	104	&src[base], skip
-//	- R13	.	&src[nextS], &src[len(src) - 8]
-//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
-//	- R15	112	candidate
-//
-// The second column (56, 64, etc) is the stack offset to spill the registers
-// when calling other functions. We could pack this slightly tighter, but it's
-// simpler to have a dedicated spill map independent of the function called.
-//
-// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
-// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
-// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
-TEXT ·encodeBlock(SB), 0, $32888-56
-	MOVQ dst_base+0(FP), DI
-	MOVQ src_base+24(FP), SI
-	MOVQ src_len+32(FP), R14
-
-	// shift, tableSize := uint32(32-8), 1<<8
-	MOVQ $24, CX
-	MOVQ $256, DX
-
-calcShift:
-	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-	//	shift--
-	// }
-	CMPQ DX, $16384
-	JGE  varTable
-	CMPQ DX, R14
-	JGE  varTable
-	SUBQ $1, CX
-	SHLQ $1, DX
-	JMP  calcShift
-
-varTable:
-	// var table [maxTableSize]uint16
-	//
-	// In the asm code, unlike the Go code, we can zero-initialize only the
-	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
-	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
-	// 2048 writes that would zero-initialize all of table's 32768 bytes.
-	SHRQ $3, DX
-	LEAQ table-32768(SP), BX
-	PXOR X0, X0
-
-memclr:
-	MOVOU X0, 0(BX)
-	ADDQ  $16, BX
-	SUBQ  $1, DX
-	JNZ   memclr
-
-	// !!! DX = &src[0]
-	MOVQ SI, DX
-
-	// sLimit := len(src) - inputMargin
-	MOVQ R14, R9
-	SUBQ $15, R9
-
-	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
-	// change for the rest of the function.
-	MOVQ CX, 56(SP)
-	MOVQ DX, 64(SP)
-	MOVQ R9, 88(SP)
-
-	// nextEmit := 0
-	MOVQ DX, R10
-
-	// s := 1
-	ADDQ $1, SI
-
-	// nextHash := hash(load32(src, s), shift)
-	MOVL  0(SI), R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-outer:
-	// for { etc }
-
-	// skip := 32
-	MOVQ $32, R12
-
-	// nextS := s
-	MOVQ SI, R13
-
-	// candidate := 0
-	MOVQ $0, R15
-
-inner0:
-	// for { etc }
-
-	// s := nextS
-	MOVQ R13, SI
-
-	// bytesBetweenHashLookups := skip >> 5
-	MOVQ R12, R14
-	SHRQ $5, R14
-
-	// nextS = s + bytesBetweenHashLookups
-	ADDQ R14, R13
-
-	// skip += bytesBetweenHashLookups
-	ADDQ R14, R12
-
-	// if nextS > sLimit { goto emitRemainder }
-	MOVQ R13, AX
-	SUBQ DX, AX
-	CMPQ AX, R9
-	JA   emitRemainder
-
-	// candidate = int(table[nextHash])
-	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
-	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-	BYTE $0x4e
-	BYTE $0x0f
-	BYTE $0xb7
-	BYTE $0x7c
-	BYTE $0x5c
-	BYTE $0x78
-
-	// table[nextHash] = uint16(s)
-	MOVQ SI, AX
-	SUBQ DX, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// nextHash = hash(load32(src, nextS), shift)
-	MOVL  0(R13), R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// if load32(src, s) != load32(src, candidate) { continue } break
-	MOVL 0(SI), AX
-	MOVL (DX)(R15*1), BX
-	CMPL AX, BX
-	JNE  inner0
-
-fourByteMatch:
-	// As per the encode_other.go code:
-	//
-	// A 4-byte match has been found. We'll later see etc.
-
-	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
-	// on inputMargin in encode.go.
-	MOVQ SI, AX
-	SUBQ R10, AX
-	CMPQ AX, $16
-	JLE  emitLiteralFastPath
-
-	// ----------------------------------------
-	// Begin inline of the emitLiteral call.
-	//
-	// d += emitLiteral(dst[d:], src[nextEmit:s])
-
-	MOVL AX, BX
-	SUBL $1, BX
-
-	CMPL BX, $60
-	JLT  inlineEmitLiteralOneByte
-	CMPL BX, $256
-	JLT  inlineEmitLiteralTwoBytes
-
-inlineEmitLiteralThreeBytes:
-	MOVB $0xf4, 0(DI)
-	MOVW BX, 1(DI)
-	ADDQ $3, DI
-	JMP  inlineEmitLiteralMemmove
-
-inlineEmitLiteralTwoBytes:
-	MOVB $0xf0, 0(DI)
-	MOVB BX, 1(DI)
-	ADDQ $2, DI
-	JMP  inlineEmitLiteralMemmove
-
-inlineEmitLiteralOneByte:
-	SHLB $2, BX
-	MOVB BX, 0(DI)
-	ADDQ $1, DI
-
-inlineEmitLiteralMemmove:
-	// Spill local variables (registers) onto the stack; call; unspill.
-	//
-	// copy(dst[i:], lit)
-	//
-	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, R10 and AX as arguments.
-	MOVQ DI, 0(SP)
-	MOVQ R10, 8(SP)
-	MOVQ AX, 16(SP)
-	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
-	MOVQ SI, 72(SP)
-	MOVQ DI, 80(SP)
-	MOVQ R15, 112(SP)
-	CALL runtime·memmove(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 72(SP), SI
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 112(SP), R15
-	JMP  inner1
-
-inlineEmitLiteralEnd:
-	// End inline of the emitLiteral call.
-	// ----------------------------------------
-
-emitLiteralFastPath:
-	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
-	MOVB AX, BX
-	SUBB $1, BX
-	SHLB $2, BX
-	MOVB BX, (DI)
-	ADDQ $1, DI
-
-	// !!! Implement the copy from lit to dst as a 16-byte load and store.
-	// (Encode's documentation says that dst and src must not overlap.)
-	//
-	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
-	// OK. Subsequent iterations will fix up the overrun.
-	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
-	// 16-byte loads and stores. This technique probably wouldn't be as
-	// effective on architectures that are fussier about alignment.
-	MOVOU 0(R10), X0
-	MOVOU X0, 0(DI)
-	ADDQ  AX, DI
-
-inner1:
-	// for { etc }
-
-	// base := s
-	MOVQ SI, R12
-
-	// !!! offset := base - candidate
-	MOVQ R12, R11
-	SUBQ R15, R11
-	SUBQ DX, R11
-
-	// ----------------------------------------
-	// Begin inline of the extendMatch call.
-	//
-	// s = extendMatch(src, candidate+4, s+4)
-
-	// !!! R14 = &src[len(src)]
-	MOVQ src_len+32(FP), R14
-	ADDQ DX, R14
-
-	// !!! R13 = &src[len(src) - 8]
-	MOVQ R14, R13
-	SUBQ $8, R13
-
-	// !!! R15 = &src[candidate + 4]
-	ADDQ $4, R15
-	ADDQ DX, R15
-
-	// !!! s += 4
-	ADDQ $4, SI
-
-inlineExtendMatchCmp8:
-	// As long as we are 8 or more bytes before the end of src, we can load and
-	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ SI, R13
-	JA   inlineExtendMatchCmp1
-	MOVQ (R15), AX
-	MOVQ (SI), BX
-	CMPQ AX, BX
-	JNE  inlineExtendMatchBSF
-	ADDQ $8, R15
-	ADDQ $8, SI
-	JMP  inlineExtendMatchCmp8
-
-inlineExtendMatchBSF:
-	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
-	// the index of the first byte that differs. The BSF instruction finds the
-	// least significant 1 bit, the amd64 architecture is little-endian, and
-	// the shift by 3 converts a bit index to a byte index.
-	XORQ AX, BX
-	BSFQ BX, BX
-	SHRQ $3, BX
-	ADDQ BX, SI
-	JMP  inlineExtendMatchEnd
-
-inlineExtendMatchCmp1:
-	// In src's tail, compare 1 byte at a time.
-	CMPQ SI, R14
-	JAE  inlineExtendMatchEnd
-	MOVB (R15), AX
-	MOVB (SI), BX
-	CMPB AX, BX
-	JNE  inlineExtendMatchEnd
-	ADDQ $1, R15
-	ADDQ $1, SI
-	JMP  inlineExtendMatchCmp1
-
-inlineExtendMatchEnd:
-	// End inline of the extendMatch call.
-	// ----------------------------------------
-
-	// ----------------------------------------
-	// Begin inline of the emitCopy call.
-	//
-	// d += emitCopy(dst[d:], base-candidate, s-base)
-
-	// !!! length := s - base
-	MOVQ SI, AX
-	SUBQ R12, AX
-
-inlineEmitCopyLoop0:
-	// for length >= 68 { etc }
-	CMPL AX, $68
-	JLT  inlineEmitCopyStep1
-
-	// Emit a length 64 copy, encoded as 3 bytes.
-	MOVB $0xfe, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $64, AX
-	JMP  inlineEmitCopyLoop0
-
-inlineEmitCopyStep1:
-	// if length > 64 { etc }
-	CMPL AX, $64
-	JLE  inlineEmitCopyStep2
-
-	// Emit a length 60 copy, encoded as 3 bytes.
-	MOVB $0xee, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $60, AX
-
-inlineEmitCopyStep2:
-	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
-	CMPL AX, $12
-	JGE  inlineEmitCopyStep3
-	CMPL R11, $2048
-	JGE  inlineEmitCopyStep3
-
-	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB R11, 1(DI)
-	SHRL $8, R11
-	SHLB $5, R11
-	SUBB $4, AX
-	SHLB $2, AX
-	ORB  AX, R11
-	ORB  $1, R11
-	MOVB R11, 0(DI)
-	ADDQ $2, DI
-	JMP  inlineEmitCopyEnd
-
-inlineEmitCopyStep3:
-	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, AX
-	SHLB $2, AX
-	ORB  $2, AX
-	MOVB AX, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-
-inlineEmitCopyEnd:
-	// End inline of the emitCopy call.
-	// ----------------------------------------
-
-	// nextEmit = s
-	MOVQ SI, R10
-
-	// if s >= sLimit { goto emitRemainder }
-	MOVQ SI, AX
-	SUBQ DX, AX
-	CMPQ AX, R9
-	JAE  emitRemainder
-
-	// As per the encode_other.go code:
-	//
-	// We could immediately etc.
-
-	// x := load64(src, s-1)
-	MOVQ -1(SI), R14
-
-	// prevHash := hash(uint32(x>>0), shift)
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// table[prevHash] = uint16(s-1)
-	MOVQ SI, AX
-	SUBQ DX, AX
-	SUBQ $1, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// currHash := hash(uint32(x>>8), shift)
-	SHRQ  $8, R14
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// candidate = int(table[currHash])
-	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
-	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-	BYTE $0x4e
-	BYTE $0x0f
-	BYTE $0xb7
-	BYTE $0x7c
-	BYTE $0x5c
-	BYTE $0x78
-
-	// table[currHash] = uint16(s)
-	ADDQ $1, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// if uint32(x>>8) == load32(src, candidate) { continue }
-	MOVL (DX)(R15*1), BX
-	CMPL R14, BX
-	JEQ  inner1
-
-	// nextHash = hash(uint32(x>>16), shift)
-	SHRQ  $8, R14
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// s++
-	ADDQ $1, SI
-
-	// break out of the inner1 for loop, i.e. continue the outer loop.
-	JMP outer
-
-emitRemainder:
-	// if nextEmit < len(src) { etc }
-	MOVQ src_len+32(FP), AX
-	ADDQ DX, AX
-	CMPQ R10, AX
-	JEQ  encodeBlockEnd
-
-	// d += emitLiteral(dst[d:], src[nextEmit:])
-	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	SUBQ R10, AX
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
-
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	CALL ·emitLiteral(SB)
-	MOVQ 80(SP), DI
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
-
-encodeBlockEnd:
-	MOVQ dst_base+0(FP), AX
-	SUBQ AX, DI
-	MOVQ DI, d+48(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_other.go b/vendor/github.com/klauspost/compress/snappy/encode_other.go
deleted file mode 100644
index dbcae90..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode_other.go
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64 appengine !gc noasm
-
-package snappy
-
-func load32(b []byte, i int) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-//
-// It assumes that:
-//	dst is long enough to hold the encoded bytes
-//	1 <= len(lit) && len(lit) <= 65536
-func emitLiteral(dst, lit []byte) int {
-	i, n := 0, uint(len(lit)-1)
-	switch {
-	case n < 60:
-		dst[0] = uint8(n)<<2 | tagLiteral
-		i = 1
-	case n < 1<<8:
-		dst[0] = 60<<2 | tagLiteral
-		dst[1] = uint8(n)
-		i = 2
-	default:
-		dst[0] = 61<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		i = 3
-	}
-	return i + copy(dst[i:], lit)
-}
-
-// emitCopy writes a copy chunk and returns the number of bytes written.
-//
-// It assumes that:
-//	dst is long enough to hold the encoded bytes
-//	1 <= offset && offset <= 65535
-//	4 <= length && length <= 65535
-func emitCopy(dst []byte, offset, length int) int {
-	i := 0
-	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
-	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
-	// length emitted down below is a little lower (at 60 = 64 - 4), because
-	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
-	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
-	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
-	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
-	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
-	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
-	for length >= 68 {
-		// Emit a length 64 copy, encoded as 3 bytes.
-		dst[i+0] = 63<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 64
-	}
-	if length > 64 {
-		// Emit a length 60 copy, encoded as 3 bytes.
-		dst[i+0] = 59<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= 60
-	}
-	if length >= 12 || offset >= 2048 {
-		// Emit the remaining copy, encoded as 3 bytes.
-		dst[i+0] = uint8(length-1)<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		return i + 3
-	}
-	// Emit the remaining copy, encoded as 2 bytes.
-	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
-	dst[i+1] = uint8(offset)
-	return i + 2
-}
-
-// extendMatch returns the largest k such that k <= len(src) and that
-// src[i:i+k-j] and src[j:k] have the same contents.
-//
-// It assumes that:
-//	0 <= i && i < j && j <= len(src)
-func extendMatch(src []byte, i, j int) int {
-	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
-	}
-	return j
-}
-
-func hash(u, shift uint32) uint32 {
-	return (u * 0x1e35a7bd) >> shift
-}
-
-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
-// assumes that the varint-encoded length of the decompressed bytes has already
-// been written.
-//
-// It also assumes that:
-//	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const (
-		maxTableSize = 1 << 14
-		// tableMask is redundant, but helps the compiler eliminate bounds
-		// checks.
-		tableMask = maxTableSize - 1
-	)
-	shift := uint32(32 - 8)
-	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-		shift--
-	}
-	// In Go, all array elements are zero-initialized, so there is no advantage
-	// to a smaller tableSize per se. However, it matches the C++ algorithm,
-	// and in the asm versions of this code, we can get away with zeroing only
-	// the first tableSize elements.
-	var table [maxTableSize]uint16
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := len(src) - inputMargin
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s), shift)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := 32
-
-		nextS := s
-		candidate := 0
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
-				break
-			}
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		d += emitLiteral(dst[d:], src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-			base := s
-
-			// Extend the 4-byte match as long as possible.
-			//
-			// This is an inlined version of:
-			//	s = extendMatch(src, candidate+4, s+4)
-			s += 4
-			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
-			}
-
-			d += emitCopy(dst[d:], base-candidate, s-base)
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load64(src, s-1)
-			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
-			if uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x>>16), shift)
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if nextEmit < len(src) {
-		d += emitLiteral(dst[d:], src[nextEmit:])
-	}
-	return d
-}
diff --git a/vendor/github.com/klauspost/compress/snappy/runbench.cmd b/vendor/github.com/klauspost/compress/snappy/runbench.cmd
deleted file mode 100644
index d24eb4b..0000000
--- a/vendor/github.com/klauspost/compress/snappy/runbench.cmd
+++ /dev/null
@@ -1,2 +0,0 @@
-del old.txt
-go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt
diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/klauspost/compress/snappy/snappy.go
deleted file mode 100644
index 74a3668..0000000
--- a/vendor/github.com/klauspost/compress/snappy/snappy.go
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright 2011 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package snappy implements the Snappy compression format. It aims for very
-// high speeds and reasonable compression.
-//
-// There are actually two Snappy formats: block and stream. They are related,
-// but different: trying to decompress block-compressed data as a Snappy stream
-// will fail, and vice versa. The block format is the Decode and Encode
-// functions and the stream format is the Reader and Writer types.
-//
-// The block format, the more common case, is used when the complete size (the
-// number of bytes) of the original data is known upfront, at the time
-// compression starts. The stream format, also known as the framing format, is
-// for when that isn't always true.
-//
-// The canonical, C++ implementation is at https://github.com/google/snappy and
-// it only implements the block format.
-package snappy
-
-import (
-	"hash/crc32"
-)
-
-/*
-Each encoded block begins with the varint-encoded length of the decoded data,
-followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
-first byte of each chunk is broken into its 2 least and 6 most significant bits
-called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
-Zero means a literal tag. All other values mean a copy tag.
-
-For literal tags:
-  - If m < 60, the next 1 + m bytes are literal bytes.
-  - Otherwise, let n be the little-endian unsigned integer denoted by the next
-    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
-
-For copy tags, length bytes are copied from offset bytes ago, in the style of
-Lempel-Ziv compression algorithms. In particular:
-  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
-    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
-    of the offset. The next byte is bits 0-7 of the offset.
-  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
-    The length is 1 + m. The offset is the little-endian unsigned integer
-    denoted by the next 2 bytes.
-  - For l == 3, this tag is a legacy format that is no longer issued by most
-    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
-    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
-    integer denoted by the next 4 bytes.
-*/
-const (
-	tagLiteral = 0x00
-	tagCopy1   = 0x01
-	tagCopy2   = 0x02
-	tagCopy4   = 0x03
-)
-
-const (
-	checksumSize    = 4
-	chunkHeaderSize = 4
-	magicChunk      = "\xff\x06\x00\x00" + magicBody
-	magicBody       = "sNaPpY"
-
-	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
-	// part of the wire format per se, but some parts of the encoder assume
-	// that an offset fits into a uint16.
-	//
-	// Also, for the framing format (Writer type instead of Encode function),
-	// https://github.com/google/snappy/blob/master/framing_format.txt says
-	// that "the uncompressed data in a chunk must be no longer than 65536
-	// bytes".
-	maxBlockSize = 65536
-
-	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
-	// hard coded to be a const instead of a variable, so that obufLen can also
-	// be a const. Their equivalence is confirmed by
-	// TestMaxEncodedLenOfMaxBlockSize.
-	maxEncodedLenOfMaxBlockSize = 76490
-
-	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
-	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
-)
-
-const (
-	chunkTypeCompressedData   = 0x00
-	chunkTypeUncompressedData = 0x01
-	chunkTypePadding          = 0xfe
-	chunkTypeStreamIdentifier = 0xff
-)
-
-var crcTable = crc32.MakeTable(crc32.Castagnoli)
-
-// crc implements the checksum specified in section 3 of
-// https://github.com/google/snappy/blob/master/framing_format.txt
-func crc(b []byte) uint32 {
-	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index bc977a3..e7d7eb0 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -5,11 +5,8 @@
 A high performance compression algorithm is implemented. For now focused on speed. 
 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
-Note that custom dictionaries are not supported yet, so if your code relies on that, 
-you cannot use the package as-is.
 
 This package is pure Go and without use of "unsafe". 
-If a significant speedup can be achieved using "unsafe", it may be added as an option later.
 
 The `zstd` package is provided as open source software using a Go standard license.
 
@@ -27,22 +24,21 @@
 ### Status: 
 
 STABLE - there may always be subtle bugs, a wide variety of content has been tested and the library is actively 
-used by several projects. This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
-kindly supplied by [fuzzit.dev](https://fuzzit.dev/).
+used by several projects. This library is being [fuzz-tested](https://github.com/klauspost/compress-fuzz) for all updates.
 
 There may still be specific combinations of data types/size/settings that could lead to edge cases, 
 so as always, testing is recommended.  
 
 For now, a high speed (fastest) and medium-fast (default) compressor has been implemented. 
 
-The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
-The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
+* The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
+* The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
+* The "Better" compression ratio is roughly equivalent to zstd level 7.
+* The "Best" compression ratio is roughly equivalent to zstd level 11.
 
 In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. 
 The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
 
-Compared to cgo zstd, the speed is around level 3 (default), but compression slightly worse, between level 1&2.
-
  
 ### Usage
 
@@ -57,11 +53,11 @@
 ```Go
 // Compress input to output.
 func Compress(in io.Reader, out io.Writer) error {
-    w, err := NewWriter(output)
+    enc, err := zstd.NewWriter(out)
     if err != nil {
         return err
     }
-    _, err := io.Copy(w, input)
+    _, err = io.Copy(enc, in)
     if err != nil {
         enc.Close()
         return err
@@ -142,117 +138,110 @@
 I have collected some speed examples to compare speed and compression against other compressors.
 
 * `file` is the input file.
-* `out` is the compressor used. `zskp` is this package. `gzstd` is gzip standard library. `zstd` is the Datadog cgo library.
-* `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default".
+* `out` is the compressor used. `zskp` is this package. `zstd` is the Datadog cgo library. `gzstd`/`gzkp` are the standard library gzip and this package's gzip.
+* `level` is the compression level used. For `zskp`, level 1 is "fastest", level 2 is "default", level 3 is "better" and level 4 is "best".
 * `insize`/`outsize` is the input/output size.
 * `millis` is the number of milliseconds used for compression.
 * `mb/s` is megabytes (2^20 bytes) per second.
 
 ```
-The test data for the Large Text Compression Benchmark is the first
-10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
-http://mattmahoney.net/dc/textdata.html
+Silesia Corpus:
+http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 
-file    out     level   insize  outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343833033   5840    163.30
-enwik9  zskp    2   1000000000  317822183   8449    112.87
-enwik9  gzstd   1   1000000000  382578136   13627   69.98
-enwik9  gzstd   3   1000000000  349139651   22344   42.68
-enwik9  zstd    1   1000000000  357416379   4838    197.12
-enwik9  zstd    3   1000000000  313734522   7556    126.21
+This package:
+file    out     level   insize      outsize     millis  mb/s
+silesia.tar zskp    1   211947520   73101992    643     313.87
+silesia.tar zskp    2   211947520   67504318    969     208.38
+silesia.tar zskp    3   211947520   64595893    2007    100.68
+silesia.tar zskp    4   211947520   60995370    7691    26.28
+
+cgo zstd:
+silesia.tar zstd    1   211947520   73605392    543     371.56
+silesia.tar zstd    3   211947520   66793289    864     233.68
+silesia.tar zstd    6   211947520   62916450    1913    105.66
+silesia.tar zstd    9   211947520   60212393    5063    39.92
+
+gzip, stdlib/this package:
+silesia.tar gzstd   1   211947520   80007735    1654    122.21
+silesia.tar gzkp    1   211947520   80369488    1168    173.06
 
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
 
-file        out level   insize      outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  234981983   5100    357.42
-gob-stream  zskp    2   1911399616  208674003   6698    272.15
-gob-stream  gzstd   1   1911399616  357382641   14727   123.78
-gob-stream  gzstd   3   1911399616  327835097   17005   107.19
-gob-stream  zstd    1   1911399616  250787165   4075    447.22
-gob-stream  zstd    3   1911399616  208191888   5511    330.77
+file        out     level   insize  outsize     millis  mb/s
+gob-stream  zskp    1   1911399616  235022249   3088    590.30
+gob-stream  zskp    2   1911399616  205669791   3786    481.34
+gob-stream  zskp    3   1911399616  175034659   9636    189.17
+gob-stream  zskp    4   1911399616  167273881   29337   62.13
+gob-stream  zstd    1   1911399616  249810424   2637    691.26
+gob-stream  zstd    3   1911399616  208192146   3490    522.31
+gob-stream  zstd    6   1911399616  193632038   6687    272.56
+gob-stream  zstd    9   1911399616  177620386   16175   112.70
+gob-stream  gzstd   1   1911399616  357382641   10251   177.82
+gob-stream  gzkp    1   1911399616  362156523   5695    320.08
 
-Highly compressible JSON file. Similar to logs in a lot of ways.
-https://files.klauspost.com/compress/adresser.001.gz
+The test data for the Large Text Compression Benchmark is the first
+10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
+http://mattmahoney.net/dc/textdata.html
 
-file            out level   insize      outsize     millis  mb/s
-adresser.001    zskp    1   1073741824  18510122    1477    692.83
-adresser.001    zskp    2   1073741824  19831697    1705    600.59
-adresser.001    gzstd   1   1073741824  47755503    3079    332.47
-adresser.001    gzstd   3   1073741824  40052381    3051    335.63
-adresser.001    zstd    1   1073741824  16135896    994     1030.18
-adresser.001    zstd    3   1073741824  17794465    905     1131.49
+file    out level   insize      outsize     millis  mb/s
+enwik9  zskp    1   1000000000  343848582   3609    264.18
+enwik9  zskp    2   1000000000  317276632   5746    165.97
+enwik9  zskp    3   1000000000  292243069   12162   78.41
+enwik9  zskp    4   1000000000  275241169   36430   26.18
+enwik9  zstd    1   1000000000  358072021   3110    306.65
+enwik9  zstd    3   1000000000  313734672   4784    199.35
+enwik9  zstd    6   1000000000  295138875   10290   92.68
+enwik9  zstd    9   1000000000  278348700   28549   33.40
+enwik9  gzstd   1   1000000000  382578136   9604    99.30
+enwik9  gzkp    1   1000000000  383825945   6544    145.73
+
+Highly compressible JSON file.
+https://files.klauspost.com/compress/github-june-2days-2019.json.zst
+
+file                        out level   insize      outsize     millis  mb/s
+github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
+github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
+github-june-2days-2019.json zskp    3   6273951764  524340691   34043   175.75
+github-june-2days-2019.json zskp    4   6273951764  503314661   93811   63.78
+github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
+github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
+github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
+github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
+github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
+github-june-2days-2019.json gzkp    1   6273951764  1128755542  19236   311.03
 
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
 
-file    out level   insize  outsize millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3648168838  33398   244.38
-rawstudio-mint14.tar    zskp    2   8558382592  3376721436  50962   160.16
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  84712   96.35
-rawstudio-mint14.tar    gzstd   3   8558382592  3740711978  176344  46.28
-rawstudio-mint14.tar    zstd    1   8558382592  3607859742  27903   292.51
-rawstudio-mint14.tar    zstd    3   8558382592  3341710879  46700   174.77
+file                    out level   insize      outsize     millis  mb/s
+rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
+rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
+rawstudio-mint14.tar    zskp    3   8558382592  3158085214  77675   105.08
+rawstudio-mint14.tar    zskp    4   8558382592  3020370044  404956  20.16
+rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
+rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
+rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
+rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
+rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
+rawstudio-mint14.tar    gzkp    1   8558382592  3970463184  41749   195.49
 
+CSV data:
+https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 
-The test data is designed to test archivers in realistic backup scenarios.
-http://mattmahoney.net/dc/10gb.html
-
-file    out level   insize  outsize millis  mb/s
-10gb.tar    zskp    1   10065157632 4883149814  45715   209.97
-10gb.tar    zskp    2   10065157632 4638110010  60970   157.44
-10gb.tar    gzstd   1   10065157632 5198296126  97769   98.18
-10gb.tar    gzstd   3   10065157632 4932665487  313427  30.63
-10gb.tar    zstd    1   10065157632 4940796535  40391   237.65
-10gb.tar    zstd    3   10065157632 4638618579  52911   181.42
-
-Silesia Corpus:
-http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
-
-file    out level   insize  outsize millis  mb/s
-silesia.tar zskp    1   211947520   73025800    1108    182.26
-silesia.tar zskp    2   211947520   67674684    1599    126.41
-silesia.tar gzstd   1   211947520   80007735    2515    80.37
-silesia.tar gzstd   3   211947520   73133380    4259    47.45
-silesia.tar zstd    1   211947520   73513991    933     216.64
-silesia.tar zstd    3   211947520   66793301    1377    146.79
+file                    out level   insize      outsize     millis  mb/s
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
+nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
+nyc-taxi-data-10M.csv   zskp    3   3325605752  530289687   25239   125.66
+nyc-taxi-data-10M.csv   zskp    4   3325605752  490907191   65939   48.10
+nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
+nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
+nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
+nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  924718719   16388   193.53
 ```
 
-### Converters
-
-As part of the development process a *Snappy* -> *Zstandard* converter was also built.
-
-This can convert a *framed* [Snappy Stream](https://godoc.org/github.com/golang/snappy#Writer) to a zstd stream. 
-Note that a single block is not framed.
-
-Conversion is done by converting the stream directly from Snappy without intermediate full decoding.
-Therefore the compression ratio is much less than what can be done by a full decompression
-and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
-any errors being generated.
-No CRC value is being generated and not all CRC values of the Snappy stream are checked.
-However, it provides really fast re-compression of Snappy streams.
-
-
-```
-BenchmarkSnappy_ConvertSilesia-8           1  1156001600 ns/op   183.35 MB/s
-Snappy len 103008711 -> zstd len 82687318
-
-BenchmarkSnappy_Enwik9-8           1  6472998400 ns/op   154.49 MB/s
-Snappy len 508028601 -> zstd len 390921079
-```
-
-
-```Go
-    s := zstd.SnappyConverter{}
-    n, err = s.Convert(input, output)
-    if err != nil {
-        fmt.Println("Re-compressed stream to", n, "bytes")
-    }
-```
-
-The converter `s` can be reused to avoid allocations, even after errors.
-
-
 ## Decompressor
 
 Status: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
@@ -273,14 +262,14 @@
 import "github.com/klauspost/compress/zstd"
 
 func Decompress(in io.Reader, out io.Writer) error {
-    d, err := zstd.NewReader(input)
+    d, err := zstd.NewReader(in)
     if err != nil {
         return err
     }
     defer d.Close()
     
     // Copy content...
-    _, err := io.Copy(out, d)
+    _, err = io.Copy(out, d)
     return err
 }
 ```
@@ -309,6 +298,35 @@
 It will only allow a certain number of concurrent operations to run. 
 To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
 
+### Dictionaries
+
+Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
+
+Dictionaries are added individually to Decoders.
+Dictionaries are generated by the `zstd --train` command and contain an initial state for the decoder.
+To add a dictionary use the `WithDecoderDicts(dicts ...[]byte)` option with the dictionary data.
+Several dictionaries can be added at once.
+
+A dictionary will be used automatically for any data that specifies it.
+A re-used Decoder will still have its registered dictionaries available.
+
+When registering multiple dictionaries with the same ID, the last one will be used.
+
+It is possible to use dictionaries when compressing data.
+
+To enable a dictionary use `WithEncoderDict(dict []byte)`. Only one dictionary will be used,
+and it will likely be applied even if it doesn't improve compression.
+
+The same dictionary must then be used to decompress the content.
+
+For any real gains, the dictionary should be built with similar data. 
+If an unsuitable dictionary is used the output may be slightly larger than using no dictionary.
+Use the [zstd commandline tool](https://github.com/facebook/zstd/releases) to build a dictionary from sample data.
+For information see [zstd dictionary information](https://github.com/facebook/zstd#the-case-for-small-data-compression). 
+
+For now there is a fixed startup performance penalty for compressing content with dictionaries.
+This will likely be improved over time, so be sure to test performance when implementing.
+
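+A rough sketch, assuming `dict` holds the raw bytes of a dictionary built with `zstd --train`
+(variable names are illustrative only):
+
+```Go
+// Decoder that knows the dictionary; several dictionaries can be passed at once.
+dec, err := zstd.NewReader(nil, zstd.WithDecoderDicts(dict))
+if err != nil {
+    return err
+}
+defer dec.Close()
+
+// Encoder using the same dictionary for compression.
+enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict))
+if err != nil {
+    return err
+}
+defer enc.Close()
+```
+
+Frames produced by `enc` can then be decompressed by `dec`, which selects the dictionary by the ID stored in the frame.
+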
 ### Allocation-less operation
 
 The decoder has been designed to operate without allocations after a warmup. 
@@ -350,36 +368,42 @@
 The first two are streaming decodes and the last are smaller inputs. 
  
 ```
-BenchmarkDecoderSilesia-8             20       642550210 ns/op   329.85 MB/s      3101 B/op        8 allocs/op
-BenchmarkDecoderSilesiaCgo-8         100       384930000 ns/op   550.61 MB/s    451878 B/op     9713 allocs/op
+BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
+BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
 
-BenchmarkDecoderEnwik9-2              10        3146000080 ns/op         317.86 MB/s        2649 B/op          9 allocs/op
-BenchmarkDecoderEnwik9Cgo-2           20        1905900000 ns/op         524.69 MB/s     1125120 B/op      45785 allocs/op
+BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
+BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
 
-BenchmarkDecoder_DecodeAll/z000000.zst-8               200     7049994 ns/op   138.26 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000001.zst-8            100000       19560 ns/op    97.49 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000002.zst-8              5000      297599 ns/op   236.99 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000003.zst-8              2000      725502 ns/op   141.17 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000004.zst-8            200000        9314 ns/op    54.54 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000005.zst-8             10000      137500 ns/op   104.72 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000006.zst-8               500     2316009 ns/op   206.06 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000007.zst-8             20000       64499 ns/op   344.90 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000008.zst-8             50000       24900 ns/op   219.56 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000009.zst-8              1000     2348999 ns/op   154.01 MB/s        40 B/op        2 allocs/op
+Concurrent performance:
 
-BenchmarkDecoder_DecodeAllCgo/z000000.zst-8            500     4268005 ns/op   228.38 MB/s   1228849 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000001.zst-8         100000       15250 ns/op   125.05 MB/s      2096 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000002.zst-8          10000      147399 ns/op   478.49 MB/s     73776 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000003.zst-8           5000      320798 ns/op   319.27 MB/s    139312 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000004.zst-8         200000       10004 ns/op    50.77 MB/s       560 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000005.zst-8          20000       73599 ns/op   195.64 MB/s     19120 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000006.zst-8           1000     1119003 ns/op   426.48 MB/s    557104 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000007.zst-8          20000      103450 ns/op   215.04 MB/s     71296 B/op        9 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000008.zst-8         100000       20130 ns/op   271.58 MB/s      6192 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000009.zst-8           2000     1123500 ns/op   322.00 MB/s    368688 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
+
+BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
 ```
 
-This reflects the performance around May 2019, but this may be out of date.
+This reflects the performance around May 2020, but this may be out of date.
 
 # Contributions
 
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 15d79d4..8544585 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -5,6 +5,7 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"io"
 	"math/bits"
@@ -34,8 +35,12 @@
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,21 +68,31 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 
+// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+}
+
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off >= 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index ed670bc..b51d922 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -75,21 +75,29 @@
 
 	// Window size of the block.
 	WindowSize uint64
-	Type       blockType
-	RLESize    uint32
+
+	history     chan *history
+	input       chan struct{}
+	result      chan decodeOutput
+	sequenceBuf []seq
+	err         error
+	decWG       sync.WaitGroup
+
+	// Frame to use for singlethreaded decoding.
+	// Should not be used by the decoder itself since parent may be another frame.
+	localFrame *frameDec
+
+	// If the block is RLE, this is the size.
+	RLESize uint32
+	tmp     [4]byte
+
+	Type blockType
 
 	// Is this the last block of a frame?
 	Last bool
 
 	// Use less memory
-	lowMem      bool
-	history     chan *history
-	input       chan struct{}
-	result      chan decodeOutput
-	sequenceBuf []seq
-	tmp         [4]byte
-	err         error
-	decWG       sync.WaitGroup
+	lowMem bool
 }
 
 func (b *blockDec) String() string {
@@ -127,25 +135,37 @@
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
+	maxSize := maxBlockSize
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
 		b.RLESize = uint32(cSize)
+		if b.lowMem {
+			maxSize = cSize
+		}
 		cSize = 1
 	case blockTypeCompressed:
 		if debug {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
+		maxSize = maxCompressedBlockSize
+		if windowSize < maxCompressedBlockSize && b.lowMem {
+			maxSize = int(windowSize)
+		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debug {
 				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
 			}
 			return ErrCompressedSizeTooBig
 		}
-	default:
+	case blockTypeRaw:
 		b.RLESize = 0
+		// We do not need a destination for raw blocks.
+		maxSize = -1
+	default:
+		panic("Invalid block type")
 	}
 
 	// Read block data.
@@ -156,8 +176,8 @@
 			b.dataStorage = make([]byte, 0, maxBlockSize)
 		}
 	}
-	if cap(b.dst) <= maxBlockSize {
-		b.dst = make([]byte, 0, maxBlockSize+1)
+	if cap(b.dst) <= maxSize {
+		b.dst = make([]byte, 0, maxSize+1)
 	}
 	var err error
 	b.data, err = br.readBig(cSize, b.dataStorage)
@@ -445,26 +465,22 @@
 		if huff == nil {
 			huff = &huff0.Scratch{}
 		}
-		huff.Out = b.literalBuf[:0]
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
 			return err
 		}
 		// Use our out buffer.
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
 			return err
 		}
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if len(literals) != litRegenSize {
 			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
@@ -597,7 +613,7 @@
 	// Decode treeless literal block.
 	if litType == literalsBlockTreeless {
 		// TODO: We could send the history early WITHOUT the stream history.
-		//   This would allow decoding treeless literials before the byte history is available.
+		//   This would allow decoding treeless literals before the byte history is available.
 		//   Silesia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
 		//   So not much obvious gain here.
 
@@ -615,15 +631,12 @@
 		var err error
 		// Use our out buffer.
 		huff = hist.huffTree
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if err != nil {
 			println("decompressing literals:", err)
 			return err
@@ -633,12 +646,13 @@
 		}
 	} else {
 		if hist.huffTree != nil && huff != nil {
-			huffDecoderPool.Put(hist.huffTree)
+			if hist.dict == nil || hist.dict.litEnc != hist.huffTree {
+				huffDecoderPool.Put(hist.huffTree)
+			}
 			hist.huffTree = nil
 		}
 	}
 	if huff != nil {
-		huff.Out = nil
 		hist.huffTree = huff
 	}
 	if debug {
@@ -671,12 +685,21 @@
 	//   If only recent offsets were not transferred, this would be an obvious win.
 	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
 
+	hbytes := hist.b
+	if len(hbytes) > hist.windowSize {
+		hbytes = hbytes[len(hbytes)-hist.windowSize:]
+		// We do not need history any more.
+		if hist.dict != nil {
+			hist.dict.content = nil
+		}
+	}
+
 	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
 		println("initializing sequences:", err)
 		return err
 	}
 
-	err = seqs.decode(nSeqs, br, hist.b)
+	err = seqs.decode(nSeqs, br, hbytes)
 	if err != nil {
 		return err
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index 507757d..e1be092 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -14,35 +14,52 @@
 )
 
 type blockEnc struct {
-	size      int
-	literals  []byte
-	sequences []seq
-	coders    seqCoders
-	litEnc    *huff0.Scratch
-	wr        bitWriter
+	size       int
+	literals   []byte
+	sequences  []seq
+	coders     seqCoders
+	litEnc     *huff0.Scratch
+	dictLitEnc *huff0.Scratch
+	wr         bitWriter
 
-	extraLits int
-	last      bool
-
+	extraLits         int
 	output            []byte
 	recentOffsets     [3]uint32
 	prevRecentOffsets [3]uint32
+
+	last   bool
+	lowMem bool
 }
 
 // init should be used once the block has been created.
 // If called more than once, the effect is the same as calling reset.
 func (b *blockEnc) init() {
-	if cap(b.literals) < maxCompressedLiteralSize {
-		b.literals = make([]byte, 0, maxCompressedLiteralSize)
+	if b.lowMem {
+		// 1K literals
+		if cap(b.literals) < 1<<10 {
+			b.literals = make([]byte, 0, 1<<10)
+		}
+		const defSeqs = 20
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		// 1K
+		if cap(b.output) < 1<<10 {
+			b.output = make([]byte, 0, 1<<10)
+		}
+	} else {
+		if cap(b.literals) < maxCompressedBlockSize {
+			b.literals = make([]byte, 0, maxCompressedBlockSize)
+		}
+		const defSeqs = 200
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		if cap(b.output) < maxCompressedBlockSize {
+			b.output = make([]byte, 0, maxCompressedBlockSize)
+		}
 	}
-	const defSeqs = 200
-	b.literals = b.literals[:0]
-	if cap(b.sequences) < defSeqs {
-		b.sequences = make([]seq, 0, defSeqs)
-	}
-	if cap(b.output) < maxCompressedBlockSize {
-		b.output = make([]byte, 0, maxCompressedBlockSize)
-	}
+
 	if b.coders.mlEnc == nil {
 		b.coders.mlEnc = &fseEncoder{}
 		b.coders.mlPrev = &fseEncoder{}
@@ -75,6 +92,7 @@
 	if prev != nil {
 		b.recentOffsets = prev.prevRecentOffsets
 	}
+	b.dictLitEnc = nil
 }
 
 // reset will reset the block for a new encode, but in the same stream,
@@ -295,7 +313,7 @@
 	b.output = bh.appendTo(b.output[:0])
 	b.output = append(b.output, a...)
 	if debug {
-		println("Adding RAW block, length", len(a))
+		println("Adding RAW block, length", len(a), "last:", b.last)
 	}
 }
 
@@ -308,25 +326,25 @@
 	dst = bh.appendTo(dst)
 	dst = append(dst, src...)
 	if debug {
-		println("Adding RAW block, length", len(src))
+		println("Adding RAW block, length", len(src), "last:", b.last)
 	}
 	return dst
 }
 
 // encodeLits can be used if the block is only litLen.
-func (b *blockEnc) encodeLits(raw bool) error {
+func (b *blockEnc) encodeLits(lits []byte, raw bool) error {
 	var bh blockHeader
 	bh.setLast(b.last)
-	bh.setSize(uint32(len(b.literals)))
+	bh.setSize(uint32(len(lits)))
 
 	// Don't compress extremely small blocks
-	if len(b.literals) < 32 || raw {
+	if len(lits) < 8 || (len(lits) < 32 && b.dictLitEnc == nil) || raw {
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(lits), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals...)
+		b.output = append(b.output, lits...)
 		return nil
 	}
 
@@ -335,13 +353,18 @@
 		reUsed, single bool
 		err            error
 	)
-	if len(b.literals) >= 1024 {
+	if b.dictLitEnc != nil {
+		b.litEnc.TransferCTable(b.dictLitEnc)
+		b.litEnc.Reuse = huff0.ReusePolicyAllow
+		b.dictLitEnc = nil
+	}
+	if len(lits) >= 1024 {
 		// Use 4 Streams.
-		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
-	} else if len(b.literals) > 32 {
+		out, reUsed, err = huff0.Compress4X(lits, b.litEnc)
+	} else if len(lits) > 32 {
 		// Use 1 stream
 		single = true
-		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+		out, reUsed, err = huff0.Compress1X(lits, b.litEnc)
 	} else {
 		err = huff0.ErrIncompressible
 	}
@@ -349,23 +372,23 @@
 	switch err {
 	case huff0.ErrIncompressible:
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(lits), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals...)
+		b.output = append(b.output, lits...)
 		return nil
 	case huff0.ErrUseRLE:
 		if debug {
-			println("Adding RLE block, length", len(b.literals))
+			println("Adding RLE block, length", len(lits))
 		}
 		bh.setType(blockTypeRLE)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals[0])
+		b.output = append(b.output, lits[0])
 		return nil
+	case nil:
 	default:
 		return err
-	case nil:
 	}
 	// Compressed...
 	// Now, allow reuse
@@ -384,7 +407,7 @@
 		lh.setType(literalsBlockCompressed)
 	}
 	// Set sizes
-	lh.setSizes(len(out), len(b.literals), single)
+	lh.setSizes(len(out), len(lits), single)
 	bh.setSize(uint32(len(out) + lh.size() + 1))
 
 	// Write block headers.
@@ -444,13 +467,19 @@
 }
 
 // encode will encode the block and append the output in b.output.
-func (b *blockEnc) encode(raw bool) error {
+// Previous offset codes must be pushed if more blocks are expected.
+func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
 	if len(b.sequences) == 0 {
-		return b.encodeLits(raw)
+		return b.encodeLits(b.literals, rawAllLits)
 	}
-	// We want some difference
-	if len(b.literals) > (b.size - (b.size >> 5)) {
-		return errIncompressible
+	// We want some difference to at least account for the headers.
+	saved := b.size - len(b.literals) - (b.size >> 5)
+	if saved < 16 {
+		if org == nil {
+			return errIncompressible
+		}
+		b.popOffsets()
+		return b.encodeLits(org, rawAllLits)
 	}
 
 	var bh blockHeader
@@ -466,6 +495,11 @@
 		reUsed, single bool
 		err            error
 	)
+	if b.dictLitEnc != nil {
+		b.litEnc.TransferCTable(b.dictLitEnc)
+		b.litEnc.Reuse = huff0.ReusePolicyAllow
+		b.dictLitEnc = nil
+	}
 	if len(b.literals) >= 1024 && !raw {
 		// Use 4 Streams.
 		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
@@ -494,11 +528,6 @@
 		if debug {
 			println("Adding literals RLE")
 		}
-	default:
-		if debug {
-			println("Adding literals ERROR:", err)
-		}
-		return err
 	case nil:
 		// Compressed litLen...
 		if reUsed {
@@ -529,6 +558,11 @@
 		if debug {
 			println("Adding literals compressed")
 		}
+	default:
+		if debug {
+			println("Adding literals ERROR:", err)
+		}
+		return err
 	}
 	// Sequence compression
 
@@ -806,7 +840,7 @@
 		mlH[v]++
 		if v > mlMax {
 			mlMax = v
-			if debug && mlMax > maxMatchLengthSymbol {
+			if debugAsserts && mlMax > maxMatchLengthSymbol {
 				panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
 			}
 		}
@@ -821,13 +855,13 @@
 		}
 		return int(max)
 	}
-	if mlMax > maxMatchLengthSymbol {
+	if debugAsserts && mlMax > maxMatchLengthSymbol {
 		panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax))
 	}
-	if ofMax > maxOffsetBits {
+	if debugAsserts && ofMax > maxOffsetBits {
 		panic(fmt.Errorf("ofMax > maxOffsetBits (%d)", ofMax))
 	}
-	if llMax > maxLiteralLengthSymbol {
+	if debugAsserts && llMax > maxLiteralLengthSymbol {
 		panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax))
 	}
 
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 07321ac..658ef78 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -30,7 +30,7 @@
 type byteBuf []byte
 
 func (b *byteBuf) readSmall(n int) []byte {
-	if debug && n > 8 {
+	if debugAsserts && n > 8 {
 		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
 	}
 	bb := *b
@@ -82,7 +82,7 @@
 }
 
 func (r *readerWrapper) readSmall(n int) []byte {
-	if debug && n > 8 {
+	if debugAsserts && n > 8 {
 		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
 	}
 	n2, err := io.ReadFull(r.r, r.tmp[:n])
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index dc4378b..2c4fca1 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -31,7 +31,8 @@
 
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
@@ -55,7 +56,20 @@
 		}
 		return v
 	}
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// Uint32NC returns a little endian uint32 starting at current offset.
+// The caller must be sure that there are at least 4 bytes left.
+func (b byteReader) Uint32NC() uint32 {
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
new file mode 100644
index 0000000..69736e8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -0,0 +1,202 @@
+// Copyright 2020+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"bytes"
+	"errors"
+	"io"
+)
+
+// HeaderMaxSize is the maximum size of a Frame and Block Header.
+// If less is sent to Header.Decode it *may* still contain enough information.
+const HeaderMaxSize = 14 + 3
+
+// Header contains information about the first frame and block within that.
+type Header struct {
+	// WindowSize is the window of data to keep while decoding.
+	// Will only be set if HasFCS is false.
+	WindowSize uint64
+
+	// Frame content size.
+	// Expected size of the entire frame.
+	FrameContentSize uint64
+
+	// Dictionary ID.
+	// If 0, no dictionary.
+	DictionaryID uint32
+
+	// First block information.
+	FirstBlock struct {
+		// OK will be set if first block could be decoded.
+		OK bool
+
+		// Is this the last block of a frame?
+		Last bool
+
+		// Is the data compressed?
+		// If true CompressedSize will be populated.
+		// Unfortunately DecompressedSize cannot be determined
+		// without decoding the blocks.
+		Compressed bool
+
+		// DecompressedSize is the expected decompressed size of the block.
+		// Will be 0 if it cannot be determined.
+		DecompressedSize int
+
+		// CompressedSize of the data in the block.
+		// Does not include the block header.
+		// Will be equal to DecompressedSize if not Compressed.
+		CompressedSize int
+	}
+
+	// Skippable will be true if the frame is meant to be skipped.
+	// No other information will be populated.
+	Skippable bool
+
+	// If set there is a checksum present for the block content.
+	HasCheckSum bool
+
+	// If this is true FrameContentSize will have a valid value
+	HasFCS bool
+
+	SingleSegment bool
+}
+
+// Decode the header from the beginning of the stream.
+// This will decode the frame header and the first block header if enough bytes are provided.
+// It is recommended to provide at least HeaderMaxSize bytes.
+// If the frame header cannot be read an error will be returned.
+// If there isn't enough input, io.ErrUnexpectedEOF is returned.
+// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
+func (h *Header) Decode(in []byte) error {
+	if len(in) < 4 {
+		return io.ErrUnexpectedEOF
+	}
+	b, in := in[:4], in[4:]
+	if !bytes.Equal(b, frameMagic) {
+		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
+			return ErrMagicMismatch
+		}
+		*h = Header{Skippable: true}
+		return nil
+	}
+	if len(in) < 1 {
+		return io.ErrUnexpectedEOF
+	}
+
+	// Clear output
+	*h = Header{}
+	fhd, in := in[0], in[1:]
+	h.SingleSegment = fhd&(1<<5) != 0
+	h.HasCheckSum = fhd&(1<<2) != 0
+
+	if fhd&(1<<3) != 0 {
+		return errors.New("reserved bit set on frame header")
+	}
+
+	// Read Window_Descriptor
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+	if !h.SingleSegment {
+		if len(in) < 1 {
+			return io.ErrUnexpectedEOF
+		}
+		var wd byte
+		wd, in = in[0], in[1:]
+		windowLog := 10 + (wd >> 3)
+		windowBase := uint64(1) << windowLog
+		windowAdd := (windowBase / 8) * uint64(wd&0x7)
+		h.WindowSize = windowBase + windowAdd
+	}
+
+	// Read Dictionary_ID
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
+	if size := fhd & 3; size != 0 {
+		if size == 3 {
+			size = 4
+		}
+		if len(in) < int(size) {
+			return io.ErrUnexpectedEOF
+		}
+		b, in = in[:size], in[size:]
+		if b == nil {
+			return io.ErrUnexpectedEOF
+		}
+		switch size {
+		case 1:
+			h.DictionaryID = uint32(b[0])
+		case 2:
+			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+		case 4:
+			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+		}
+	}
+
+	// Read Frame_Content_Size
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
+	var fcsSize int
+	v := fhd >> 6
+	switch v {
+	case 0:
+		if h.SingleSegment {
+			fcsSize = 1
+		}
+	default:
+		fcsSize = 1 << v
+	}
+
+	if fcsSize > 0 {
+		h.HasFCS = true
+		if len(in) < fcsSize {
+			return io.ErrUnexpectedEOF
+		}
+		b, in = in[:fcsSize], in[fcsSize:]
+		if b == nil {
+			return io.ErrUnexpectedEOF
+		}
+		switch fcsSize {
+		case 1:
+			h.FrameContentSize = uint64(b[0])
+		case 2:
+			// When FCS_Field_Size is 2, the offset of 256 is added.
+			h.FrameContentSize = (uint64(b[0]) | (uint64(b[1]) << 8)) + 256
+		case 4:
+			h.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24)
+		case 8:
+			d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
+			h.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
+		}
+	}
+
+	// Frame Header done, we will not fail from now on.
+	if len(in) < 3 {
+		return nil
+	}
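+	// The next 3 bytes are the little-endian block header:
+	// bit 0 is Last_Block, bits 1-2 are Block_Type, and the remaining 21 bits are Block_Size.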
+	tmp := in[:3]
+	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
+	h.FirstBlock.Last = bh&1 != 0
+	blockType := blockType((bh >> 1) & 3)
+	// find size.
+	cSize := int(bh >> 3)
+	switch blockType {
+	case blockTypeReserved:
+		return nil
+	case blockTypeRLE:
+		h.FirstBlock.Compressed = true
+		h.FirstBlock.DecompressedSize = cSize
+		h.FirstBlock.CompressedSize = 1
+	case blockTypeCompressed:
+		h.FirstBlock.Compressed = true
+		h.FirstBlock.CompressedSize = cSize
+	case blockTypeRaw:
+		h.FirstBlock.DecompressedSize = cSize
+		h.FirstBlock.CompressedSize = cSize
+	default:
+		panic("Invalid block type")
+	}
+
+	h.FirstBlock.OK = true
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 35a3cda..f593e46 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,7 +5,6 @@
 package zstd
 
 import (
-	"bytes"
 	"errors"
 	"io"
 	"sync"
@@ -23,17 +22,15 @@
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 
-	// Unreferenced decoders, ready for use.
-	frames chan *frameDec
-
 	// Streams ready to be decoded.
 	stream chan decodeStream
 
 	// Current read position used for Reader functionality.
 	current decoderState
 
-	// Custom dictionaries
-	dicts map[uint32]struct{}
+	// Custom dictionaries.
+	// Always uses copies.
+	dicts map[uint32]dict
 
 	// streamWg is the waitgroup for all streams
 	streamWg sync.WaitGroup
@@ -66,7 +63,7 @@
 // A Decoder can be used in two modes:
 //
 // 1) As a stream, or
-// 2) For stateless decoding using DecodeAll or DecodeBuffer.
+// 2) For stateless decoding using DecodeAll.
 //
 // Only a single stream can be decoded concurrently, but the same decoder
 // can run multiple concurrent stateless decodes. It is even possible to
@@ -87,12 +84,23 @@
 	d.current.output = make(chan decodeOutput, d.o.concurrent)
 	d.current.flushed = true
 
+	if r == nil {
+		d.current.err = ErrDecoderNilInput
+	}
+
+	// Transfer option dicts.
+	d.dicts = make(map[uint32]dict, len(d.o.dicts))
+	for _, dc := range d.o.dicts {
+		d.dicts[dc.id] = dc
+	}
+	d.o.dicts = nil
+
 	// Create decoders
 	d.decoders = make(chan *blockDec, d.o.concurrent)
-	d.frames = make(chan *frameDec, d.o.concurrent)
 	for i := 0; i < d.o.concurrent; i++ {
-		d.frames <- newFrameDec(d.o)
-		d.decoders <- newBlockDec(d.o.lowMem)
+		dec := newBlockDec(d.o.lowMem)
+		dec.localFrame = newFrameDec(d.o)
+		d.decoders <- dec
 	}
 
 	if r == nil {
@@ -106,7 +114,7 @@
 // When the stream is done, io.EOF will be returned.
 func (d *Decoder) Read(p []byte) (int, error) {
 	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
+		return 0, ErrDecoderNilInput
 	}
 	var n int
 	for {
@@ -147,12 +155,20 @@
 
 // Reset will reset the decoder to the supplied stream after the current has finished processing.
 // Note that this functionality cannot be used after Close has been called.
+// Reset can be called with a nil reader to release references to the previous reader.
+// After being called with a nil reader, only Reset, DecodeAll and Close
+// should be used.
 func (d *Decoder) Reset(r io.Reader) error {
 	if d.current.err == ErrDecoderClosed {
 		return d.current.err
 	}
+
+	d.drainOutput()
+
 	if r == nil {
-		return errors.New("nil Reader sent as input")
+		d.current.err = ErrDecoderNilInput
+		d.current.flushed = true
+		return nil
 	}
 
 	if d.stream == nil {
@@ -161,15 +177,19 @@
 		go d.startStreamDecoder(d.stream)
 	}
 
-	d.drainOutput()
-
 	// If bytes buffer and < 1MB, do sync decoding anyway.
-	if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 {
+	if bb, ok := r.(byter); ok && bb.Len() < 1<<20 {
+		bb2 := bb
 		if debug {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		}
-		b := bb.Bytes()
-		dst, err := d.DecodeAll(b, nil)
+		b := bb2.Bytes()
+		var dst []byte
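+		// Reuse the previous output buffer if it has any capacity.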
+		if cap(d.current.b) > 0 {
+			dst = d.current.b
+		}
+
+		dst, err := d.DecodeAll(b, dst[:0])
 		if err == nil {
 			err = io.EOF
 		}
@@ -177,7 +197,7 @@
 		d.current.err = err
 		d.current.flushed = true
 		if debug {
-			println("sync decode to ", len(dst), "bytes, err:", err)
+			println("sync decode to", len(dst), "bytes, err:", err)
 		}
 		return nil
 	}
@@ -216,20 +236,17 @@
 		println("current already flushed")
 		return
 	}
-	for {
-		select {
-		case v := <-d.current.output:
-			if v.d != nil {
-				if debug {
-					printf("re-adding decoder %p", v.d)
-				}
-				d.decoders <- v.d
+	for v := range d.current.output {
+		if v.d != nil {
+			if debug {
+				printf("re-adding decoder %p", v.d)
 			}
-			if v.err == errEndOfStream {
-				println("current flushed")
-				d.current.flushed = true
-				return
-			}
+			d.decoders <- v.d
+		}
+		if v.err == errEndOfStream {
+			println("current flushed")
+			d.current.flushed = true
+			return
 		}
 	}
 }
@@ -239,7 +256,7 @@
 // Any error encountered during the write is also returned.
 func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
 	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
+		return 0, ErrDecoderNilInput
 	}
 	var n int64
 	for {
@@ -277,23 +294,34 @@
 	}
 
 	// Grab a block decoder and frame decoder.
-	block, frame := <-d.decoders, <-d.frames
+	block := <-d.decoders
+	frame := block.localFrame
 	defer func() {
 		if debug {
 			printf("re-adding decoder: %p", block)
 		}
-		d.decoders <- block
 		frame.rawInput = nil
 		frame.bBuf = nil
-		d.frames <- frame
+		d.decoders <- block
 	}()
 	frame.bBuf = input
 
 	for {
+		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
 		if err == io.EOF {
+			if debug {
+				println("frame reset return EOF")
+			}
 			return dst, nil
 		}
+		if frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				return nil, ErrUnknownDictionary
+			}
+			frame.history.setDict(&dict)
+		}
 		if err != nil {
 			return dst, err
 		}
@@ -302,20 +330,24 @@
 		}
 		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
 			// Never preallocate more than 1 GB up front.
-			if uint64(cap(dst)) < frame.FrameContentSize {
+			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
 				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
 		if cap(dst) == 0 {
-			// Allocate window size * 2 by default if nothing is provided and we didn't get frame content size.
-			size := frame.WindowSize * 2
+			// Allocate len(input) * 2 by default if nothing is provided
+			// and we didn't get frame content size.
+			size := len(input) * 2
 			// Cap to 1 MB.
 			if size > 1<<20 {
 				size = 1 << 20
 			}
-			dst = make([]byte, 0, frame.WindowSize)
+			if uint64(size) > d.o.maxDecodedSize {
+				size = int(d.o.maxDecodedSize)
+			}
+			dst = make([]byte, 0, size)
 		}
 
 		dst, err = frame.runDecoder(dst, block)
@@ -323,6 +355,9 @@
 			return dst, err
 		}
 		if len(frame.bBuf) == 0 {
+			if debug {
+				println("frame dbuf empty")
+			}
 			break
 		}
 	}
@@ -456,10 +491,19 @@
 		br := readerWrapper{r: stream.r}
 	decodeStream:
 		for {
+			frame.history.reset()
 			err := frame.reset(&br)
 			if debug && err != nil {
 				println("Frame decoder returned", err)
 			}
+			if err == nil && frame.DictionaryID != nil {
+				dict, ok := d.dicts[*frame.DictionaryID]
+				if !ok {
+					err = ErrUnknownDictionary
+				} else {
+					frame.history.setDict(&dict)
+				}
+			}
 			if err != nil {
 				stream.output <- decodeOutput{
 					err: err,
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 2ac9cd2..c0fd058 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -6,7 +6,6 @@
 
 import (
 	"errors"
-	"fmt"
 	"runtime"
 )
 
@@ -18,6 +17,7 @@
 	lowMem         bool
 	concurrent     int
 	maxDecodedSize uint64
+	dicts          []dict
 }
 
 func (o *decoderOptions) setDefault() {
@@ -42,7 +42,7 @@
 func WithDecoderConcurrency(n int) DOption {
 	return func(o *decoderOptions) error {
 		if n <= 0 {
-			return fmt.Errorf("Concurrency must be at least 1")
+			return errors.New("concurrency must be at least 1")
 		}
 		o.concurrent = n
 		return nil
@@ -60,9 +60,24 @@
 			return errors.New("WithDecoderMaxMemory must be at least 1")
 		}
 		if n > 1<<63 {
-			return fmt.Errorf("WithDecoderMaxmemory must be less than 1 << 63")
+			return errors.New("WithDecoderMaxMemory must be less than 1 << 63")
 		}
 		o.maxDecodedSize = n
 		return nil
 	}
 }
+
+// WithDecoderDicts allows registering one or more dictionaries for the decoder.
+// If several dictionaries with the same ID are provided, the last one will be used.
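+//
+// Dictionaries are registered when the decoder is created, for example:
+//
+//	zstd.NewReader(r, zstd.WithDecoderDicts(dictBytes))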
+func WithDecoderDicts(dicts ...[]byte) DOption {
+	return func(o *decoderOptions) error {
+		for _, b := range dicts {
+			d, err := loadDict(b)
+			if err != nil {
+				return err
+			}
+			o.dicts = append(o.dicts, *d)
+		}
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
new file mode 100644
index 0000000..fa25a18
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -0,0 +1,122 @@
+package zstd
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/klauspost/compress/huff0"
+)
+
+type dict struct {
+	id uint32
+
+	litEnc              *huff0.Scratch
+	llDec, ofDec, mlDec sequenceDec
+	//llEnc, ofEnc, mlEnc []*fseEncoder
+	offsets [3]int
+	content []byte
+}
+
+var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+
+// ID returns the dictionary id or 0 if d is nil.
+func (d *dict) ID() uint32 {
+	if d == nil {
+		return 0
+	}
+	return d.id
+}
+
+// DictContentSize returns the dictionary content size or 0 if d is nil.
+func (d *dict) DictContentSize() int {
+	if d == nil {
+		return 0
+	}
+	return len(d.content)
+}
+
+// Load a dictionary as described in
+// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+func loadDict(b []byte) (*dict, error) {
+	// Check static field size.
+	if len(b) <= 8+(3*4) {
+		return nil, io.ErrUnexpectedEOF
+	}
+	d := dict{
+		llDec: sequenceDec{fse: &fseDecoder{}},
+		ofDec: sequenceDec{fse: &fseDecoder{}},
+		mlDec: sequenceDec{fse: &fseDecoder{}},
+	}
+	if !bytes.Equal(b[:4], dictMagic[:]) {
+		return nil, ErrMagicMismatch
+	}
+	d.id = binary.LittleEndian.Uint32(b[4:8])
+	if d.id == 0 {
+		return nil, errors.New("dictionaries cannot have ID 0")
+	}
+
+	// Read literal table
+	var err error
+	d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
+	if err != nil {
+		return nil, err
+	}
+	d.litEnc.Reuse = huff0.ReusePolicyMust
+
+	br := byteReader{
+		b:   b,
+		off: 0,
+	}
+	readDec := func(i tableIndex, dec *fseDecoder) error {
+		if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
+			return err
+		}
+		if br.overread() {
+			return io.ErrUnexpectedEOF
+		}
+		err = dec.transform(symbolTableX[i])
+		if err != nil {
+			println("Transform table error:", err)
+			return err
+		}
+		if debug {
+			println("Read table ok", "symbolLen:", dec.symbolLen)
+		}
+		// Set decoders as predefined so they aren't reused.
+		dec.preDefined = true
+		return nil
+	}
+
+	if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
+		return nil, err
+	}
+	if br.remain() < 12 {
+		return nil, io.ErrUnexpectedEOF
+	}
+
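+	// The entropy tables are followed by three 4-byte recent repeat offsets;
+	// everything after those is the raw dictionary content.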
+	d.offsets[0] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[1] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[2] = int(br.Uint32())
+	br.advance(4)
+	if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
+		return nil, errors.New("invalid offset in dictionary")
+	}
+	d.content = make([]byte, br.remain())
+	copy(d.content, br.unread())
+	if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
+		return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
+	}
+
+	return &d, nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
new file mode 100644
index 0000000..60f2986
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -0,0 +1,178 @@
+package zstd
+
+import (
+	"fmt"
+	"math/bits"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+const (
+	dictShardBits = 6
+)
+
+type fastBase struct {
+	// cur is the offset at the start of hist
+	cur int32
+	// maximum offset. Should be at least 2x block size.
+	maxMatchOff int32
+	hist        []byte
+	crc         *xxhash.Digest
+	tmp         [8]byte
+	blk         *blockEnc
+	lastDictID  uint32
+	lowMem      bool
+}
+
+// CRC returns the underlying CRC writer.
+func (e *fastBase) CRC() *xxhash.Digest {
+	return e.crc
+}
+
+// AppendCRC will append the CRC to the destination slice and return it.
+func (e *fastBase) AppendCRC(dst []byte) []byte {
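+	// The content checksum is the lower 32 bits of the xxhash64 digest, stored
+	// little endian; Sum appends the digest big endian, hence the byte reversal.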
+	crc := e.crc.Sum(e.tmp[:0])
+	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
+	return dst
+}
+
+// WindowSize returns the window size of the encoder,
+// or a smaller window large enough to contain the input, if size > 0.
+func (e *fastBase) WindowSize(size int) int32 {
+	if size > 0 && size < int(e.maxMatchOff) {
+		b := int32(1) << uint(bits.Len(uint(size)))
+		// Keep minimum window.
+		if b < 1024 {
+			b = 1024
+		}
+		return b
+	}
+	return e.maxMatchOff
+}
+
+// Block returns the current block.
+func (e *fastBase) Block() *blockEnc {
+	return e.blk
+}
+
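+// addBlock appends src to the history buffer, sliding the window down first
+// if there is not enough room, and returns the offset of src within e.hist.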
+func (e *fastBase) addBlock(src []byte) int32 {
+	if debugAsserts && e.cur > bufferReset {
+		panic(fmt.Sprintf("e.cur (%d) > buffer reset (%d)", e.cur, bufferReset))
+	}
+	// check if we have space already
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.ensureHist(len(src))
+		} else {
+			if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) {
+				panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff))
+			}
+			// Move down
+			offset := int32(len(e.hist)) - e.maxMatchOff
+			copy(e.hist[0:e.maxMatchOff], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:e.maxMatchOff]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// ensureHist will ensure that history can keep at least this many bytes.
+func (e *fastBase) ensureHist(n int) {
+	if cap(e.hist) >= n {
+		return
+	}
+	l := e.maxMatchOff
+	if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize {
+		l += maxCompressedBlockSize
+	} else {
+		l += e.maxMatchOff
+	}
+	// Make it at least 1MB.
+	if l < 1<<20 && !e.lowMem {
+		l = 1 << 20
+	}
+	// Make it at least the requested size.
+	if l < int32(n) {
+		l = int32(n)
+	}
+	e.hist = make([]byte, 0, l)
+}
+
+// UseBlock will replace the block with the provided one,
+// but transfer recent offsets from the previous.
+func (e *fastBase) UseBlock(enc *blockEnc) {
+	enc.reset(e.blk)
+	e.blk = enc
+}
+
+func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
+	if debugAsserts {
+		if s < 0 {
+			err := fmt.Sprintf("s (%d) < 0", s)
+			panic(err)
+		}
+		if t < 0 {
+			err := fmt.Sprintf("t (%d) < 0", t)
+			panic(err)
+		}
+		if s-t > e.maxMatchOff {
+			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
+			panic(err)
+		}
+		if len(src)-int(s) > maxCompressedBlockSize {
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
+		}
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+// Reset the encoding table.
+func (e *fastBase) resetBase(d *dict, singleBlock bool) {
+	if e.blk == nil {
+		e.blk = &blockEnc{lowMem: e.lowMem}
+		e.blk.init()
+	} else {
+		e.blk.reset(nil)
+	}
+	e.blk.initNewEncode()
+	if e.crc == nil {
+		e.crc = xxhash.New()
+	} else {
+		e.crc.Reset()
+	}
+	if d != nil {
+		low := e.lowMem
+		if singleBlock {
+			e.lowMem = true
+		}
+		e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
+		e.lowMem = low
+	}
+
+	// We offset current position so everything will be out of reach.
+	// If above reset line, history will be purged.
+	if e.cur < bufferReset {
+		e.cur += e.maxMatchOff + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+	if d != nil {
+		// Set offsets (currently not used)
+		for i, off := range d.offsets {
+			e.blk.recentOffsets[i] = uint32(off)
+			e.blk.prevRecentOffsets[i] = e.blk.recentOffsets[i]
+		}
+		// Transfer litenc.
+		e.blk.dictLitEnc = d.litEnc
+		e.hist = append(e.hist, d.content...)
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
new file mode 100644
index 0000000..dc1eed5
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -0,0 +1,501 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+const (
+	bestLongTableBits = 20                     // Bits used in the long match table
+	bestLongTableSize = 1 << bestLongTableBits // Size of the table
+
+	// Note: Increasing the short table bits or making the hash shorter
+	// can actually lead to compression degradation since it will 'steal' more from the
+	// long match table and match offsets are quite big.
+	// This greatly depends on the type of input.
+	bestShortTableBits = 16                      // Bits used in the short match table
+	bestShortTableSize = 1 << bestShortTableBits // Size of the table
+)
+
+// bestFastEncoder uses 2 tables, one for short matches (4 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type bestFastEncoder struct {
+	fastBase
+	table         [bestShortTableSize]prevEntry
+	longTable     [bestLongTableSize]prevEntry
+	dictTable     []prevEntry
+	dictLongTable []prevEntry
+}
+
+// Encode encodes src into blk, searching more exhaustively for matches than the faster encoders.
+func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (4)
+		inputMargin            = 8 + 4
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = prevEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			v2 := e.table[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.table[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	const kSearchStrength = 10
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+	offset3 := int32(blk.recentOffsets[2])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	_ = addLiterals
+
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		if debugAsserts && canRepeat && offset1 == 0 {
+			panic("offset0 was 0")
+		}
+
+		type match struct {
+			offset int32
+			s      int32
+			length int32
+			rep    int32
+		}
+		matchAt := func(offset int32, s int32, first uint32, rep int32) match {
+			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
+				return match{offset: offset, s: s}
+			}
+			return match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+		}
+
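+		// bestOf prefers the candidate that gains the most bytes relative to
+		// where it starts, with a small penalty on non-repeat matches based on
+		// the number of bits needed to encode their offset.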
+		bestOf := func(a, b match) match {
+			aScore := b.s - a.s + a.length
+			bScore := a.s - b.s + b.length
+			if a.rep < 0 {
+				aScore = aScore - int32(bits.Len32(uint32(a.offset)))/8
+			}
+			if b.rep < 0 {
+				bScore = bScore - int32(bits.Len32(uint32(b.offset)))/8
+			}
+			if aScore >= bScore {
+				return a
+			}
+			return b
+		}
+		const goodEnough = 100
+
+		nextHashL := hash8(cv, bestLongTableBits)
+		nextHashS := hash4x64(cv, bestShortTableBits)
+		candidateL := e.longTable[nextHashL]
+		candidateS := e.table[nextHashS]
+
+		best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
+		best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+		best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+		if canRepeat && best.length < goodEnough {
+			best = bestOf(best, matchAt(s-offset1+1, s+1, uint32(cv>>8), 1))
+			best = bestOf(best, matchAt(s-offset2+1, s+1, uint32(cv>>8), 2))
+			best = bestOf(best, matchAt(s-offset3+1, s+1, uint32(cv>>8), 3))
+			if best.length > 0 {
+				best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1))
+				best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2))
+				best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3))
+			}
+		}
+		// Load next and check...
+		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
+		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
+
+		// Look far ahead, unless we have a really long match already...
+		if best.length < goodEnough {
+			// No match found, move forward on input, no need to check forward...
+			if best.length < 4 {
+				s += 1 + (s-nextEmit)>>(kSearchStrength-1)
+				if s >= sLimit {
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+
+			s++
+			candidateS = e.table[hash4x64(cv>>8, bestShortTableBits)]
+			cv = load6432(src, s)
+			cv2 := load6432(src, s+1)
+			candidateL = e.longTable[hash8(cv, bestLongTableBits)]
+			candidateL2 := e.longTable[hash8(cv2, bestLongTableBits)]
+
+			best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+			best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
+			best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
+			best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
+			best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+
+			// See if we can find a better match by checking where the current best ends.
+			// Use that offset to see if we can find a better full match.
+			if sAt := best.s + best.length; sAt < sLimit {
+				nextHashL := hash8(load6432(src, sAt), bestLongTableBits)
+				candidateEnd := e.longTable[nextHashL]
+				if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
+					bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
+					if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
+						bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
+					}
+					best = bestEnd
+				}
+			}
+		}
+
+		// We have a match, we can store the forward value
+		if best.rep > 0 {
+			s = best.s
+			var seq seq
+			seq.matchLen = uint32(best.length - zstdMinMatch)
+
+			// We might be able to match backwards.
+			// Extend as long as we can.
+			start := best.s
+			// We end the search early, so we don't risk 0 literals
+			// and have to do special offset treatment.
+			startLimit := nextEmit + 1
+
+			tMin := s - e.maxMatchOff
+			if tMin < 0 {
+				tMin = 0
+			}
+			repIndex := best.offset
+			for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+				repIndex--
+				start--
+				seq.matchLen++
+			}
+			addLiterals(&seq, start)
+
+			// rep 0
+			seq.offset = uint32(best.rep)
+			if debugSequences {
+				println("repeat sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Index match start+1 (long) -> s - 1
+			index0 := s
+			s = best.s + best.length
+
+			nextEmit = s
+			if s >= sLimit {
+				if debug {
+					println("repeat ended", s, best.length)
+
+				}
+				break encodeLoop
+			}
+			// Index skipped...
+			off := index0 + e.cur
+			for index0 < s-1 {
+				cv0 := load6432(src, index0)
+				h0 := hash8(cv0, bestLongTableBits)
+				h1 := hash4x64(cv0, bestShortTableBits)
+				e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+				e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
+				off++
+				index0++
+			}
+			switch best.rep {
+			case 2:
+				offset1, offset2 = offset2, offset1
+			case 3:
+				offset1, offset2, offset3 = offset3, offset1, offset2
+			}
+			cv = load6432(src, s)
+			continue
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		s = best.s
+		t := best.offset
+		offset1, offset2, offset3 = s-t, offset1, offset2
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := best.length
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		// every entry
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			h0 := hash8(cv0, bestLongTableBits)
+			h1 := hash4x64(cv0, bestShortTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
+			index0++
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash4x64(cv, bestShortTableBits)
+			nextHashL := hash8(cv, bestLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	blk.recentOffsets[2] = uint32(offset3)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// The most notable difference is that src will not be copied for history and
+// we do not need to check for max match length.
+func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
+	e.Encode(blk, src)
+}
+
+// Reset will reset the encoder and set a dictionary if not nil.
+func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
+	}
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]prevEntry, len(e.table))
+		}
+		end := int32(len(d.content)) - 8 + e.maxMatchOff
+		for i := e.maxMatchOff; i < end; i += 4 {
+			const hashLog = bestShortTableBits
+
+			cv := load6432(d.content, i-e.maxMatchOff)
+			nextHash := hash4x64(cv, hashLog)      // 0 -> 4
+			nextHash1 := hash4x64(cv>>8, hashLog)  // 1 -> 5
+			nextHash2 := hash4x64(cv>>16, hashLog) // 2 -> 6
+			nextHash3 := hash4x64(cv>>24, hashLog) // 3 -> 7
+			e.dictTable[nextHash] = prevEntry{
+				prev:   e.dictTable[nextHash].offset,
+				offset: i,
+			}
+			e.dictTable[nextHash1] = prevEntry{
+				prev:   e.dictTable[nextHash1].offset,
+				offset: i + 1,
+			}
+			e.dictTable[nextHash2] = prevEntry{
+				prev:   e.dictTable[nextHash2].offset,
+				offset: i + 2,
+			}
+			e.dictTable[nextHash3] = prevEntry{
+				prev:   e.dictTable[nextHash3].offset,
+				offset: i + 3,
+			}
+		}
+		e.lastDictID = d.id
+	}
+
+	// Init or copy dict long table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]prevEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			h := hash8(cv, bestLongTableBits)
+			e.dictLongTable[h] = prevEntry{
+				offset: e.maxMatchOff,
+				prev:   e.dictLongTable[h].offset,
+			}
+
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			off := 8 // First to read
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[off]) << 56)
+				h := hash8(cv, bestLongTableBits)
+				e.dictLongTable[h] = prevEntry{
+					offset: i,
+					prev:   e.dictLongTable[h].offset,
+				}
+				off++
+			}
+		}
+		e.lastDictID = d.id
+	}
+	// Reset table to initial state
+	copy(e.longTable[:], e.dictLongTable)
+
+	e.cur = e.maxMatchOff
+	// Reset table to initial state
+	copy(e.table[:], e.dictTable)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
new file mode 100644
index 0000000..6049542
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -0,0 +1,1235 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import "fmt"
+
+const (
+	betterLongTableBits = 19                       // Bits used in the long match table
+	betterLongTableSize = 1 << betterLongTableBits // Size of the table
+
+	// Note: Increasing the short table bits or making the hash shorter
+	// can actually lead to compression degradation since it will 'steal' more from the
+	// long match table and match offsets are quite big.
+	// This greatly depends on the type of input.
+	betterShortTableBits = 13                        // Bits used in the short match table
+	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+
+	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
+	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
+
+	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
+	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
+)
+
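+// prevEntry holds the two most recent positions that hashed to the same slot,
+// forming a hash chain of length 2.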
+type prevEntry struct {
+	offset int32
+	prev   int32
+}
+
+// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type betterFastEncoder struct {
+	fastBase
+	table     [betterShortTableSize]tableEntry
+	longTable [betterLongTableSize]prevEntry
+}
+
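+// betterFastEncoderDict adds dictionary support to betterFastEncoder.
+// The shard dirty flags record which parts of the tables have been modified,
+// so only those shards need to be restored from the dictionary tables on reset.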
+type betterFastEncoderDict struct {
+	betterFastEncoder
+	dictTable            []tableEntry
+	dictLongTable        []prevEntry
+	shortTableShardDirty [betterShortTableShardCnt]bool
+	longTableShardDirty  [betterLongTableShardCnt]bool
+	allDirty             bool
+}
+
+// Encode encodes src into blk, searching harder for matches than the fastest encoder.
+func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if true && s+matched < sLimit {
+			nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
+
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// The most notable difference is that src will not be copied for history and
+// we do not need to check for max match length.
+func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
+	e.Encode(blk, src)
+}
+
+// Encode encodes src into blk like betterFastEncoder.Encode, but also marks which table shards it modifies.
+func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			e.allDirty = true
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.allDirty = true
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				e.markLongShardDirty(nextHashL)
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if s+matched < sLimit {
+			nextHashL := hash8(load6432(src, s+matched), betterLongTableBits)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
+
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.markLongShardDirty(h0)
+			h1 := hash5(cv1, betterShortTableBits)
+			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markShortShardDirty(h1)
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("betterFastEncoder: Reset with dict")
+	}
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
+	}
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]tableEntry, len(e.table))
+		}
+		end := int32(len(d.content)) - 8 + e.maxMatchOff
+		for i := e.maxMatchOff; i < end; i += 4 {
+			const hashLog = betterShortTableBits
+
+			cv := load6432(d.content, i-e.maxMatchOff)
+			nextHash := hash5(cv, hashLog)      // 0 -> 4
+			nextHash1 := hash5(cv>>8, hashLog)  // 1 -> 5
+			nextHash2 := hash5(cv>>16, hashLog) // 2 -> 6
+			nextHash3 := hash5(cv>>24, hashLog) // 3 -> 7
+			e.dictTable[nextHash] = tableEntry{
+				val:    uint32(cv),
+				offset: i,
+			}
+			e.dictTable[nextHash1] = tableEntry{
+				val:    uint32(cv >> 8),
+				offset: i + 1,
+			}
+			e.dictTable[nextHash2] = tableEntry{
+				val:    uint32(cv >> 16),
+				offset: i + 2,
+			}
+			e.dictTable[nextHash3] = tableEntry{
+				val:    uint32(cv >> 24),
+				offset: i + 3,
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+
+	// Init or copy dict table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]prevEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			h := hash8(cv, betterLongTableBits)
+			e.dictLongTable[h] = prevEntry{
+				offset: e.maxMatchOff,
+				prev:   e.dictLongTable[h].offset,
+			}
+
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			off := 8 // First to read
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[off]) << 56)
+				h := hash8(cv, betterLongTableBits)
+				e.dictLongTable[h] = prevEntry{
+					offset: i,
+					prev:   e.dictLongTable[h].offset,
+				}
+				off++
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+
+	// Reset table to initial state
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterShortTableShardCnt
+		const shardSize = betterShortTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.table[:], e.dictTable)
+			for i := range e.shortTableShardDirty {
+				e.shortTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.shortTableShardDirty {
+				if !e.shortTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+				e.shortTableShardDirty[i] = false
+			}
+		}
+	}
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.longTableShardDirty {
+				if e.longTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterLongTableShardCnt
+		const shardSize = betterLongTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.longTable[:], e.dictLongTable)
+			for i := range e.longTableShardDirty {
+				e.longTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.longTableShardDirty {
+				if !e.longTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
+				e.longTableShardDirty[i] = false
+			}
+		}
+	}
+	e.cur = e.maxMatchOff
+	e.allDirty = false
+}
+
+func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
+}
+
+func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
+	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
+}
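
The dictionary-aware encoders above keep per-shard dirty flags (markLongShardDirty/markShortShardDirty) so that Reset only has to copy back the shards that were actually written, rather than the whole table, from the dictionary snapshot. A minimal, self-contained sketch of that pattern follows; the type miniEncoder, its table sizes, and the zero-valued snapshot are illustrative assumptions rather than the vendored implementation, though the 4/6 bulk-copy threshold mirrors the code above.

```go
package main

import "fmt"

const (
	tableSize = 1 << 8
	shardCnt  = 1 << 4
	shardSize = tableSize / shardCnt
)

// miniEncoder is a hypothetical stand-in for the *Dict encoders above.
type miniEncoder struct {
	table      [tableSize]uint32
	dictTable  [tableSize]uint32 // snapshot restored on reset
	shardDirty [shardCnt]bool
	allDirty   bool
}

// put writes an entry and marks the containing shard dirty.
func (e *miniEncoder) put(idx, val uint32) {
	e.table[idx] = val
	e.shardDirty[idx/shardSize] = true
}

// reset restores the table from the snapshot, copying only dirty shards
// unless most of the table was touched.
func (e *miniEncoder) reset() {
	dirty := 0
	for _, d := range e.shardDirty {
		if d {
			dirty++
		}
	}
	if e.allDirty || dirty > shardCnt*4/6 {
		copy(e.table[:], e.dictTable[:])
	} else {
		for i, d := range e.shardDirty {
			if d {
				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
			}
		}
	}
	for i := range e.shardDirty {
		e.shardDirty[i] = false
	}
	e.allDirty = false
}

func main() {
	var e miniEncoder
	e.put(3, 42)
	e.put(200, 7)
	e.reset()
	fmt.Println(e.table[3], e.table[200]) // 0 0: only the two dirty shards were copied back
}
```
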
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index ee3b09b..8629d43 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -4,11 +4,16 @@
 
 package zstd
 
+import "fmt"
+
 const (
 	dFastLongTableBits = 17                      // Bits used in the long match table
 	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
 	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
 
+	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
+	dLongTableShardSize = dFastLongTableSize / tableShardCnt        // Size of an individual shard
+
 	dFastShortTableBits = tableBits                // Bits used in the short match table
 	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
 	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
@@ -19,6 +24,13 @@
 	longTable [dFastLongTableSize]tableEntry
 }
 
+type doubleFastEncoderDict struct {
+	fastEncoderDict
+	longTable           [dFastLongTableSize]tableEntry
+	dictLongTable       []tableEntry
+	longTableShardDirty [dLongTableShardCnt]bool
+}
+
 // Encode mimics functionality in zstd_dfast.c
 func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 	const (
@@ -29,7 +41,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur > (1<<30)+e.maxMatchOff {
+	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
@@ -61,6 +73,7 @@
 			e.longTable[i].offset = v
 		}
 		e.cur = e.maxMatchOff
+		break
 	}
 
 	s := e.addBlock(src)
@@ -77,10 +90,7 @@
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
+	const stepSize = 1
 
 	const kSearchStrength = 8
 
@@ -110,7 +120,7 @@
 		canRepeat := len(blk.sequences) > 2
 
 		for {
-			if debug && canRepeat && offset1 == 0 {
+			if debugAsserts && canRepeat && offset1 == 0 {
 				panic("offset0 was 0")
 			}
 
@@ -169,55 +179,6 @@
 					cv = load6432(src, s)
 					continue
 				}
-				const repOff2 = 1
-				// We deviate from the reference encoder and also check offset 2.
-				// Slower and not consistently better, so disabled.
-				// repIndex = s - offset2 + repOff2
-				if false && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff2*8)) {
-					// Consider history as well.
-					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff2, repIndex+4, src)
-
-					seq.matchLen = uint32(lenght - zstdMinMatch)
-
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff2
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-
-					// rep 2
-					seq.offset = 2
-					if debugSequences {
-						println("repeat sequence 2", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-					s += lenght + repOff2
-					nextEmit = s
-					if s >= sLimit {
-						if debug {
-							println("repeat ended", s, lenght)
-
-						}
-						break encodeLoop
-					}
-					cv = load6432(src, s)
-					// Swap offsets
-					offset1, offset2 = offset2, offset1
-					continue
-				}
 			}
 			// Find the offsets of our two matches.
 			coffsetL := s - (candidateL.offset - e.cur)
@@ -229,10 +190,10 @@
 				// Reference encoder checks all 8 bytes, we only check 4,
 				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 				t = candidateL.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugMatches {
@@ -266,13 +227,13 @@
 				}
 
 				t = candidateS.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
-				if debug && t < 0 {
+				if debugAsserts && t < 0 {
 					panic("t<0")
 				}
 				if debugMatches {
@@ -294,11 +255,11 @@
 		offset2 = offset1
 		offset1 = s - t
 
-		if debug && s <= t {
-			panic("s <= t")
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 
-		if debug && canRepeat && int(offset1) > len(src) {
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
 			panic("invalid offset")
 		}
 
@@ -369,7 +330,7 @@
 			}
 
 			// Store this, since we have it.
-			nextHashS := hash5(cv1>>8, dFastShortTableBits)
+			nextHashS := hash5(cv, dFastShortTableBits)
 			nextHashL := hash8(cv, dFastLongTableBits)
 
 			// We have at least 4 byte match.
@@ -424,7 +385,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	if e.cur > (1<<30)+e.maxMatchOff {
+	if e.cur >= bufferReset {
 		for i := range e.table[:] {
 			e.table[i] = tableEntry{}
 		}
@@ -447,10 +408,7 @@
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
+	const stepSize = 1
 
 	const kSearchStrength = 8
 
@@ -545,10 +503,10 @@
 				// Reference encoder checks all 8 bytes, we only check 4,
 				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 				t = candidateL.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d). cur: %d", s, t, e.cur))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugMatches {
@@ -582,13 +540,13 @@
 				}
 
 				t = candidateS.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
-				if debug && t < 0 {
+				if debugAsserts && t < 0 {
 					panic("t<0")
 				}
 				if debugMatches {
@@ -610,8 +568,8 @@
 		offset2 = offset1
 		offset1 = s - t
 
-		if debug && s <= t {
-			panic("s <= t")
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 
 		// Extend the 4-byte match as long as possible.
@@ -723,4 +681,441 @@
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 
+	// We do not store history, so we must offset e.cur to avoid false matches for next user.
+	if e.cur < bufferReset {
+		e.cur += int32(len(src))
+	}
+}
+
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += length + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				e.markLongShardDirty(nextHashL)
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		longHash1 := hash8(cv0, dFastLongTableBits)
+		longHash2 := hash8(cv1, dFastLongTableBits)
+		e.longTable[longHash1] = te0
+		e.longTable[longHash2] = te1
+		e.markLongShardDirty(longHash1)
+		e.markLongShardDirty(longHash2)
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		hashVal1 := hash5(cv0, dFastShortTableBits)
+		hashVal2 := hash5(cv1, dFastShortTableBits)
+		e.table[hashVal1] = te0
+		e.markShardDirty(hashVal1)
+		e.table[hashVal2] = te1
+		e.markShardDirty(hashVal2)
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+	// If we encoded more than 64K mark all dirty.
+	if len(src) > 64<<10 {
+		e.markAllShardsDirty()
+	}
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.fastEncoder.Reset(d, singleBlock)
+	if d != nil {
+		panic("doubleFastEncoder: Reset with dict not supported")
+	}
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	allDirty := e.allDirty
+	e.fastEncoderDict.Reset(d, singleBlock)
+	if d == nil {
+		return
+	}
+
+	// Init or copy dict table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]tableEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{
+				val:    uint32(cv),
+				offset: e.maxMatchOff,
+			}
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56)
+				e.dictLongTable[hash8(cv, dFastLongTableBits)] = tableEntry{
+					val:    uint32(cv),
+					offset: i,
+				}
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+	// Reset table to initial state
+	e.cur = e.maxMatchOff
+
+	dirtyShardCnt := 0
+	if !allDirty {
+		for i := range e.longTableShardDirty {
+			if e.longTableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
+		copy(e.longTable[:], e.dictLongTable)
+		for i := range e.longTableShardDirty {
+			e.longTableShardDirty[i] = false
+		}
+		return
+	}
+	for i := range e.longTableShardDirty {
+		if !e.longTableShardDirty[i] {
+			continue
+		}
+
+		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		e.longTableShardDirty[i] = false
+	}
+}
+
+func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
 }
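
doubleFastEncoderDict.Encode above probes two hash tables at every position: an 8-byte hash into the long table is checked first, with a hash of the low bytes as the short fallback. The sketch below shows that lookup order in isolation; the table sizes, hash constants, and the position+1 "empty" convention are illustrative assumptions, not the vendored code.

```go
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	longBits  = 10
	shortBits = 8
	prime8    = 0xcf1bbcdcb7a56463
	prime5    = 889523592379
)

// hashLong hashes all 8 bytes, hashShort only the low 5 bytes (via the shift).
// The multipliers are the usual Fibonacci-hashing constants; the tiny table
// sizes are purely for the demo.
func hashLong(u uint64) uint32  { return uint32((u * prime8) >> (64 - longBits)) }
func hashShort(u uint64) uint32 { return uint32(((u << 24) * prime5) >> (64 - shortBits)) }

func main() {
	src := []byte("abcdefgh-abcdefgh-xyz")
	longTable := make([]int, 1<<longBits)   // stores position+1; 0 means empty
	shortTable := make([]int, 1<<shortBits) // stores position+1; 0 means empty

	// Index position 0 in both tables.
	cv := binary.LittleEndian.Uint64(src)
	longTable[hashLong(cv)] = 1
	shortTable[hashShort(cv)] = 1

	// At position 9 the same 8 bytes repeat. Try the long table first and
	// fall back to the short one, mirroring the search loop above.
	s := 9
	cv = binary.LittleEndian.Uint64(src[s:])
	if cand := longTable[hashLong(cv)]; cand != 0 && binary.LittleEndian.Uint64(src[cand-1:]) == cv {
		fmt.Println("long match, offset", s-(cand-1)) // long match, offset 9
	} else if cand := shortTable[hashShort(cv)]; cand != 0 && binary.LittleEndian.Uint32(src[cand-1:]) == uint32(cv) {
		fmt.Println("short match, offset", s-(cand-1))
	}
}
```

Trying the long candidate first is what lets the encoder prefer matches of 8+ bytes before settling for a 4-byte one.
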
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index 0bdddac..ba4a17e 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -5,15 +5,17 @@
 package zstd
 
 import (
+	"fmt"
+	"math"
 	"math/bits"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 const (
-	tableBits      = 15             // Bits used in the table
-	tableSize      = 1 << tableBits // Size of the table
-	tableMask      = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	tableBits      = 15                               // Bits used in the table
+	tableSize      = 1 << tableBits                   // Size of the table
+	tableShardCnt  = 1 << (tableBits - dictShardBits) // Number of shards in the table
+	tableShardSize = tableSize / tableShardCnt        // Size of an individual shard
+	tableMask      = tableSize - 1                    // Mask for table indices. Redundant, but can eliminate bounds checks.
 	maxMatchLength = 131074
 )
 
@@ -23,47 +25,15 @@
 }
 
 type fastEncoder struct {
-	o encParams
-	// cur is the offset at the start of hist
-	cur int32
-	// maximum offset. Should be at least 2x block size.
-	maxMatchOff int32
-	hist        []byte
-	crc         *xxhash.Digest
-	table       [tableSize]tableEntry
-	tmp         [8]byte
-	blk         *blockEnc
+	fastBase
+	table [tableSize]tableEntry
 }
 
-// CRC returns the underlying CRC writer.
-func (e *fastEncoder) CRC() *xxhash.Digest {
-	return e.crc
-}
-
-// AppendCRC will append the CRC to the destination slice and return it.
-func (e *fastEncoder) AppendCRC(dst []byte) []byte {
-	crc := e.crc.Sum(e.tmp[:0])
-	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
-	return dst
-}
-
-// WindowSize returns the window size of the encoder,
-// or a window size small enough to contain the input size, if > 0.
-func (e *fastEncoder) WindowSize(size int) int32 {
-	if size > 0 && size < int(e.maxMatchOff) {
-		b := int32(1) << uint(bits.Len(uint(size)))
-		// Keep minimum window.
-		if b < 1024 {
-			b = 1024
-		}
-		return b
-	}
-	return e.maxMatchOff
-}
-
-// Block returns the current block.
-func (e *fastEncoder) Block() *blockEnc {
-	return e.blk
+type fastEncoderDict struct {
+	fastEncoder
+	dictTable       []tableEntry
+	tableShardDirty [tableShardCnt]bool
+	allDirty        bool
 }
 
 // Encode mimics functionality in zstd_fast.c
@@ -74,7 +44,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur > (1<<30)+e.maxMatchOff {
+	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
@@ -94,6 +64,7 @@
 			e.table[i].offset = v
 		}
 		e.cur = e.maxMatchOff
+		break
 	}
 
 	s := e.addBlock(src)
@@ -110,16 +81,12 @@
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 2.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
-	stepSize++
+	const stepSize = 2
 
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 7
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -151,7 +118,7 @@
 		canRepeat := len(blk.sequences) > 2
 
 		for {
-			if debug && canRepeat && offset1 == 0 {
+			if debugAsserts && canRepeat && offset1 == 0 {
 				panic("offset0 was 0")
 			}
 
@@ -167,9 +134,22 @@
 			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				lenght := 4 + e.matchlen(s+6, repIndex+4, src)
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
 
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -195,11 +175,11 @@
 					println("repeat sequence", seq, "next s:", s)
 				}
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 
 					}
 					break encodeLoop
@@ -212,10 +192,10 @@
 			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
 				// found a regular match
 				t = candidate.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				break
@@ -225,13 +205,13 @@
 				// found a regular match
 				t = candidate2.offset - e.cur
 				s++
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
-				if debug && t < 0 {
+				if debugAsserts && t < 0 {
 					panic("t<0")
 				}
 				break
@@ -246,16 +226,29 @@
 		offset2 = offset1
 		offset1 = s - t
 
-		if debug && s <= t {
-			panic("s <= t")
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 
-		if debug && canRepeat && int(offset1) > len(src) {
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
 			panic("invalid offset")
 		}
 
 		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -292,7 +285,20 @@
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
 
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -342,8 +348,9 @@
 			panic("src too big")
 		}
 	}
+
 	// Protect against e.cur wraparound.
-	if e.cur > (1<<30)+e.maxMatchOff {
+	if e.cur >= bufferReset {
 		for i := range e.table[:] {
 			e.table[i] = tableEntry{}
 		}
@@ -410,10 +417,23 @@
 			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				// lenght := 4 + e.matchlen(s+6, repIndex+4, src)
-				lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				// length := 4 + e.matchlen(s+6, repIndex+4, src)
+				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				var length int32
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
 
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -439,11 +459,11 @@
 					println("repeat sequence", seq, "next s:", s)
 				}
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 
 					}
 					break encodeLoop
@@ -456,12 +476,15 @@
 			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
 				// found a regular match
 				t = candidate.offset - e.cur
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
+				if debugAsserts && t < 0 {
+					panic(fmt.Sprintf("t (%d) < 0, candidate.offset: %d, e.cur: %d, coffset0: %d, e.maxMatchOff: %d", t, candidate.offset, e.cur, coffset0, e.maxMatchOff))
+				}
 				break
 			}
 
@@ -469,13 +492,13 @@
 				// found a regular match
 				t = candidate2.offset - e.cur
 				s++
-				if debug && s <= t {
-					panic("s <= t")
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
-				if debug && s-t > e.maxMatchOff {
+				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
-				if debug && t < 0 {
+				if debugAsserts && t < 0 {
 					panic("t<0")
 				}
 				break
@@ -490,13 +513,29 @@
 		offset2 = offset1
 		offset1 = s - t
 
-		if debug && s <= t {
-			panic("s <= t")
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 
+		if debugAsserts && t < 0 {
+			panic(fmt.Sprintf("t (%d) < 0 ", t))
+		}
 		// Extend the 4-byte match as long as possible.
 		//l := e.matchlenNoHist(s+4, t+4, src) + 4
-		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -534,7 +573,20 @@
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
-			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
 
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -567,90 +619,400 @@
 	if debug {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
-}
-
-func (e *fastEncoder) addBlock(src []byte) int32 {
-	// check if we have space already
-	if len(e.hist)+len(src) > cap(e.hist) {
-		if cap(e.hist) == 0 {
-			l := e.maxMatchOff * 2
-			// Make it at least 1MB.
-			if l < 1<<20 {
-				l = 1 << 20
-			}
-			e.hist = make([]byte, 0, l)
-		} else {
-			if cap(e.hist) < int(e.maxMatchOff*2) {
-				panic("unexpected buffer size")
-			}
-			// Move down
-			offset := int32(len(e.hist)) - e.maxMatchOff
-			copy(e.hist[0:e.maxMatchOff], e.hist[offset:])
-			e.cur += offset
-			e.hist = e.hist[:e.maxMatchOff]
-		}
+	// We do not store history, so we must offset e.cur to avoid false matches for next user.
+	if e.cur < bufferReset {
+		e.cur += int32(len(src))
 	}
-	s := int32(len(e.hist))
-	e.hist = append(e.hist, src...)
-	return s
 }
 
-// useBlock will replace the block with the provided one,
-// but transfer recent offsets from the previous.
-func (e *fastEncoder) UseBlock(enc *blockEnc) {
-	enc.reset(e.blk)
-	e.blk = enc
-}
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if e.allDirty || len(src) > 32<<10 {
+		e.fastEncoder.Encode(blk, src)
+		e.allDirty = true
+		return
+	}
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
 
-func (e *fastEncoder) matchlenNoHist(s, t int32, src []byte) int32 {
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
-}
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
 
-func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 7
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
 	if debug {
-		if s < 0 {
-			panic("s<0")
-		}
-		if t < 0 {
-			panic("t<0")
-		}
-		if s-t > e.maxMatchOff {
-			panic(s - t)
-		}
-	}
-	s1 := int(s) + maxMatchLength - 4
-	if s1 > len(src) {
-		s1 = len(src)
+		println("recent offsets:", blk.recentOffsets)
 	}
 
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:s1], src[t:]))
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When exiting the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// By not using them for the first 3 matches, offsets carried in from a previous block are never referenced.
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+			e.markShardDirty(nextHash2)
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
 }
 
-// Reset the encoding table.
-func (e *fastEncoder) Reset() {
-	if e.blk == nil {
-		e.blk = &blockEnc{}
-		e.blk.init()
-	} else {
-		e.blk.reset(nil)
+// Reset will reset and set a dictionary if not nil
+func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("fastEncoder: Reset with dict")
 	}
-	e.blk.initNewEncode()
-	if e.crc == nil {
-		e.crc = xxhash.New()
-	} else {
-		e.crc.Reset()
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
 	}
-	if cap(e.hist) < int(e.maxMatchOff*2) {
-		l := e.maxMatchOff * 2
-		// Make it at least 1MB.
-		if l < 1<<20 {
-			l = 1 << 20
+
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]tableEntry, len(e.table))
 		}
-		e.hist = make([]byte, 0, l)
+		if true {
+			end := e.maxMatchOff + int32(len(d.content)) - 8
+			for i := e.maxMatchOff; i < end; i += 3 {
+				const hashLog = tableBits
+
+				cv := load6432(d.content, i-e.maxMatchOff)
+				nextHash := hash6(cv, hashLog)      // 0 -> 5
+				nextHash1 := hash6(cv>>8, hashLog)  // 1 -> 6
+				nextHash2 := hash6(cv>>16, hashLog) // 2 -> 7
+				e.dictTable[nextHash] = tableEntry{
+					val:    uint32(cv),
+					offset: i,
+				}
+				e.dictTable[nextHash1] = tableEntry{
+					val:    uint32(cv >> 8),
+					offset: i + 1,
+				}
+				e.dictTable[nextHash2] = tableEntry{
+					val:    uint32(cv >> 16),
+					offset: i + 2,
+				}
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
 	}
-	// We offset current position so everything will be out of reach
-	e.cur += e.maxMatchOff + int32(len(e.hist))
-	e.hist = e.hist[:0]
+
+	e.cur = e.maxMatchOff
+	dirtyShardCnt := 0
+	if !e.allDirty {
+		for i := range e.tableShardDirty {
+			if e.tableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	const shardCnt = tableShardCnt
+	const shardSize = tableShardSize
+	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+		copy(e.table[:], e.dictTable)
+		for i := range e.tableShardDirty {
+			e.tableShardDirty[i] = false
+		}
+		e.allDirty = false
+		return
+	}
+	for i := range e.tableShardDirty {
+		if !e.tableShardDirty[i] {
+			continue
+		}
+
+		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		e.tableShardDirty[i] = false
+	}
+	e.allDirty = false
+}
+
+func (e *fastEncoderDict) markAllShardsDirty() {
+	e.allDirty = true
+}
+
+func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
+	e.tableShardDirty[entryNum/tableShardSize] = true
 }
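
Several hunks above replace calls to e.matchlen with a manually inlined loop that XORs 8 bytes at a time and locates the first differing byte with bits.TrailingZeros64. The standalone helper below is a simplified sketch of that loop for reference only; it ignores the encoder's maxMatchLength cap and history handling, and matchLen here is a local example, not the package's function.

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen returns the number of leading bytes a and b have in common.
func matchLen(a, b []byte) int {
	n := 0
	// Compare 8 bytes at a time; the first non-zero XOR pinpoints the
	// differing byte via its trailing zero bits.
	for len(a) >= 8 && len(b) >= 8 {
		if x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); x != 0 {
			return n + bits.TrailingZeros64(x)>>3
		}
		n += 8
		a, b = a[8:], b[8:]
	}
	// Byte-wise tail for the remaining (<8) bytes.
	for i := 0; i < len(a) && i < len(b); i++ {
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(matchLen([]byte("0123456789abXdef"), []byte("0123456789abYdef"))) // 12
}
```
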
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_params.go b/vendor/github.com/klauspost/compress/zstd/enc_params.go
deleted file mode 100644
index b6779ec..0000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_params.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-type encParams struct {
-	// largest match distance : larger == more compression, more memory needed during decompression
-	windowLog uint8
-
-	// fully searched segment : larger == more compression, slower, more memory (useless for fast)
-	chainLog uint8
-
-	//  dispatch table : larger == faster, more memory
-	hashLog uint8
-
-	// < nb of searches : larger == more compression, slower
-	searchLog uint8
-
-	// < match length searched : larger == faster decompression, sometimes less compression
-	minMatch uint8
-
-	// acceptable match size for optimal parser (only) : larger == more compression, slower
-	targetLength uint32
-
-	// see ZSTD_strategy definition above
-	strategy strategy
-}
-
-// strategy defines the algorithm to use when generating sequences.
-type strategy uint8
-
-const (
-	// Compression strategies, listed from fastest to strongest
-	strategyFast strategy = iota + 1
-	strategyDfast
-	strategyGreedy
-	strategyLazy
-	strategyLazy2
-	strategyBtlazy2
-	strategyBtopt
-	strategyBtultra
-	strategyBtultra2
-	// note : new strategies _might_ be added in the future.
-	//   Only the order (from fast to strong) is guaranteed
-
-)
-
-var defEncParams = [4][]encParams{
-	{ // "default" - for any srcSize > 256 KB
-		// W,  C,  H,  S,  L, TL, strat
-		{19, 12, 13, 1, 6, 1, strategyFast},       // base for negative levels
-		{19, 13, 14, 1, 7, 0, strategyFast},       // level  1
-		{20, 15, 16, 1, 6, 0, strategyFast},       // level  2
-		{21, 16, 17, 1, 5, 1, strategyDfast},      // level  3
-		{21, 18, 18, 1, 5, 1, strategyDfast},      // level  4
-		{21, 18, 19, 2, 5, 2, strategyGreedy},     // level  5
-		{21, 19, 19, 3, 5, 4, strategyGreedy},     // level  6
-		{21, 19, 19, 3, 5, 8, strategyLazy},       // level  7
-		{21, 19, 19, 3, 5, 16, strategyLazy2},     // level  8
-		{21, 19, 20, 4, 5, 16, strategyLazy2},     // level  9
-		{22, 20, 21, 4, 5, 16, strategyLazy2},     // level 10
-		{22, 21, 22, 4, 5, 16, strategyLazy2},     // level 11
-		{22, 21, 22, 5, 5, 16, strategyLazy2},     // level 12
-		{22, 21, 22, 5, 5, 32, strategyBtlazy2},   // level 13
-		{22, 22, 23, 5, 5, 32, strategyBtlazy2},   // level 14
-		{22, 23, 23, 6, 5, 32, strategyBtlazy2},   // level 15
-		{22, 22, 22, 5, 5, 48, strategyBtopt},     // level 16
-		{23, 23, 22, 5, 4, 64, strategyBtopt},     // level 17
-		{23, 23, 22, 6, 3, 64, strategyBtultra},   // level 18
-		{23, 24, 22, 7, 3, 256, strategyBtultra2}, // level 19
-		{25, 25, 23, 7, 3, 256, strategyBtultra2}, // level 20
-		{26, 26, 24, 7, 3, 512, strategyBtultra2}, // level 21
-		{27, 27, 25, 9, 3, 999, strategyBtultra2}, // level 22
-	},
-	{ // for srcSize <= 256 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{18, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
-		{18, 13, 14, 1, 6, 0, strategyFast},        // level  1
-		{18, 14, 14, 1, 5, 1, strategyDfast},       // level  2
-		{18, 16, 16, 1, 4, 1, strategyDfast},       // level  3
-		{18, 16, 17, 2, 5, 2, strategyGreedy},      // level  4.
-		{18, 18, 18, 3, 5, 2, strategyGreedy},      // level  5.
-		{18, 18, 19, 3, 5, 4, strategyLazy},        // level  6.
-		{18, 18, 19, 4, 4, 4, strategyLazy},        // level  7
-		{18, 18, 19, 4, 4, 8, strategyLazy2},       // level  8
-		{18, 18, 19, 5, 4, 8, strategyLazy2},       // level  9
-		{18, 18, 19, 6, 4, 8, strategyLazy2},       // level 10
-		{18, 18, 19, 5, 4, 12, strategyBtlazy2},    // level 11.
-		{18, 19, 19, 7, 4, 12, strategyBtlazy2},    // level 12.
-		{18, 18, 19, 4, 4, 16, strategyBtopt},      // level 13
-		{18, 18, 19, 4, 3, 32, strategyBtopt},      // level 14.
-		{18, 18, 19, 6, 3, 128, strategyBtopt},     // level 15.
-		{18, 19, 19, 6, 3, 128, strategyBtultra},   // level 16.
-		{18, 19, 19, 8, 3, 256, strategyBtultra},   // level 17.
-		{18, 19, 19, 6, 3, 128, strategyBtultra2},  // level 18.
-		{18, 19, 19, 8, 3, 256, strategyBtultra2},  // level 19.
-		{18, 19, 19, 10, 3, 512, strategyBtultra2}, // level 20.
-		{18, 19, 19, 12, 3, 512, strategyBtultra2}, // level 21.
-		{18, 19, 19, 13, 3, 999, strategyBtultra2}, // level 22.
-	},
-	{ // for srcSize <= 128 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{17, 12, 12, 1, 5, 1, strategyFast},        // base for negative levels
-		{17, 12, 13, 1, 6, 0, strategyFast},        // level  1
-		{17, 13, 15, 1, 5, 0, strategyFast},        // level  2
-		{17, 15, 16, 2, 5, 1, strategyDfast},       // level  3
-		{17, 17, 17, 2, 4, 1, strategyDfast},       // level  4
-		{17, 16, 17, 3, 4, 2, strategyGreedy},      // level  5
-		{17, 17, 17, 3, 4, 4, strategyLazy},        // level  6
-		{17, 17, 17, 3, 4, 8, strategyLazy2},       // level  7
-		{17, 17, 17, 4, 4, 8, strategyLazy2},       // level  8
-		{17, 17, 17, 5, 4, 8, strategyLazy2},       // level  9
-		{17, 17, 17, 6, 4, 8, strategyLazy2},       // level 10
-		{17, 17, 17, 5, 4, 8, strategyBtlazy2},     // level 11
-		{17, 18, 17, 7, 4, 12, strategyBtlazy2},    // level 12
-		{17, 18, 17, 3, 4, 12, strategyBtopt},      // level 13.
-		{17, 18, 17, 4, 3, 32, strategyBtopt},      // level 14.
-		{17, 18, 17, 6, 3, 256, strategyBtopt},     // level 15.
-		{17, 18, 17, 6, 3, 128, strategyBtultra},   // level 16.
-		{17, 18, 17, 8, 3, 256, strategyBtultra},   // level 17.
-		{17, 18, 17, 10, 3, 512, strategyBtultra},  // level 18.
-		{17, 18, 17, 5, 3, 256, strategyBtultra2},  // level 19.
-		{17, 18, 17, 7, 3, 512, strategyBtultra2},  // level 20.
-		{17, 18, 17, 9, 3, 512, strategyBtultra2},  // level 21.
-		{17, 18, 17, 11, 3, 999, strategyBtultra2}, // level 22.
-	},
-	{ // for srcSize <= 16 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{14, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
-		{14, 14, 15, 1, 5, 0, strategyFast},        // level  1
-		{14, 14, 15, 1, 4, 0, strategyFast},        // level  2
-		{14, 14, 15, 2, 4, 1, strategyDfast},       // level  3
-		{14, 14, 14, 4, 4, 2, strategyGreedy},      // level  4
-		{14, 14, 14, 3, 4, 4, strategyLazy},        // level  5.
-		{14, 14, 14, 4, 4, 8, strategyLazy2},       // level  6
-		{14, 14, 14, 6, 4, 8, strategyLazy2},       // level  7
-		{14, 14, 14, 8, 4, 8, strategyLazy2},       // level  8.
-		{14, 15, 14, 5, 4, 8, strategyBtlazy2},     // level  9.
-		{14, 15, 14, 9, 4, 8, strategyBtlazy2},     // level 10.
-		{14, 15, 14, 3, 4, 12, strategyBtopt},      // level 11.
-		{14, 15, 14, 4, 3, 24, strategyBtopt},      // level 12.
-		{14, 15, 14, 5, 3, 32, strategyBtultra},    // level 13.
-		{14, 15, 15, 6, 3, 64, strategyBtultra},    // level 14.
-		{14, 15, 15, 7, 3, 256, strategyBtultra},   // level 15.
-		{14, 15, 15, 5, 3, 48, strategyBtultra2},   // level 16.
-		{14, 15, 15, 6, 3, 128, strategyBtultra2},  // level 17.
-		{14, 15, 15, 7, 3, 256, strategyBtultra2},  // level 18.
-		{14, 15, 15, 8, 3, 256, strategyBtultra2},  // level 19.
-		{14, 15, 15, 8, 3, 512, strategyBtultra2},  // level 20.
-		{14, 15, 15, 9, 3, 512, strategyBtultra2},  // level 21.
-		{14, 15, 15, 10, 3, 999, strategyBtultra2}, // level 22.
-	},
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index 366dd66..4871dd0 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -35,21 +35,22 @@
 	AppendCRC([]byte) []byte
 	WindowSize(size int) int32
 	UseBlock(*blockEnc)
-	Reset()
+	Reset(d *dict, singleBlock bool)
 }
 
 type encoderState struct {
-	w             io.Writer
-	filling       []byte
-	current       []byte
-	previous      []byte
-	encoder       encoder
-	writing       *blockEnc
-	err           error
-	writeErr      error
-	nWritten      int64
-	headerWritten bool
-	eofWritten    bool
+	w                io.Writer
+	filling          []byte
+	current          []byte
+	previous         []byte
+	encoder          encoder
+	writing          *blockEnc
+	err              error
+	writeErr         error
+	nWritten         int64
+	headerWritten    bool
+	eofWritten       bool
+	fullFrameWritten bool
 
 	// This waitgroup indicates an encode is running.
 	wg sync.WaitGroup
@@ -71,27 +72,24 @@
 	}
 	if w != nil {
 		e.Reset(w)
-	} else {
-		e.init.Do(func() {
-			e.initialize()
-		})
 	}
 	return &e, nil
 }
 
 func (e *Encoder) initialize() {
+	if e.o.concurrent == 0 {
+		e.o.setDefault()
+	}
 	e.encoders = make(chan encoder, e.o.concurrent)
 	for i := 0; i < e.o.concurrent; i++ {
-		e.encoders <- e.o.encoder()
+		enc := e.o.encoder()
+		e.encoders <- enc
 	}
 }
 
 // Reset will re-initialize the writer and new writes will encode to the supplied writer
 // as a new, independent stream.
 func (e *Encoder) Reset(w io.Writer) {
-	e.init.Do(func() {
-		e.initialize()
-	})
 	s := &e.state
 	s.wg.Wait()
 	s.wWg.Wait()
@@ -108,16 +106,17 @@
 		s.encoder = e.o.encoder()
 	}
 	if s.writing == nil {
-		s.writing = &blockEnc{}
+		s.writing = &blockEnc{lowMem: e.o.lowMem}
 		s.writing.init()
 	}
 	s.writing.initNewEncode()
 	s.filling = s.filling[:0]
 	s.current = s.current[:0]
 	s.previous = s.previous[:0]
-	s.encoder.Reset()
+	s.encoder.Reset(e.o.dict, false)
 	s.headerWritten = false
 	s.eofWritten = false
+	s.fullFrameWritten = false
 	s.w = w
 	s.err = nil
 	s.nWritten = 0
@@ -156,7 +155,7 @@
 		if err != nil {
 			return n, err
 		}
-		if debug && len(s.filling) > 0 {
+		if debugAsserts && len(s.filling) > 0 {
 			panic(len(s.filling))
 		}
 	}
@@ -176,14 +175,38 @@
 		return fmt.Errorf("block > maxStoreBlockSize")
 	}
 	if !s.headerWritten {
+		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) == 0 && !e.o.fullZero {
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
+		if final && len(s.filling) > 0 {
+			s.current = e.EncodeAll(s.filling, s.current[:0])
+			var n2 int
+			n2, s.err = s.w.Write(s.current)
+			if s.err != nil {
+				return s.err
+			}
+			s.nWritten += int64(n2)
+			s.current = s.current[:0]
+			s.filling = s.filling[:0]
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
+
 		var tmp [maxHeaderSize]byte
 		fh := frameHeader{
 			ContentSize:   0,
 			WindowSize:    uint32(s.encoder.WindowSize(0)),
 			SingleSegment: false,
 			Checksum:      e.o.crc,
-			DictID:        0,
+			DictID:        e.o.dict.ID(),
 		}
+
 		dst, err := fh.appendTo(tmp[:0])
 		if err != nil {
 			return err
@@ -263,7 +286,7 @@
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
 			}
 			switch err {
 			case errIncompressible:
@@ -293,12 +316,20 @@
 	if debug {
 		println("Using ReadFrom")
 	}
-	// Maybe handle stuff queued?
+
+	// Flush any current writes.
+	if len(e.state.filling) > 0 {
+		if err := e.nextBlock(false); err != nil {
+			return 0, err
+		}
+	}
 	e.state.filling = e.state.filling[:e.o.blockSize]
 	src := e.state.filling
 	for {
 		n2, err := r.Read(src)
-		_, _ = e.state.encoder.CRC().Write(src[:n2])
+		if e.o.crc {
+			_, _ = e.state.encoder.CRC().Write(src[:n2])
+		}
 		// src is now the unfilled part...
 		src = src[n2:]
 		n += int64(n2)
@@ -308,14 +339,14 @@
 			if debug {
 				println("ReadFrom: got EOF final block:", len(e.state.filling))
 			}
-			return n, e.nextBlock(true)
+			return n, nil
+		case nil:
 		default:
 			if debug {
 				println("ReadFrom: got error:", err)
 			}
 			e.state.err = err
 			return n, err
-		case nil:
 		}
 		if len(src) > 0 {
 			if debug {
@@ -363,6 +394,9 @@
 	if err != nil {
 		return err
 	}
+	if e.state.fullFrameWritten {
+		return s.err
+	}
 	s.wg.Wait()
 	s.wWg.Wait()
 
@@ -422,18 +456,13 @@
 		}
 		return dst
 	}
-	e.init.Do(func() {
-		e.o.setDefault()
-		e.initialize()
-	})
+	e.init.Do(e.initialize)
 	enc := <-e.encoders
 	defer func() {
 		// Release encoder reference to last block.
-		enc.Reset()
+		// If a non-single block is needed the encoder will reset again.
 		e.encoders <- enc
 	}()
-	enc.Reset()
-	blk := enc.Block()
 	// Use single segments when above minimum window and below 1MB.
 	single := len(src) < 1<<20 && len(src) > MinWindowSize
 	if e.o.single != nil {
@@ -444,11 +473,11 @@
 		WindowSize:    uint32(enc.WindowSize(len(src))),
 		SingleSegment: single,
 		Checksum:      e.o.crc,
-		DictID:        0,
+		DictID:        e.o.dict.ID(),
 	}
 
 	// If less than 1MB, allocate a buffer up front.
-	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 {
+	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
 		dst = make([]byte, 0, len(src))
 	}
 	dst, err := fh.appendTo(dst)
@@ -456,14 +485,20 @@
 		panic(err)
 	}
 
-	if len(src) <= e.o.blockSize && len(src) <= maxBlockSize {
+	// If we can do everything in one block, prefer that.
+	if len(src) <= maxCompressedBlockSize {
+		enc.Reset(e.o.dict, true)
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
 			_, _ = enc.CRC().Write(src)
 		}
-		blk.reset(nil)
+		blk := enc.Block()
 		blk.last = true
-		enc.EncodeNoHist(blk, src)
+		if e.o.dict == nil {
+			enc.EncodeNoHist(blk, src)
+		} else {
+			enc.Encode(blk, src)
+		}
 
 		// If we got the exact same number of literals as input,
 		// assume the literals cannot be compressed.
@@ -472,7 +507,7 @@
 		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
 			// Output directly to dst
 			blk.output = dst
-			err = blk.encode(e.o.noEntropy)
+			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
 		}
 
 		switch err {
@@ -488,6 +523,8 @@
 		}
 		blk.output = oldout
 	} else {
+		enc.Reset(e.o.dict, false)
+		blk := enc.Block()
 		for len(src) > 0 {
 			todo := src
 			if len(todo) > e.o.blockSize {
@@ -497,7 +534,6 @@
 			if e.o.crc {
 				_, _ = enc.CRC().Write(todo)
 			}
-			blk.reset(nil)
 			blk.pushOffsets()
 			enc.Encode(blk, todo)
 			if len(src) == 0 {
@@ -507,7 +543,7 @@
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
 			}
 
 			switch err {
@@ -522,6 +558,7 @@
 			default:
 				panic(err)
 			}
+			blk.reset(nil)
 		}
 	}
 	if e.o.crc {
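
Note: with the nextBlock change above, a stream whose full content is buffered before the frame header is written (anything smaller than one block) is compressed through the synchronous EncodeAll path on Close. A usage sketch with hypothetical names:

package sketch

import (
	"bytes"

	"github.com/klauspost/compress/zstd"
)

// compressSmall streams a payload through an Encoder. For payloads smaller
// than one block the whole frame is produced synchronously on Close.
func compressSmall(payload []byte) ([]byte, error) {
	var buf bytes.Buffer
	enc, err := zstd.NewWriter(&buf)
	if err != nil {
		return nil, err
	}
	if _, err := enc.Write(payload); err != nil {
		enc.Close()
		return nil, err
	}
	if err := enc.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}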
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 40eb457..16d4ab6 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -12,36 +12,56 @@
 
 // options retains accumulated state of multiple options.
 type encoderOptions struct {
-	concurrent int
-	crc        bool
-	single     *bool
-	pad        int
-	blockSize  int
-	windowSize int
-	level      EncoderLevel
-	fullZero   bool
-	noEntropy  bool
+	concurrent      int
+	level           EncoderLevel
+	single          *bool
+	pad             int
+	blockSize       int
+	windowSize      int
+	crc             bool
+	fullZero        bool
+	noEntropy       bool
+	allLitEntropy   bool
+	customWindow    bool
+	customALEntropy bool
+	lowMem          bool
+	dict            *dict
 }
 
 func (o *encoderOptions) setDefault() {
 	*o = encoderOptions{
-		// use less ram: true for now, but may change.
-		concurrent: runtime.GOMAXPROCS(0),
-		crc:        true,
-		single:     nil,
-		blockSize:  1 << 16,
-		windowSize: 1 << 22,
-		level:      SpeedDefault,
+		concurrent:    runtime.GOMAXPROCS(0),
+		crc:           true,
+		single:        nil,
+		blockSize:     1 << 16,
+		windowSize:    8 << 20,
+		level:         SpeedDefault,
+		allLitEntropy: true,
+		lowMem:        false,
 	}
 }
 
 // encoder returns an encoder with the selected options.
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
-	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{maxMatchOff: int32(o.windowSize)}}
 	case SpeedFastest:
-		return &fastEncoder{maxMatchOff: int32(o.windowSize)}
+		if o.dict != nil {
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+
+	case SpeedDefault:
+		if o.dict != nil {
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+		}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+	case SpeedBetterCompression:
+		if o.dict != nil {
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+	case SpeedBestCompression:
+		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
@@ -53,7 +73,7 @@
 }
 
 // WithEncoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
+// meaning the maximum number of encoders to run concurrently.
 // The value supplied must be at least 1.
 // By default this will be set to GOMAXPROCS.
 func WithEncoderConcurrency(n int) EOption {
@@ -67,7 +87,7 @@
 }
 
 // WithWindowSize will set the maximum allowed back-reference distance.
-// The value must be a power of two between WindowSizeMin and WindowSizeMax.
+// The value must be a power of two between MinWindowSize and MaxWindowSize.
 // A larger value will enable better compression but allocate more memory and,
 // for above-default values, take considerably longer.
 // The default value is determined by the compression level.
@@ -83,6 +103,7 @@
 		}
 
 		o.windowSize = n
+		o.customWindow = true
 		if o.blockSize > o.windowSize {
 			o.blockSize = o.windowSize
 		}
@@ -130,25 +151,25 @@
 	// This is roughly equivalent to the default Zstandard mode (level 3).
 	SpeedDefault
 
+	// SpeedBetterCompression will yield better compression than the default.
+	// Currently it is roughly equivalent to zstd level 7-8, at ~2x-3x the default CPU usage.
+	// Note that the CPU usage of this level may increase in future releases.
+	SpeedBetterCompression
+
+	// SpeedBestCompression will choose the best available compression option.
+	// This will offer the best compression no matter the CPU cost.
+	SpeedBestCompression
+
 	// speedLast should be kept as the last actual compression option.
 	// This is not for external usage, but is used to keep track of the valid options.
 	speedLast
-
-	// SpeedBetterCompression will (in the future) yield better compression than the default,
-	// but at approximately 4x the CPU usage of the default.
-	// For now this is not implemented.
-	SpeedBetterCompression = SpeedDefault
-
-	// SpeedBestCompression will choose the best available compression option.
-	// For now this is not implemented.
-	SpeedBestCompression = SpeedDefault
 )
 
 // EncoderLevelFromString will convert a string representation of an encoding level back
 // to a compression level. The compare is not case sensitive.
 // If the string wasn't recognized, (false, SpeedDefault) will be returned.
 func EncoderLevelFromString(s string) (bool, EncoderLevel) {
-	for l := EncoderLevel(speedNotSet + 1); l < speedLast; l++ {
+	for l := speedNotSet + 1; l < speedLast; l++ {
 		if strings.EqualFold(s, l.String()) {
 			return true, l
 		}
@@ -163,8 +184,12 @@
 	switch {
 	case level < 3:
 		return SpeedFastest
-	case level >= 3:
+	case level >= 3 && level < 6:
 		return SpeedDefault
+	case level >= 6 && level < 10:
+		return SpeedBetterCompression
+	case level >= 10:
+		return SpeedBestCompression
 	}
 	return SpeedDefault
 }
@@ -176,6 +201,10 @@
 		return "fastest"
 	case SpeedDefault:
 		return "default"
+	case SpeedBetterCompression:
+		return "better"
+	case SpeedBestCompression:
+		return "best"
 	default:
 		return "invalid"
 	}
@@ -189,6 +218,22 @@
 			return fmt.Errorf("unknown encoder level")
 		}
 		o.level = l
+		if !o.customWindow {
+			switch o.level {
+			case SpeedFastest:
+				o.windowSize = 4 << 20
+			case SpeedDefault:
+				o.windowSize = 8 << 20
+			case SpeedBetterCompression:
+				o.windowSize = 16 << 20
+			case SpeedBestCompression:
+				o.windowSize = 32 << 20
+			}
+		}
+		if !o.customALEntropy {
+			o.allLitEntropy = l > SpeedFastest
+		}
+
 		return nil
 	}
 }
@@ -203,6 +248,18 @@
 	}
 }
 
+// WithAllLitEntropyCompression will apply entropy compression if no matches are found.
+// Disabling this will skip incompressible data faster, but in cases with no matches and
+// a skewed character distribution, some compression will be lost.
+// Default value depends on the compression level selected.
+func WithAllLitEntropyCompression(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.customALEntropy = true
+		o.allLitEntropy = b
+		return nil
+	}
+}
+
 // WithNoEntropyCompression will always skip entropy compression of literals.
 // This can be useful if content has matches, but unlikely to benefit from entropy
 // compression. Usually the slight speed improvement is not worth enabling this.
@@ -229,3 +286,27 @@
 		return nil
 	}
 }
+
+// WithLowerEncoderMem will, in some cases, trade lower memory usage for
+// slower encoding speed.
+// This will not change the window size, which is the primary mechanism for reducing
+// memory usage. See WithWindowSize.
+func WithLowerEncoderMem(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.lowMem = b
+		return nil
+	}
+}
+
+// WithEncoderDict registers a dictionary that will be used when encoding.
+// The encoder *may* choose to use no dictionary instead for certain payloads.
+func WithEncoderDict(dict []byte) EOption {
+	return func(o *encoderOptions) error {
+		d, err := loadDict(dict)
+		if err != nil {
+			return err
+		}
+		o.dict = d
+		return nil
+	}
+}
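
Note: the new options are meant to be combined when constructing an encoder. A sketch, assuming dictBytes holds a raw zstd dictionary (a placeholder here, e.g. produced by `zstd --train`):

package sketch

import "github.com/klauspost/compress/zstd"

// newEncoder builds an encoder using the options added above.
func newEncoder(dictBytes []byte) (*zstd.Encoder, error) {
	opts := []zstd.EOption{
		zstd.WithEncoderLevel(zstd.SpeedBetterCompression),
		zstd.WithLowerEncoderMem(true),
	}
	if len(dictBytes) > 0 {
		// Registers the dictionary; the encoder may still skip it per payload.
		opts = append(opts, zstd.WithEncoderDict(dictBytes))
	}
	// A nil writer is acceptable when only EncodeAll will be used.
	return zstd.NewWriter(nil, opts...)
}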
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 4079074..693c5f0 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -16,16 +16,11 @@
 )
 
 type frameDec struct {
-	o         decoderOptions
-	crc       hash.Hash64
-	frameDone sync.WaitGroup
-	offset    int64
+	o      decoderOptions
+	crc    hash.Hash64
+	offset int64
 
-	WindowSize       uint64
-	DictionaryID     uint32
-	FrameContentSize uint64
-	HasCheckSum      bool
-	SingleSegment    bool
+	WindowSize uint64
 
 	// maxWindowSize is the maximum windows size to support.
 	// should never be bigger than max-int.
@@ -42,15 +37,22 @@
 	// Byte buffer that can be reused for small input blocks.
 	bBuf byteBuf
 
+	FrameContentSize uint64
+	frameDone        sync.WaitGroup
+
+	DictionaryID  *uint32
+	HasCheckSum   bool
+	SingleSegment bool
+
 	// asyncRunning indicates whether the async routine processes input on 'decoding'.
-	asyncRunning   bool
 	asyncRunningMu sync.Mutex
+	asyncRunning   bool
 }
 
 const (
 	// The minimum Window_Size is 1 KB.
 	MinWindowSize = 1 << 10
-	MaxWindowSize = 1 << 30
+	MaxWindowSize = 1 << 29
 )
 
 var (
@@ -119,7 +121,7 @@
 	d.SingleSegment = fhd&(1<<5) != 0
 
 	if fhd&(1<<3) != 0 {
-		return errors.New("Reserved bit set on frame header")
+		return errors.New("reserved bit set on frame header")
 	}
 
 	// Read Window_Descriptor
@@ -140,7 +142,7 @@
 
 	// Read Dictionary_ID
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = 0
+	d.DictionaryID = nil
 	if size := fhd & 3; size != 0 {
 		if size == 3 {
 			size = 4
@@ -152,19 +154,22 @@
 			}
 			return io.ErrUnexpectedEOF
 		}
+		var id uint32
 		switch size {
 		case 1:
-			d.DictionaryID = uint32(b[0])
+			id = uint32(b[0])
 		case 2:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+			id = uint32(b[0]) | (uint32(b[1]) << 8)
 		case 4:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		}
 		if debug {
-			println("Dict size", size, "ID:", d.DictionaryID)
+			println("Dict size", size, "ID:", id)
 		}
-		if d.DictionaryID != 0 {
-			return ErrUnknownDictionary
+		if id > 0 {
+			// ID 0 means "sorry, no dictionary anyway".
+			// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
+			d.DictionaryID = &id
 		}
 	}
 
@@ -231,7 +236,11 @@
 		return ErrWindowSizeTooSmall
 	}
 	d.history.windowSize = int(d.WindowSize)
-	d.history.maxSize = d.history.windowSize + maxBlockSize
+	if d.o.lowMem && d.history.windowSize < maxBlockSize {
+		d.history.maxSize = d.history.windowSize * 2
+	} else {
+		d.history.maxSize = d.history.windowSize + maxBlockSize
+	}
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -318,8 +327,8 @@
 
 func (d *frameDec) initAsync() {
 	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 20MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize*10
+		// set max extra size history to 10MB.
+		d.history.maxSize = d.history.windowSize + maxBlockSize*5
 	}
 	// re-alloc if more than one extra block size.
 	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
@@ -345,8 +354,6 @@
 // When the frame has finished decoding the *bufio.Reader
 // containing the remaining input will be sent on frameDec.frameDone.
 func (d *frameDec) startDecoder(output chan decodeOutput) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	written := int64(0)
 
 	defer func() {
@@ -439,8 +446,6 @@
 
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	saved := d.history.b
 
 	// We use the history for output to avoid copying it.
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
index 4479cfe..4ef7f5a 100644
--- a/vendor/github.com/klauspost/compress/zstd/frameenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go
@@ -5,6 +5,7 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
@@ -16,7 +17,7 @@
 	WindowSize    uint32
 	SingleSegment bool
 	Checksum      bool
-	DictID        uint32 // Not stored.
+	DictID        uint32
 }
 
 const maxHeaderSize = 14
@@ -30,6 +31,24 @@
 	if f.SingleSegment {
 		fhd |= 1 << 5
 	}
+
+	var dictIDContent []byte
+	if f.DictID > 0 {
+		var tmp [4]byte
+		if f.DictID < 256 {
+			fhd |= 1
+			tmp[0] = uint8(f.DictID)
+			dictIDContent = tmp[:1]
+		} else if f.DictID < 1<<16 {
+			fhd |= 2
+			binary.LittleEndian.PutUint16(tmp[:2], uint16(f.DictID))
+			dictIDContent = tmp[:2]
+		} else {
+			fhd |= 3
+			binary.LittleEndian.PutUint32(tmp[:4], f.DictID)
+			dictIDContent = tmp[:4]
+		}
+	}
 	var fcs uint8
 	if f.ContentSize >= 256 {
 		fcs++
@@ -40,6 +59,7 @@
 	if f.ContentSize >= 0xffffffff {
 		fcs++
 	}
+
 	fhd |= fcs << 6
 
 	dst = append(dst, fhd)
@@ -48,7 +68,9 @@
 		windowLog := (bits.Len32(f.WindowSize-1) - winLogMin) << 3
 		dst = append(dst, uint8(windowLog))
 	}
-
+	if f.DictID > 0 {
+		dst = append(dst, dictIDContent...)
+	}
 	switch fcs {
 	case 0:
 		if f.SingleSegment {
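
Note: the low two bits of the frame header descriptor select the Dictionary_ID width. A compact, illustrative restatement of the sizing rule used by appendTo above (not part of the package):

// dictIDFieldSize returns the descriptor flag (low two bits) and the number of
// little-endian Dictionary_ID bytes written after the window descriptor.
func dictIDFieldSize(dictID uint32) (flag uint8, size int) {
	switch {
	case dictID == 0:
		return 0, 0 // no Dictionary_ID field
	case dictID < 256:
		return 1, 1
	case dictID < 1<<16:
		return 2, 2
	default:
		return 3, 4
	}
}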
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index 9efe34f..e6d3d49 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -19,7 +19,7 @@
 	 *  Increasing memory usage improves compression ratio
 	 *  Reduced memory usage can improve speed, due to cache effect
 	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage = 11
+	maxMemoryUsage = tablelogAbsoluteMax + 2
 
 	maxTableLog    = maxMemoryUsage - 2
 	maxTablesize   = 1 << maxTableLog
@@ -55,7 +55,7 @@
 	if b.remain() < 4 {
 		return errors.New("input too small")
 	}
-	bitStream := b.Uint32()
+	bitStream := b.Uint32NC()
 	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
 	if nbBits > tablelogAbsoluteMax {
 		println("Invalid tablelog:", nbBits)
@@ -79,7 +79,8 @@
 				n0 += 24
 				if r := b.remain(); r > 5 {
 					b.advance(2)
-					bitStream = b.Uint32() >> bitCount
+					// The check above should make sure we can read 32 bits
+					bitStream = b.Uint32NC() >> bitCount
 				} else {
 					// end of bit stream
 					bitStream >>= 16
@@ -104,10 +105,11 @@
 				charnum++
 			}
 
-			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 				b.advance(bitCount >> 3)
 				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
+				// The check above should make sure we can read 32 bits
+				bitStream = b.Uint32NC() >> bitCount
 			} else {
 				bitStream >>= 2
 			}
@@ -118,7 +120,7 @@
 
 		if int32(bitStream)&(threshold-1) < max {
 			count = int32(bitStream) & (threshold - 1)
-			if debug && nbBits < 1 {
+			if debugAsserts && nbBits < 1 {
 				panic("nbBits underflow")
 			}
 			bitCount += nbBits - 1
@@ -148,17 +150,16 @@
 			threshold >>= 1
 		}
 
-		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
-		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
+			// The check above should make sure we can read 32 bits
+			bitStream = b.Uint32NC() >> (bitCount & 31)
 		} else {
 			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
 			b.off = len(b.b) - 4
-			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+			bitStream = b.Uint32() >> (bitCount & 31)
 		}
-		bitStream = b.Uint32() >> (bitCount & 31)
-		//printf("bitstream is now: 0b%b", bitStream)
 	}
 	s.symbolLen = charnum
 	if s.symbolLen <= 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index 619836f..c74681b 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -97,7 +97,7 @@
 func (s *fseEncoder) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -202,13 +202,13 @@
 			case 0:
 			case -1, 1:
 				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = int16(total - 1)
+				symbolTT[i].deltaFindState = total - 1
 				total++
 			default:
 				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
 				minStatePlus := uint32(v) << maxBitsOut
 				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = int16(total - v)
+				symbolTT[i].deltaFindState = total - v
 				total += v
 			}
 		}
@@ -327,7 +327,7 @@
 		if err != nil {
 			return err
 		}
-		if debug {
+		if debugAsserts {
 			err = s.validateNorm()
 			if err != nil {
 				return err
@@ -336,7 +336,7 @@
 		return s.buildCTable()
 	}
 	s.norm[largest] += stillToDistribute
-	if debug {
+	if debugAsserts {
 		err := s.validateNorm()
 		if err != nil {
 			return err
@@ -353,8 +353,8 @@
 		distributed  uint32
 		total        = uint32(length)
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -379,7 +379,7 @@
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
@@ -619,7 +619,7 @@
 func (s *fseEncoder) bitCost(symbolValue uint8, accuracyLog uint32) uint32 {
 	minNbBits := s.ct.symbolTT[symbolValue].deltaNbBits >> 16
 	threshold := (minNbBits + 1) << 16
-	if debug {
+	if debugAsserts {
 		if !(s.actualTableLog < 16) {
 			panic("!s.actualTableLog < 16")
 		}
@@ -633,7 +633,7 @@
 	// linear interpolation (very approximate)
 	normalizedDeltaFromThreshold := (deltaFromThreshold << accuracyLog) >> s.actualTableLog
 	bitMultiplier := uint32(1) << accuracyLog
-	if debug {
+	if debugAsserts {
 		if s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold {
 			panic("s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold")
 		}
@@ -708,7 +708,6 @@
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + int32(first.deltaFindState)
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
index 6c17dc1..474cb77 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
@@ -59,7 +59,7 @@
 	}
 	for i, bit := range bits {
 		if base > math.MaxInt32 {
-			panic(fmt.Sprintf("invalid decoding table, base overflows int32"))
+			panic("invalid decoding table, base overflows int32")
 		}
 
 		dst[i] = baseOffset{
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index e8c419b..f783e32 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -17,6 +17,7 @@
 	windowSize    int
 	maxSize       int
 	error         bool
+	dict          *dict
 }
 
 // reset will reset the history to initial state of a frame.
@@ -36,12 +37,27 @@
 	}
 	h.decoders = sequenceDecs{}
 	if h.huffTree != nil {
-		huffDecoderPool.Put(h.huffTree)
+		if h.dict == nil || h.dict.litEnc != h.huffTree {
+			huffDecoderPool.Put(h.huffTree)
+		}
 	}
 	h.huffTree = nil
+	h.dict = nil
 	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
 }
 
+func (h *history) setDict(dict *dict) {
+	if dict == nil {
+		return
+	}
+	h.dict = dict
+	h.decoders.litLengths = dict.llDec
+	h.decoders.offsets = dict.ofDec
+	h.decoders.matchLengths = dict.mlDec
+	h.recentOffsets = dict.offsets
+	h.huffTree = dict.litEnc
+}
+
 // append bytes to history.
 // This function will make sure there is space for it,
 // if the buffer has been allocated with enough extra space.
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index d580e32..2c9c535 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -179,13 +179,13 @@
 	MOVQ ·prime2v(SB), R14
 
 	// Load slice.
-	MOVQ b_base+8(FP), CX
-	MOVQ b_len+16(FP), DX
+	MOVQ arg1_base+8(FP), CX
+	MOVQ arg1_len+16(FP), DX
 	LEAQ (CX)(DX*1), BX
 	SUBQ $32, BX
 
 	// Load vN from d.
-	MOVQ d+0(FP), AX
+	MOVQ arg+0(FP), AX
 	MOVQ 0(AX), R8   // v1
 	MOVQ 8(AX), R9   // v2
 	MOVQ 16(AX), R10 // v3
@@ -209,7 +209,7 @@
 	MOVQ R11, 24(AX)
 
 	// The number of bytes written is CX minus the old base pointer.
-	SUBQ b_base+8(FP), CX
+	SUBQ arg1_base+8(FP), CX
 	MOVQ CX, ret+32(FP)
 
 	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 15a45f7..1dd39e6 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -62,8 +62,10 @@
 	matchLengths sequenceDec
 	prevOffset   [3]int
 	hist         []byte
+	dict         []byte
 	literals     []byte
 	out          []byte
+	windowSize   int
 	maxBits      uint8
 }
 
@@ -82,7 +84,12 @@
 	s.hist = hist.b
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
+	s.windowSize = hist.windowSize
 	s.out = out
+	s.dict = nil
+	if hist.dict != nil {
+		s.dict = hist.dict.content
+	}
 	return nil
 }
 
@@ -98,76 +105,159 @@
 			printf("reading sequence %d, exceeded available data\n", seqs-i)
 			return io.ErrUnexpectedEOF
 		}
-		var litLen, matchOff, matchLen int
+		var ll, mo, ml int
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
 			br.fillFast()
 		} else {
-			litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
 			br.fill()
 		}
 
 		if debugSequences {
-			println("Seq", seqs-i-1, "Litlen:", litLen, "matchOff:", matchOff, "(abs) matchLen:", matchLen)
+			println("Seq", seqs-i-1, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
 		}
 
-		if litLen > len(s.literals) {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", litLen, len(s.literals))
+		if ll > len(s.literals) {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
 		}
-		size := litLen + matchLen + len(s.out)
+		size := ll + ml + len(s.out)
 		if size-startSize > maxBlockSize {
 			return fmt.Errorf("output (%d) bigger than max block size", size)
 		}
 		if size > cap(s.out) {
-			// Not enough size, will be extremely rarely triggered,
+			// Not enough size, which can happen under high volume block streaming conditions
 			// but could be if destination slice is too small for sync operations.
-			// We add maxBlockSize to the capacity.
-			s.out = append(s.out, make([]byte, maxBlockSize)...)
-			s.out = s.out[:len(s.out)-maxBlockSize]
+			// over-allocating here can create a large amount of GC pressure so we try to keep
+			// it as contained as possible
+			used := len(s.out) - startSize
+			addBytes := 256 + ll + ml + used>>2
+			// Clamp to max block size.
+			if used+addBytes > maxBlockSize {
+				addBytes = maxBlockSize - used
+			}
+			s.out = append(s.out, make([]byte, addBytes)...)
+			s.out = s.out[:len(s.out)-addBytes]
 		}
-		if matchLen > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", matchLen)
-		}
-		if matchOff > len(s.out)+len(hist)+litLen {
-			return fmt.Errorf("match offset (%d) bigger than current history (%d)", matchOff, len(s.out)+len(hist)+litLen)
-		}
-		if matchOff == 0 && matchLen > 0 {
-			return fmt.Errorf("zero matchoff and matchlen > 0")
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 
-		s.out = append(s.out, s.literals[:litLen]...)
-		s.literals = s.literals[litLen:]
+		// Add literals
+		s.out = append(s.out, s.literals[:ll]...)
+		s.literals = s.literals[ll:]
 		out := s.out
 
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+
+		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+
+			// we may be in dictionary.
+			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+			end := dictO + ml
+			if end > len(s.dict) {
+				out = append(out, s.dict[dictO:]...)
+				mo -= len(s.dict) - dictO
+				ml -= len(s.dict) - dictO
+			} else {
+				out = append(out, s.dict[dictO:end]...)
+				mo = 0
+				ml = 0
+			}
+		}
+
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := matchOff - len(s.out); v > 0 {
+		if v := mo - len(s.out); v > 0 {
 			// v is the start position in history from end.
 			start := len(s.hist) - v
-			if matchLen > v {
+			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
 				out = append(out, s.hist[start:]...)
-				matchOff -= v
-				matchLen -= v
+				mo -= v
+				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+matchLen]...)
-				matchLen = 0
+				out = append(out, s.hist[start:start+ml]...)
+				ml = 0
 			}
 		}
 		// We must be in current buffer now
-		if matchLen > 0 {
-			start := len(s.out) - matchOff
-			if matchLen <= len(s.out)-start {
+		if ml > 0 {
+			start := len(s.out) - mo
+			if ml <= len(s.out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+matchLen]...)
+				out = append(out, s.out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
-				out = out[:len(out)+matchLen]
-				src := out[start : start+matchLen]
+				out = out[:len(out)+ml]
+				src := out[start : start+ml]
 				// Destination is the space we just added.
-				dst := out[len(out)-matchLen:]
+				dst := out[len(out)-ml:]
 				dst = dst[:len(src)]
 				for i := range src {
 					dst[i] = src[i]
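
Note: the dictionary handling above treats the dictionary content as virtual history preceding the frame: an offset reaching past decoded output plus window history is resolved from the tail of the dictionary, possibly continuing into history/output. An illustrative sketch with error handling elided:

// resolveDictMatch serves a match whose offset reaches into the dictionary.
// It returns the extended output and the remaining offset/length, if any.
func resolveDictMatch(out, hist, dict []byte, mo, ml int) (dst []byte, restMo, restMl int) {
	// Distance past the available history, counted back from the dictionary end.
	dictO := len(dict) - (mo - (len(out) + len(hist)))
	if end := dictO + ml; end <= len(dict) {
		// The entire match is inside the dictionary.
		return append(out, dict[dictO:end]...), 0, 0
	}
	// The match starts in the dictionary and continues in history/output.
	n := len(dict) - dictO
	return append(out, dict[dictO:]...), mo - n, ml - n
}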
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
index 36bcc3c..8014174 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go
@@ -35,7 +35,6 @@
 		// Ensure we cannot reuse by accident
 		prevEnc := *prev
 		prevEnc.symbolLen = 0
-		return
 	}
 	compareSwap(ll, &s.llEnc, &s.llPrev)
 	compareSwap(ml, &s.mlEnc, &s.mlPrev)
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 356956b..9d9d1d5 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -10,8 +10,8 @@
 	"hash/crc32"
 	"io"
 
+	"github.com/golang/snappy"
 	"github.com/klauspost/compress/huff0"
-	"github.com/klauspost/compress/snappy"
 )
 
 const (
@@ -111,7 +111,7 @@
 			// Add empty last block
 			r.block.reset(nil)
 			r.block.last = true
-			err := r.block.encodeLits(false)
+			err := r.block.encodeLits(r.block.literals, false)
 			if err != nil {
 				return written, err
 			}
@@ -178,17 +178,16 @@
 				r.err = ErrSnappyCorrupt
 				return written, r.err
 			}
-			err = r.block.encode(false)
+			err = r.block.encode(nil, false, false)
 			switch err {
 			case errIncompressible:
 				r.block.popOffsets()
 				r.block.reset(nil)
 				r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
 				if err != nil {
-					println("snappy.Decode:", err)
 					return written, err
 				}
-				err = r.block.encodeLits(false)
+				err = r.block.encodeLits(r.block.literals, false)
 				if err != nil {
 					return written, err
 				}
@@ -235,7 +234,7 @@
 				r.err = ErrSnappyCorrupt
 				return written, r.err
 			}
-			err := r.block.encodeLits(false)
+			err := r.block.encodeLits(r.block.literals, false)
 			if err != nil {
 				return written, err
 			}
@@ -417,7 +416,7 @@
 // https://github.com/google/snappy/blob/master/framing_format.txt
 func snappyCRC(b []byte) uint32 {
 	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
+	return c>>15 | c<<17 + 0xa282ead8
 }
 
 // snappyDecodedLen returns the length of the decoded block and the number of bytes
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
new file mode 100644
index 0000000..e35a0a2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -0,0 +1,120 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"errors"
+	"io"
+	"sync"
+)
+
+// ZipMethodWinZip is the method for Zstandard compressed data inside Zip files for WinZip.
+// See https://www.winzip.com/win/en/comp_info.html
+const ZipMethodWinZip = 93
+
+// ZipMethodPKWare is the method number used by PKWARE to indicate Zstandard compression.
+// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.7.TXT
+const ZipMethodPKWare = 20
+
+var zipReaderPool sync.Pool
+
+// newZipReader cannot be used since we would leak goroutines...
+func newZipReader(r io.Reader) io.ReadCloser {
+	dec, ok := zipReaderPool.Get().(*Decoder)
+	if ok {
+		dec.Reset(r)
+	} else {
+		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
+		if err != nil {
+			panic(err)
+		}
+		dec = d
+	}
+	return &pooledZipReader{dec: dec}
+}
+
+type pooledZipReader struct {
+	mu  sync.Mutex // guards Close and Read
+	dec *Decoder
+}
+
+func (r *pooledZipReader) Read(p []byte) (n int, err error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.dec == nil {
+		return 0, errors.New("Read after Close")
+	}
+	dec, err := r.dec.Read(p)
+
+	return dec, err
+}
+
+func (r *pooledZipReader) Close() error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	var err error
+	if r.dec != nil {
+		err = r.dec.Reset(nil)
+		zipReaderPool.Put(r.dec)
+		r.dec = nil
+	}
+	return err
+}
+
+type pooledZipWriter struct {
+	mu  sync.Mutex // guards Close and Read
+	enc *Encoder
+}
+
+func (w *pooledZipWriter) Write(p []byte) (n int, err error) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.enc == nil {
+		return 0, errors.New("Write after Close")
+	}
+	return w.enc.Write(p)
+}
+
+func (w *pooledZipWriter) Close() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	var err error
+	if w.enc != nil {
+		err = w.enc.Close()
+		zipReaderPool.Put(w.enc)
+		w.enc = nil
+	}
+	return err
+}
+
+// ZipCompressor returns a compressor that can be registered with zip libraries.
+// The provided encoder options will be used on all encodes.
+func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
+	var pool sync.Pool
+	return func(w io.Writer) (io.WriteCloser, error) {
+		enc, ok := pool.Get().(*Encoder)
+		if ok {
+			enc.Reset(w)
+		} else {
+			var err error
+			enc, err = NewWriter(w, opts...)
+			if err != nil {
+				return nil, err
+			}
+		}
+		return &pooledZipWriter{enc: enc}, nil
+	}
+}
+
+// ZipDecompressor returns a decompressor that can be registered with zip libraries.
+// See ZipCompressor for example.
+func ZipDecompressor() func(r io.Reader) io.ReadCloser {
+	return func(r io.Reader) io.ReadCloser {
+		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
+		if err != nil {
+			panic(err)
+		}
+		return d.IOReadCloser()
+	}
+}
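
Note: the ZipCompressor/ZipDecompressor hooks are intended to be registered with archive/zip. A usage sketch:

package sketch

import (
	"archive/zip"
	"bytes"
	"io"

	"github.com/klauspost/compress/zstd"
)

// writeZstdZip writes one zstd-compressed entry using the WinZip method ID.
func writeZstdZip(name string, payload []byte) ([]byte, error) {
	var buf bytes.Buffer
	zw := zip.NewWriter(&buf)
	zw.RegisterCompressor(zstd.ZipMethodWinZip, zstd.ZipCompressor(zstd.WithEncoderLevel(zstd.SpeedFastest)))
	w, err := zw.CreateHeader(&zip.FileHeader{Name: name, Method: zstd.ZipMethodWinZip})
	if err != nil {
		return nil, err
	}
	if _, err := w.Write(payload); err != nil {
		return nil, err
	}
	if err := zw.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// readZstdZip reads the first entry back, registering the decompressor.
func readZstdZip(data []byte) ([]byte, error) {
	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return nil, err
	}
	zr.RegisterDecompressor(zstd.ZipMethodWinZip, zstd.ZipDecompressor())
	rc, err := zr.File[0].Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return io.ReadAll(rc)
}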
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 57a8a2f..1ba308c 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -4,13 +4,24 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"log"
+	"math"
 	"math/bits"
 )
 
+// enable debug printing
 const debug = false
+
+// Enable extra assertions.
+const debugAsserts = debug || false
+
+// print sequence details
 const debugSequences = false
+
+// print detailed matching information
 const debugMatches = false
 
 // force encoder to use predefined tables.
@@ -19,6 +30,9 @@
 // zstdMinMatch is the minimum zstd match length.
 const zstdMinMatch = 3
 
+// Reset the buffer offset when reaching this.
+const bufferReset = math.MaxInt32 - MaxWindowSize
+
 var (
 	// ErrReservedBlockType is returned when a reserved block type is found.
 	// Typically this indicates wrong or corrupted input.
@@ -61,6 +75,10 @@
 	// ErrDecoderClosed will be returned if the Decoder was used after
 	// Close has been called.
 	ErrDecoderClosed = errors.New("decoder used after Close")
+
+	// ErrDecoderNilInput is returned when a nil Reader was provided
+	// and an operation other than Reset/DecodeAll/Close was attempted.
+	ErrDecoderNilInput = errors.New("nil input provided as reader")
 )
 
 func println(a ...interface{}) {
@@ -75,6 +93,17 @@
 	}
 }
 
+// matchLenFast does matching, but will not match the last up to 7 bytes.
+func matchLenFast(a, b []byte) int {
+	endI := len(a) & (math.MaxInt32 - 7)
+	for i := 0; i < endI; i += 8 {
+		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+			return i + bits.TrailingZeros64(diff)>>3
+		}
+	}
+	return endI
+}
+
 // matchLen returns the maximum length.
 // a must be the shortest of the two.
 // The function also returns whether all bytes matched.
@@ -85,52 +114,33 @@
 			return i + (bits.TrailingZeros64(diff) >> 3)
 		}
 	}
+
 	checked := (len(a) >> 3) << 3
 	a = a[checked:]
 	b = b[checked:]
-	// TODO: We could do a 4 check.
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 		}
 	}
 	return len(a) + checked
 }
 
-// matchLen returns a match length in src between index s and t
-func matchLenIn(src []byte, s, t int32) int32 {
-	s1 := len(src)
-	b := src[t:]
-	a := src[s:s1]
-	b = b[:len(a)]
-	// Extend the match to be as long as possible.
-	for i := range a {
-		if a[i] != b[i] {
-			return int32(i)
-		}
-	}
-	return int32(len(a))
-}
-
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
+
+type byter interface {
+	Bytes() []byte
+	Len() int
+}
+
+var _ byter = &bytes.Buffer{}
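
Note: load64/load6432 now compile to single little-endian loads, which is what keeps the XOR/TrailingZeros match-length trick cheap. An illustrative restatement of the 8-bytes-at-a-time loop over plain slices (assumes a is not longer than b):

package sketch

import (
	"encoding/binary"
	"math/bits"
)

// matchLen8 compares 8 bytes at a time; XOR of two little-endian words puts
// the first differing byte at the lowest set bits, so TrailingZeros64/8 is its
// index. Like matchLenFast above, the final <8 bytes are not compared.
func matchLen8(a, b []byte) int {
	n := len(a) &^ 7
	for i := 0; i < n; i += 8 {
		if diff := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:]); diff != 0 {
			return i + bits.TrailingZeros64(diff)>>3
		}
	}
	return n
}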