VOL-2112 move to voltha-lib-go

Change-Id: Ic1af08003c1d2c698c0cce371e64f47b47b8d875
diff --git a/vendor/github.com/pierrec/lz4/.gitignore b/vendor/github.com/pierrec/lz4/.gitignore
index e48bab3..5e98735 100644
--- a/vendor/github.com/pierrec/lz4/.gitignore
+++ b/vendor/github.com/pierrec/lz4/.gitignore
@@ -30,4 +30,5 @@
 
 # End of https://www.gitignore.io/api/macos
 
-lz4c/lz4c
+cmd/*/*exe
+.idea
\ No newline at end of file
diff --git a/vendor/github.com/pierrec/lz4/.travis.yml b/vendor/github.com/pierrec/lz4/.travis.yml
index b2c806d..fd6c6db 100644
--- a/vendor/github.com/pierrec/lz4/.travis.yml
+++ b/vendor/github.com/pierrec/lz4/.travis.yml
@@ -1,9 +1,13 @@
 language: go
 
+env:
+  - GO111MODULE=off
+
 go:
-  - 1.8.x
   - 1.9.x
   - 1.10.x
+  - 1.11.x
+  - 1.12.x
   - master
 
 matrix:
@@ -16,3 +20,5 @@
 script: 
  - go test -v -cpu=2
  - go test -v -cpu=2 -race
+ - go test -v -cpu=2 -tags noasm
+ - go test -v -cpu=2 -race -tags noasm
diff --git a/vendor/github.com/pierrec/lz4/README.md b/vendor/github.com/pierrec/lz4/README.md
index 50a10ee..be1f52a 100644
--- a/vendor/github.com/pierrec/lz4/README.md
+++ b/vendor/github.com/pierrec/lz4/README.md
@@ -1,24 +1,106 @@
-[![godoc](https://godoc.org/github.com/pierrec/lz4?status.png)](https://godoc.org/github.com/pierrec/lz4)
+# lz4 : LZ4 compression in pure Go
 
-# lz4
-LZ4 compression and decompression in pure Go.
+[![GoDoc](https://godoc.org/github.com/pierrec/lz4?status.svg)](https://godoc.org/github.com/pierrec/lz4)
+[![Build Status](https://travis-ci.org/pierrec/lz4.svg?branch=master)](https://travis-ci.org/pierrec/lz4)
+[![Go Report Card](https://goreportcard.com/badge/github.com/pierrec/lz4)](https://goreportcard.com/report/github.com/pierrec/lz4)
+[![GitHub tag (latest SemVer)](https://img.shields.io/github/tag/pierrec/lz4.svg?style=social)](https://github.com/pierrec/lz4/tags)
 
-## Usage
+## Overview
 
-```go
-import "github.com/pierrec/lz4"
+This package provides a streaming interface to [LZ4 data streams](http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html) as well as low level compress and uncompress functions for LZ4 data blocks.
+The implementation is based on the reference C [one](https://github.com/lz4/lz4).
+
+## Install
+
+Assuming you have the go toolchain installed:
+
+```
+go get github.com/pierrec/lz4
 ```
 
-## Description
-Package lz4 implements reading and writing lz4 compressed data (a frame),
-as specified in http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html.
+There is a command line interface tool to compress and decompress LZ4 files.
 
-This package is **compatible with the LZ4 frame format** although the block level compression 
-and decompression functions are exposed and are fully compatible with the lz4 block format 
-definition, they are low level and should not be used directly.
+```
+go install github.com/pierrec/lz4/cmd/lz4c
+```
 
-For a complete description of an lz4 compressed block, see:
-http://fastcompression.blogspot.fr/2011/05/lz4-explained.html
+Usage
 
-See https://github.com/Cyan4973/lz4 for the reference C implementation.
+```
+Usage of lz4c:
+  -version
+        print the program version
 
+Subcommands:
+Compress the given files or from stdin to stdout.
+compress [arguments] [<file name> ...]
+  -bc
+        enable block checksum
+  -l int
+        compression level (0=fastest)
+  -sc
+        disable stream checksum
+  -size string
+        block max size [64K,256K,1M,4M] (default "4M")
+
+Uncompress the given files or from stdin to stdout.
+uncompress [arguments] [<file name> ...]
+
+```
+
+
+## Example
+
+```
+// Compress and uncompress an input string.
+s := "hello world"
+r := strings.NewReader(s)
+
+// The pipe will uncompress the data from the writer.
+pr, pw := io.Pipe()
+zw := lz4.NewWriter(pw)
+zr := lz4.NewReader(pr)
+
+go func() {
+	// Compress the input string.
+	_, _ = io.Copy(zw, r)
+	_ = zw.Close() // Make sure the writer is closed
+	_ = pw.Close() // Terminate the pipe
+}()
+
+_, _ = io.Copy(os.Stdout, zr)
+
+// Output:
+// hello world
+```
+
+## Contributing
+
+Contributions such as bug fixes and performance improvements are very welcome!
+
+- Open an issue with a proper description
+- Send a pull request with appropriate test case(s)
+
+## Contributors
+
+Thanks to all contributors so far:
+
+- [@klauspost](https://github.com/klauspost)
+- [@heidawei](https://github.com/heidawei)
+- [@x4m](https://github.com/x4m)
+- [@Zariel](https://github.com/Zariel)
+- [@edwingeng](https://github.com/edwingeng)
+- [@danielmoy-google](https://github.com/danielmoy-google)
+- [@honda-tatsuya](https://github.com/honda-tatsuya)
+- [@h8liu](https://github.com/h8liu)
+- [@sbinet](https://github.com/sbinet)
+- [@fingon](https://github.com/fingon)
+- [@emfree](https://github.com/emfree)
+- [@lhemala](https://github.com/lhemala)
+- [@connor4312](https://github.com/connor4312)
+- [@oov](https://github.com/oov)
+- [@arya](https://github.com/arya)
+- [@ikkeps](https://github.com/ikkeps)
+
+Special thanks to [@Zariel](https://github.com/Zariel) for his asm implementation of the decoder.
+Special thanks to [@klauspost](https://github.com/klauspost) for his work on optimizing the code.
diff --git a/vendor/github.com/pierrec/lz4/block.go b/vendor/github.com/pierrec/lz4/block.go
index ef24f17..5755cda 100644
--- a/vendor/github.com/pierrec/lz4/block.go
+++ b/vendor/github.com/pierrec/lz4/block.go
@@ -2,21 +2,14 @@
 
 import (
 	"encoding/binary"
-	"errors"
+	"fmt"
+	"math/bits"
 )
 
-var (
-	// ErrInvalidSourceShortBuffer is returned by UncompressBlock or CompressBLock when a compressed
-	// block is corrupted or the destination buffer is not large enough for the uncompressed data.
-	ErrInvalidSourceShortBuffer = errors.New("lz4: invalid source or destination buffer too short")
-	// ErrInvalid is returned when reading an invalid LZ4 archive.
-	ErrInvalid = errors.New("lz4: bad magic number")
-)
-
-// blockHash hashes 4 bytes into a value < winSize.
-func blockHash(x uint32) uint32 {
-	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
-	return x * hasher >> hashShift
+// blockHash hashes the lower 6 bytes into a value < htSize.
+func blockHash(x uint64) uint32 {
+	const prime6bytes = 227718039650203
+	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
 }
 
 // CompressBlockBound returns the maximum size of a given buffer of size n, when not compressible.
@@ -30,75 +23,14 @@
 // The destination buffer must be sized appropriately.
 //
 // An error is returned if the source data is invalid or the destination buffer is too small.
-func UncompressBlock(src, dst []byte) (si int, err error) {
-	defer func() {
-		// It is now faster to let the runtime panic and recover on out of bound slice access
-		// than checking indices as we go along.
-		if recover() != nil {
-			err = ErrInvalidSourceShortBuffer
-		}
-	}()
-	sn := len(src)
-	if sn == 0 {
+func UncompressBlock(src, dst []byte) (int, error) {
+	if len(src) == 0 {
 		return 0, nil
 	}
-	var di int
-
-	for {
-		// Literals and match lengths (token).
-		b := int(src[si])
-		si++
-
-		// Literals.
-		if lLen := b >> 4; lLen > 0 {
-			if lLen == 0xF {
-				for src[si] == 0xFF {
-					lLen += 0xFF
-					si++
-				}
-				lLen += int(src[si])
-				si++
-			}
-			i := si
-			si += lLen
-			di += copy(dst[di:], src[i:si])
-
-			if si >= sn {
-				return di, nil
-			}
-		}
-
-		si++
-		_ = src[si] // Bound check elimination.
-		offset := int(src[si-1]) | int(src[si])<<8
-		si++
-
-		// Match.
-		mLen := b & 0xF
-		if mLen == 0xF {
-			for src[si] == 0xFF {
-				mLen += 0xFF
-				si++
-			}
-			mLen += int(src[si])
-			si++
-		}
-		mLen += minMatch
-
-		// Copy the match.
-		i := di - offset
-		if offset > 0 && mLen >= offset {
-			// Efficiently copy the match dst[di-offset:di] into the dst slice.
-			bytesToCopy := offset * (mLen / offset)
-			expanded := dst[i:]
-			for n := offset; n <= bytesToCopy+offset; n *= 2 {
-				copy(expanded[n:], expanded[:n])
-			}
-			di += bytesToCopy
-			mLen -= bytesToCopy
-		}
-		di += copy(dst[di:], dst[i:i+mLen])
+	if di := decodeBlock(dst, src); di >= 0 {
+		return di, nil
 	}
+	return 0, ErrInvalidSourceShortBuffer
 }
 
 // CompressBlock compresses the source buffer into the destination one.
@@ -109,58 +41,98 @@
 //
 // An error is returned if the destination buffer is too small.
 func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
-	defer func() {
-		if recover() != nil {
-			err = ErrInvalidSourceShortBuffer
-		}
-	}()
+	defer recoverBlock(&err)
 
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
 	sn, dn := len(src)-mfLimit, len(dst)
 	if sn <= 0 || dn == 0 {
 		return 0, nil
 	}
-	var si int
+	if len(hashTable) < htSize {
+		return 0, fmt.Errorf("hash table too small, should be at least %d in size", htSize)
+	}
+	// Prove to the compiler the table has at least htSize elements.
+	// The compiler can see that "uint32(x >> (64 - hashLog))" cannot be out of bounds.
+	hashTable = hashTable[:htSize]
+
+	// si: Current position of the search.
+	// anchor: Position of the current literals.
+	var si, anchor int
 
 	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
-	// const accInit = 1 << skipStrength
-
-	anchor := si // Position of the current literals.
-	// acc := accInit // Variable step: improves performance on non-compressible data.
-
 	for si < sn {
-		// Hash the next 4 bytes (sequence)...
-		match := binary.LittleEndian.Uint32(src[si:])
+		// Hash the next 6 bytes (sequence)...
+		match := binary.LittleEndian.Uint64(src[si:])
 		h := blockHash(match)
+		h2 := blockHash(match >> 8)
 
+		// We check a match at s, s+1 and s+2 and pick the first one we get.
+		// Checking 3 only requires us to load the source once.
 		ref := hashTable[h]
+		ref2 := hashTable[h2]
 		hashTable[h] = si
-		if ref >= sn { // Invalid reference (dirty hashtable).
-			si++
-			continue
-		}
+		hashTable[h2] = si + 1
 		offset := si - ref
+
+		// If offset <= 0 we got an old entry in the hash table.
 		if offset <= 0 || offset >= winSize || // Out of window.
-			match != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
-			// si += acc >> skipStrength
-			// acc++
-			si++
-			continue
+			uint32(match) != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
+			// No match. Start calculating another hash.
+			// The processor can usually do this out-of-order.
+			h = blockHash(match >> 16)
+			ref = hashTable[h]
+
+			// Check the second match at si+1
+			si += 1
+			offset = si - ref2
+
+			if offset <= 0 || offset >= winSize ||
+				uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
+				// No match. Check the third match at si+2
+				si += 1
+				offset = si - ref
+				hashTable[h] = si
+
+				if offset <= 0 || offset >= winSize ||
+					uint32(match>>16) != binary.LittleEndian.Uint32(src[ref:]) {
+					// Skip one extra byte (at si+3) before we check 3 matches again.
+					si += 2 + (si-anchor)>>adaptSkipLog
+					continue
+				}
+			}
 		}
 
 		// Match found.
-		// acc = accInit
 		lLen := si - anchor // Literal length.
+		// We already matched 4 bytes.
+		mLen := 4
 
-		// Encode match length part 1.
-		si += minMatch
-		mLen := si // Match length has minMatch already.
-		// Find the longest match, first looking by batches of 8 bytes.
-		for si < sn && binary.LittleEndian.Uint64(src[si:]) == binary.LittleEndian.Uint64(src[si-offset:]) {
-			si += 8
+		// Extend backwards if we can, reducing literals.
+		tOff := si - offset - 1
+		for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
+			si--
+			tOff--
+			lLen--
+			mLen++
 		}
-		// Then byte by byte.
-		for si < sn && src[si] == src[si-offset] {
-			si++
+
+		// Add the match length, so we continue search at the end.
+		// Use mLen to store the offset base.
+		si, mLen = si+mLen, si+minMatch
+
+		// Find the longest match by looking by batches of 8 bytes.
+		for si < sn {
+			x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
+			if x == 0 {
+				si += 8
+			} else {
+				// Stop at the first non-zero (i.e. differing) byte.
+				si += bits.TrailingZeros64(x) >> 3
+				break
+			}
 		}
 
 		mLen = si - mLen
@@ -186,7 +158,7 @@
 		di++
 
 		// Literals.
-		copy(dst[di:], src[anchor:anchor+lLen])
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
 		di += lLen + 2
 		anchor = si
 
@@ -203,6 +175,13 @@
 			dst[di] = byte(mLen)
 			di++
 		}
+		// Check if we can load next values.
+		if si >= sn {
+			break
+		}
+		// Hash match end-2
+		h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
+		hashTable[h] = si - 2
 	}
 
 	if anchor == 0 {
@@ -230,10 +209,16 @@
 		// Incompressible.
 		return 0, nil
 	}
-	di += copy(dst[di:], src[anchor:])
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
 	return di, nil
 }
 
+// blockHashHC hashes 4 bytes into a value < winSize.
+func blockHashHC(x uint32) uint32 {
+	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
+	return x * hasher >> (32 - winSizeLog)
+}
+
 // CompressBlockHC compresses the source buffer src into the destination dst
 // with max search depth (use 0 or negative value for no max).
 //
@@ -243,11 +228,12 @@
 //
 // An error is returned if the destination buffer is too small.
 func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
-	defer func() {
-		if recover() != nil {
-			err = ErrInvalidSourceShortBuffer
-		}
-	}()
+	defer recoverBlock(&err)
+
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
 
 	sn, dn := len(src)-mfLimit, len(dst)
 	if sn <= 0 || dn == 0 {
@@ -256,7 +242,7 @@
 	var si int
 
 	// hashTable: stores the last position found for a given hash
-	// chaingTable: stores previous positions for a given hash
+	// chainTable: stores previous positions for a given hash
 	var hashTable, chainTable [winSize]int
 
 	if depth <= 0 {
@@ -267,7 +253,7 @@
 	for si < sn {
 		// Hash the next 4 bytes (sequence).
 		match := binary.LittleEndian.Uint32(src[si:])
-		h := blockHash(match)
+		h := blockHashHC(match)
 
 		// Follow the chain until out of window and give the longest match.
 		mLen := 0
@@ -280,13 +266,17 @@
 			}
 			ml := 0
 			// Compare the current position with a previous with the same hash.
-			for ml < sn-si && binary.LittleEndian.Uint64(src[next+ml:]) == binary.LittleEndian.Uint64(src[si+ml:]) {
-				ml += 8
+			for ml < sn-si {
+				x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
+				if x == 0 {
+					ml += 8
+				} else {
+					// Stop at the first non-zero (i.e. differing) byte.
+					ml += bits.TrailingZeros64(x) >> 3
+					break
+				}
 			}
-			for ml < sn-si && src[next+ml] == src[si+ml] {
-				ml++
-			}
-			if ml+1 < minMatch || ml <= mLen {
+			if ml < minMatch || ml <= mLen {
 				// Match too small (<minMath) or smaller than the current match.
 				continue
 			}
@@ -301,7 +291,7 @@
 
 		// No match found.
 		if mLen == 0 {
-			si++
+			si += 1 + (si-anchor)>>adaptSkipLog
 			continue
 		}
 
@@ -315,7 +305,7 @@
 		for si, ml := winStart, si+mLen; si < ml; {
 			match >>= 8
 			match |= uint32(src[si+3]) << 24
-			h := blockHash(match)
+			h := blockHashHC(match)
 			chainTable[si&winMask] = hashTable[h]
 			hashTable[h] = si
 			si++
@@ -347,7 +337,7 @@
 		di++
 
 		// Literals.
-		copy(dst[di:], src[anchor:anchor+lLen])
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
 		di += lLen
 		anchor = si
 
@@ -392,6 +382,6 @@
 		// Incompressible.
 		return 0, nil
 	}
-	di += copy(dst[di:], src[anchor:])
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
 	return di, nil
 }
diff --git a/vendor/github.com/pierrec/lz4/decode_amd64.go b/vendor/github.com/pierrec/lz4/decode_amd64.go
new file mode 100644
index 0000000..43cc14f
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/decode_amd64.go
@@ -0,0 +1,8 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package lz4
+
+//go:noescape
+func decodeBlock(dst, src []byte) int
diff --git a/vendor/github.com/pierrec/lz4/decode_amd64.s b/vendor/github.com/pierrec/lz4/decode_amd64.s
new file mode 100644
index 0000000..20fef39
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/decode_amd64.s
@@ -0,0 +1,375 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// AX scratch
+// BX scratch
+// CX scratch
+// DX token
+//
+// DI &dst
+// SI &src
+// R8 &dst + len(dst)
+// R9 &src + len(src)
+// R11 &dst
+// R12 short output end
+// R13 short input end
+// func decodeBlock(dst, src []byte) int
+// using 50 bytes of stack currently
+TEXT ·decodeBlock(SB), NOSPLIT, $64-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, R11
+	MOVQ dst_len+8(FP), R8
+	ADDQ DI, R8
+
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R9
+	ADDQ SI, R9
+
+	// shortcut ends
+	// short output end
+	MOVQ R8, R12
+	SUBQ $32, R12
+	// short input end
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+loop:
+	// for si < len(src)
+	CMPQ SI, R9
+	JGE end
+
+	// token := uint32(src[si])
+	MOVBQZX (SI), DX
+	INCQ SI
+
+	// lit_len = token >> 4
+	// if lit_len > 0
+	// CX = lit_len
+	MOVQ DX, CX
+	SHRQ $4, CX
+
+	// if lit_len != 0xF
+	CMPQ CX, $0xF
+	JEQ lit_len_loop_pre
+	CMPQ DI, R12
+	JGE lit_len_loop_pre
+	CMPQ SI, R13
+	JGE lit_len_loop_pre
+
+	// copy shortcut
+
+	// A two-stage shortcut for the most common case:
+	// 1) If the literal length is 0..14, and there is enough space,
+	// enter the shortcut and copy 16 bytes on behalf of the literals
+	// (in the fast mode, only 8 bytes can be safely copied this way).
+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
+	// manner; but we ensure that there's enough space in the output for
+	// those 18 bytes earlier, upon entering the shortcut (in other words,
+	// there is a combined check for both stages).
+
+	// copy literal
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+	ADDQ CX, DI
+	ADDQ CX, SI
+
+	MOVQ DX, CX
+	ANDQ $0xF, CX
+
+	// The second stage: prepare for match copying, decode full info.
+	// If it doesn't work out, the info won't be wasted.
+	// offset := uint16(data[:2])
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	MOVQ DI, AX
+	SUBQ DX, AX
+	CMPQ AX, DI
+	JGT err_short_buf
+
+	// if we can't do the second stage then jump straight to read the
+	// match length, we already have the offset.
+	CMPQ CX, $0xF
+	JEQ match_len_loop_pre
+	CMPQ DX, $8
+	JLT match_len_loop_pre
+	CMPQ AX, R11
+	JLT err_short_buf
+
+	// memcpy(op + 0, match + 0, 8);
+	MOVQ (AX), BX
+	MOVQ BX, (DI)
+	// memcpy(op + 8, match + 8, 8);
+	MOVQ 8(AX), BX
+	MOVQ BX, 8(DI)
+	// memcpy(op +16, match +16, 2);
+	MOVW 16(AX), BX
+	MOVW BX, 16(DI)
+
+	ADDQ $4, DI // minmatch
+	ADDQ CX, DI
+
+	// shortcut complete, load next token
+	JMP loop
+
+lit_len_loop_pre:
+	// if lit_len > 0
+	CMPQ CX, $0
+	JEQ offset
+	CMPQ CX, $0xF
+	JNE copy_literal
+
+lit_len_loop:
+	// for src[si] == 0xFF
+	CMPB (SI), $0xFF
+	JNE lit_len_finalise
+
+	// bounds check src[si+1]
+	MOVQ SI, AX
+	ADDQ $1, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// lit_len += 0xFF
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP lit_len_loop
+
+lit_len_finalise:
+	// lit_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_literal:
+	// bounds check src and dst
+	MOVQ SI, AX
+	ADDQ CX, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	MOVQ DI, AX
+	ADDQ CX, AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// whats a good cut off to call memmove?
+	CMPQ CX, $16
+	JGT memmove_lit
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	// if len(src[si:]) < 16
+	MOVQ R9, AX
+	SUBQ SI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+
+	JMP finish_lit_copy
+
+memmove_lit:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	MOVB DX, 48(SP)
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVB 48(SP), DX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+finish_lit_copy:
+	ADDQ CX, SI
+	ADDQ CX, DI
+
+	CMPQ SI, R9
+	JGE end
+
+offset:
+	// CX := mLen
+	// free up DX to use for offset
+	MOVQ DX, CX
+
+	MOVQ SI, AX
+	ADDQ $2, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// offset
+	// DX := int(src[si]) | int(src[si+1])<<8
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	// 0 offset is invalid
+	CMPQ DX, $0
+	JEQ err_corrupt
+
+	ANDB $0xF, CX
+
+match_len_loop_pre:
+	// if mlen != 0xF
+	CMPB CX, $0xF
+	JNE copy_match
+
+match_len_loop:
+	// for src[si] == 0xFF
+	// match_len += 0xFF
+	CMPB (SI), $0xFF
+	JNE match_len_finalise
+
+	// bounds check src[si+1]
+	MOVQ SI, AX
+	ADDQ $1, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP match_len_loop
+
+match_len_finalise:
+	// match_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_match:
+	// mLen += minMatch
+	ADDQ $4, CX
+
+	// check we have match_len bytes left in dst
+	// di+match_len <= len(dst)
+	MOVQ DI, AX
+	ADDQ CX, AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// DX = offset
+	// CX = match_len
+	// BX = &dst + (di - offset)
+	MOVQ DI, BX
+	SUBQ DX, BX
+
+	// check BX is within dst
+	// if BX < &dst
+	CMPQ BX, R11
+	JLT err_short_buf
+
+	// if (di - offset) + match_len < di, source and destination do not overlap
+	MOVQ BX, AX
+	ADDQ CX, AX
+	CMPQ DI, AX
+	JGT copy_interior_match
+
+	// AX := len(dst[:di])
+	// MOVQ DI, AX
+	// SUBQ R11, AX
+
+	// copy 16 bytes at a time
+	// if di-offset < 16 copy 16-(di-offset) bytes to di
+	// then do the remaining
+
+copy_match_loop:
+	// for match_len > 0
+	// dst[di] = dst[i]
+	// di++
+	// i++
+	MOVB (BX), AX
+	MOVB AX, (DI)
+	INCQ DI
+	INCQ BX
+	DECQ CX
+
+	CMPQ CX, $0
+	JGT copy_match_loop
+
+	JMP loop
+
+copy_interior_match:
+	CMPQ CX, $16
+	JGT memmove_match
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_match
+
+	MOVOU (BX), X0
+	MOVOU X0, (DI)
+
+	ADDQ CX, DI
+	JMP loop
+
+memmove_match:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	ADDQ CX, DI
+	JMP loop
+
+err_corrupt:
+	MOVQ $-1, ret+48(FP)
+	RET
+
+err_short_buf:
+	MOVQ $-2, ret+48(FP)
+	RET
+
+end:
+	SUBQ R11, DI
+	MOVQ DI, ret+48(FP)
+	RET
diff --git a/vendor/github.com/pierrec/lz4/decode_other.go b/vendor/github.com/pierrec/lz4/decode_other.go
new file mode 100644
index 0000000..919888e
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/decode_other.go
@@ -0,0 +1,98 @@
+// +build !amd64 appengine !gc noasm
+
+package lz4
+
+func decodeBlock(dst, src []byte) (ret int) {
+	const hasError = -2
+	defer func() {
+		if recover() != nil {
+			ret = hasError
+		}
+	}()
+
+	var si, di int
+	for {
+		// Literals and match lengths (token).
+		b := int(src[si])
+		si++
+
+		// Literals.
+		if lLen := b >> 4; lLen > 0 {
+			switch {
+			case lLen < 0xF && si+16 < len(src):
+				// Shortcut 1
+				// if we have enough room in src and dst, and the literals length
+				// is small enough (0..14) then copy all 16 bytes, even if not all
+				// are part of the literals.
+				copy(dst[di:], src[si:si+16])
+				si += lLen
+				di += lLen
+				if mLen := b & 0xF; mLen < 0xF {
+					// Shortcut 2
+					// if the match length (4..18) fits within the literals, then copy
+					// all 18 bytes, even if not all are part of the literals.
+					mLen += 4
+					if offset := int(src[si]) | int(src[si+1])<<8; mLen <= offset {
+						i := di - offset
+						end := i + 18
+						if end > len(dst) {
+							// The remaining buffer may not hold 18 bytes.
+							// See https://github.com/pierrec/lz4/issues/51.
+							end = len(dst)
+						}
+						copy(dst[di:], dst[i:end])
+						si += 2
+						di += mLen
+						continue
+					}
+				}
+			case lLen == 0xF:
+				for src[si] == 0xFF {
+					lLen += 0xFF
+					si++
+				}
+				lLen += int(src[si])
+				si++
+				fallthrough
+			default:
+				copy(dst[di:di+lLen], src[si:si+lLen])
+				si += lLen
+				di += lLen
+			}
+		}
+		if si >= len(src) {
+			return di
+		}
+
+		offset := int(src[si]) | int(src[si+1])<<8
+		if offset == 0 {
+			return hasError
+		}
+		si += 2
+
+		// Match.
+		mLen := b & 0xF
+		if mLen == 0xF {
+			for src[si] == 0xFF {
+				mLen += 0xFF
+				si++
+			}
+			mLen += int(src[si])
+			si++
+		}
+		mLen += minMatch
+
+		// Copy the match.
+		expanded := dst[di-offset:]
+		if mLen > offset {
+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
+			bytesToCopy := offset * (mLen / offset)
+			for n := offset; n <= bytesToCopy+offset; n *= 2 {
+				copy(expanded[n:], expanded[:n])
+			}
+			di += bytesToCopy
+			mLen -= bytesToCopy
+		}
+		di += copy(dst[di:di+mLen], expanded[:mLen])
+	}
+}
diff --git a/vendor/github.com/pierrec/lz4/errors.go b/vendor/github.com/pierrec/lz4/errors.go
new file mode 100644
index 0000000..1c45d18
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/errors.go
@@ -0,0 +1,30 @@
+package lz4
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	rdebug "runtime/debug"
+)
+
+var (
+	// ErrInvalidSourceShortBuffer is returned by UncompressBlock or CompressBlock when a compressed
+	// block is corrupted or the destination buffer is not large enough for the uncompressed data.
+	ErrInvalidSourceShortBuffer = errors.New("lz4: invalid source or destination buffer too short")
+	// ErrInvalid is returned when reading an invalid LZ4 archive.
+	ErrInvalid = errors.New("lz4: bad magic number")
+	// ErrBlockDependency is returned when attempting to decompress an archive created with block dependency.
+	ErrBlockDependency = errors.New("lz4: block dependency not supported")
+	// ErrUnsupportedSeek is returned when attempting to Seek any way but forward from the current position.
+	ErrUnsupportedSeek = errors.New("lz4: can only seek forward from io.SeekCurrent")
+)
+
+func recoverBlock(e *error) {
+	if r := recover(); r != nil && *e == nil {
+		if debugFlag {
+			fmt.Fprintln(os.Stderr, r)
+			rdebug.PrintStack()
+		}
+		*e = ErrInvalidSourceShortBuffer
+	}
+}
diff --git a/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go b/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go
index 850a6fd..7a76a6b 100644
--- a/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go
+++ b/vendor/github.com/pierrec/lz4/internal/xxh32/xxh32zero.go
@@ -7,14 +7,15 @@
 )
 
 const (
-	prime32_1 uint32 = 2654435761
-	prime32_2 uint32 = 2246822519
-	prime32_3 uint32 = 3266489917
-	prime32_4 uint32 = 668265263
-	prime32_5 uint32 = 374761393
+	prime1 uint32 = 2654435761
+	prime2 uint32 = 2246822519
+	prime3 uint32 = 3266489917
+	prime4 uint32 = 668265263
+	prime5 uint32 = 374761393
 
-	prime32_1plus2 uint32 = 606290984
-	prime32_minus1 uint32 = 1640531535
+	primeMask   = 0xFFFFFFFF
+	prime1plus2 = uint32((uint64(prime1) + uint64(prime2)) & primeMask) // 606290984
+	prime1minus = uint32((-int64(prime1)) & primeMask)                  // 1640531535
 )
 
 // XXHZero represents an xxhash32 object with seed 0.
@@ -37,10 +38,10 @@
 
 // Reset resets the Hash to its initial state.
 func (xxh *XXHZero) Reset() {
-	xxh.v1 = prime32_1plus2
-	xxh.v2 = prime32_2
+	xxh.v1 = prime1plus2
+	xxh.v2 = prime2
 	xxh.v3 = 0
-	xxh.v4 = prime32_minus1
+	xxh.v4 = prime1minus
 	xxh.totalLen = 0
 	xxh.bufused = 0
 }
@@ -83,20 +84,20 @@
 
 		// fast rotl(13)
 		buf := xxh.buf[:16] // BCE hint.
-		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime32_2) * prime32_1
-		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime32_2) * prime32_1
-		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime32_2) * prime32_1
-		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime32_2) * prime32_1
+		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime2) * prime1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
 		p = r
 		xxh.bufused = 0
 	}
 
 	for n := n - 16; p <= n; p += 16 {
 		sub := input[p:][:16] //BCE hint for compiler
-		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime32_2) * prime32_1
-		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime32_2) * prime32_1
-		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime32_2) * prime32_1
-		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime32_2) * prime32_1
+		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
 	}
 	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
 
@@ -112,25 +113,25 @@
 	if h32 >= 16 {
 		h32 += rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
 	} else {
-		h32 += prime32_5
+		h32 += prime5
 	}
 
 	p := 0
 	n := xxh.bufused
 	buf := xxh.buf
 	for n := n - 4; p <= n; p += 4 {
-		h32 += binary.LittleEndian.Uint32(buf[p:p+4]) * prime32_3
-		h32 = rol17(h32) * prime32_4
+		h32 += binary.LittleEndian.Uint32(buf[p:p+4]) * prime3
+		h32 = rol17(h32) * prime4
 	}
 	for ; p < n; p++ {
-		h32 += uint32(buf[p]) * prime32_5
-		h32 = rol11(h32) * prime32_1
+		h32 += uint32(buf[p]) * prime5
+		h32 = rol11(h32) * prime1
 	}
 
 	h32 ^= h32 >> 15
-	h32 *= prime32_2
+	h32 *= prime2
 	h32 ^= h32 >> 13
-	h32 *= prime32_3
+	h32 *= prime3
 	h32 ^= h32 >> 16
 
 	return h32
@@ -142,19 +143,19 @@
 	h32 := uint32(n)
 
 	if n < 16 {
-		h32 += prime32_5
+		h32 += prime5
 	} else {
-		v1 := prime32_1plus2
-		v2 := prime32_2
+		v1 := prime1plus2
+		v2 := prime2
 		v3 := uint32(0)
-		v4 := prime32_minus1
+		v4 := prime1minus
 		p := 0
 		for n := n - 16; p <= n; p += 16 {
 			sub := input[p:][:16] //BCE hint for compiler
-			v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime32_2) * prime32_1
-			v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime32_2) * prime32_1
-			v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime32_2) * prime32_1
-			v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime32_2) * prime32_1
+			v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
+			v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
+			v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
+			v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
 		}
 		input = input[p:]
 		n -= p
@@ -163,19 +164,19 @@
 
 	p := 0
 	for n := n - 4; p <= n; p += 4 {
-		h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime32_3
-		h32 = rol17(h32) * prime32_4
+		h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime3
+		h32 = rol17(h32) * prime4
 	}
 	for p < n {
-		h32 += uint32(input[p]) * prime32_5
-		h32 = rol11(h32) * prime32_1
+		h32 += uint32(input[p]) * prime5
+		h32 = rol11(h32) * prime1
 		p++
 	}
 
 	h32 ^= h32 >> 15
-	h32 *= prime32_2
+	h32 *= prime2
 	h32 ^= h32 >> 13
-	h32 *= prime32_3
+	h32 *= prime3
 	h32 ^= h32 >> 16
 
 	return h32
@@ -183,12 +184,12 @@
 
 // Uint32Zero hashes x with seed 0.
 func Uint32Zero(x uint32) uint32 {
-	h := prime32_5 + 4 + x*prime32_3
-	h = rol17(h) * prime32_4
+	h := prime5 + 4 + x*prime3 // Same as the slice hash's n < 16 start with n = 4: length (4) + prime5, then one 4-byte lane.
+	h = rol17(h) * prime4
 	h ^= h >> 15
-	h *= prime32_2
+	h *= prime2 // Part of the final mixing steps, identical to those ending the slice hash.
 	h ^= h >> 13
-	h *= prime32_3
+	h *= prime3
 	h ^= h >> 16
 	return h
 }
diff --git a/vendor/github.com/pierrec/lz4/lz4.go b/vendor/github.com/pierrec/lz4/lz4.go
index 3580275..cdbf961 100644
--- a/vendor/github.com/pierrec/lz4/lz4.go
+++ b/vendor/github.com/pierrec/lz4/lz4.go
@@ -30,27 +30,25 @@
 	// hashLog determines the size of the hash table used to quickly find a previous match position.
 	// Its value influences the compression speed and memory usage, the lower the faster,
 	// but at the expense of the compression ratio.
-	// 16 seems to be the best compromise.
-	hashLog       = 16
-	hashTableSize = 1 << hashLog
-	hashShift     = uint((minMatch * 8) - hashLog)
+	// 16 seems to be the best compromise for fast compression.
+	hashLog = 16
+	htSize  = 1 << hashLog
 
-	mfLimit      = 8 + minMatch // The last match cannot start within the last 12 bytes.
-	skipStrength = 6            // variable step for fast scan
+	mfLimit = 8 + minMatch // The last match cannot start within the last 12 bytes.
 )
 
 // map the block max size id with its value in bytes: 64Kb, 256Kb, 1Mb and 4Mb.
-var (
-	bsMapID    = map[byte]int{4: 64 << 10, 5: 256 << 10, 6: 1 << 20, 7: 4 << 20}
-	bsMapValue = make(map[int]byte, len(bsMapID))
+const (
+	blockSize64K  = 64 << 10  // 65536 bytes; block max size ID 4 in bsMapID.
+	blockSize256K = 256 << 10 // 262144 bytes; block max size ID 5 in bsMapID.
+	blockSize1M   = 1 << 20   // 1048576 bytes; block max size ID 6 in bsMapID.
+	blockSize4M   = 4 << 20   // 4194304 bytes; block max size ID 7 in bsMapID.
+)
 
-// Reversed.
-func init() {
-	for i, v := range bsMapID {
-		bsMapValue[v] = i
-	}
-}
+var (
+	bsMapID    = map[byte]int{4: blockSize64K, 5: blockSize256K, 6: blockSize1M, 7: blockSize4M} // Block max size ID -> size in bytes.
+	bsMapValue = map[int]byte{blockSize64K: 4, blockSize256K: 5, blockSize1M: 6, blockSize4M: 7} // Size in bytes -> ID; written out by hand, keep in sync with bsMapID.
+)
 
 // Header describes the various flags that can be set on a Writer or obtained from a Reader.
 // The default values match those of the LZ4 frame format definition
diff --git a/vendor/github.com/pierrec/lz4/reader.go b/vendor/github.com/pierrec/lz4/reader.go
index f08db47..126b792 100644
--- a/vendor/github.com/pierrec/lz4/reader.go
+++ b/vendor/github.com/pierrec/lz4/reader.go
@@ -14,6 +14,9 @@
 // The Header may change between Read() calls in case of concatenated frames.
 type Reader struct {
 	Header
+	// Handler called when a block has been successfully read.
+	// It provides the number of bytes read.
+	OnBlockDone func(size int)
 
 	buf      [8]byte       // Scrap buffer.
 	pos      int64         // Current position in src.
@@ -22,6 +25,8 @@
 	data     []byte        // Uncompressed data.
 	idx      int           // Index of unread bytes into data.
 	checksum xxh32.XXHZero // Frame hash.
+	skip     int64         // Bytes to skip before next read.
+	dpos     int64         // Position in dest
 }
 
 // NewReader returns a new LZ4 frame decoder.
@@ -76,7 +81,7 @@
 		return fmt.Errorf("lz4: invalid version: got %d; expected %d", v, Version)
 	}
 	if b>>5&1 == 0 {
-		return fmt.Errorf("lz4: block dependency not supported")
+		return ErrBlockDependency
 	}
 	z.BlockChecksum = b>>4&1 > 0
 	frameSize := b>>3&1 > 0
@@ -101,7 +106,7 @@
 	z.data = z.zdata[:cap(z.zdata)][bSize:]
 	z.idx = len(z.data)
 
-	z.checksum.Write(buf[0:2])
+	_, _ = z.checksum.Write(buf[0:2])
 
 	if frameSize {
 		buf := buf[:8]
@@ -110,7 +115,7 @@
 		}
 		z.Size = binary.LittleEndian.Uint64(buf)
 		z.pos += 8
-		z.checksum.Write(buf)
+		_, _ = z.checksum.Write(buf)
 	}
 
 	// Header checksum.
@@ -158,6 +163,9 @@
 		if debugFlag {
 			debug("reading block from writer")
 		}
+		// Reset uncompressed buffer
+		z.data = z.zdata[:cap(z.zdata)][len(z.zdata):]
+
 		// Block length: 0 = end of frame, highest bit set: uncompressed.
 		bLen, err := z.readUint32()
 		if err != nil {
@@ -208,6 +216,9 @@
 				return 0, err
 			}
 			z.pos += int64(bLen)
+			if z.OnBlockDone != nil {
+				z.OnBlockDone(int(bLen))
+			}
 
 			if z.BlockChecksum {
 				checksum, err := z.readUint32()
@@ -252,10 +263,13 @@
 				return 0, err
 			}
 			z.data = z.data[:n]
+			if z.OnBlockDone != nil {
+				z.OnBlockDone(n)
+			}
 		}
 
 		if !z.NoChecksum {
-			z.checksum.Write(z.data)
+			_, _ = z.checksum.Write(z.data)
 			if debugFlag {
 				debug("current frame checksum %x", z.checksum.Sum32())
 			}
@@ -263,8 +277,20 @@
 		z.idx = 0
 	}
 
+	if z.skip > int64(len(z.data[z.idx:])) {
+		z.skip -= int64(len(z.data[z.idx:]))
+		z.dpos += int64(len(z.data[z.idx:]))
+		z.idx = len(z.data)
+		return 0, nil
+	}
+
+	z.idx += int(z.skip)
+	z.dpos += z.skip
+	z.skip = 0
+
 	n := copy(buf, z.data[z.idx:])
 	z.idx += n
+	z.dpos += int64(n)
 	if debugFlag {
 		debug("copied %d bytes to input", n)
 	}
@@ -272,6 +298,20 @@
 	return n, nil
 }
 
+// Seek implements io.Seeker, but supports seeking forward from the current
+// position only (offset >= 0 with io.SeekCurrent); any other seek returns
+// ErrUnsupportedSeek and changes nothing. It allows skipping output bytes
+// which aren't needed, which in some scenarios is faster than reading and
+// discarding them. The skip is applied by later calls to Read(), which may
+// read 0 bytes if all of the data they would have returned is skipped.
+func (z *Reader) Seek(offset int64, whence int) (int64, error) {
+	if offset < 0 || whence != io.SeekCurrent {
+		return z.dpos + z.skip, ErrUnsupportedSeek // Position is still reported on error.
+	}
+	z.skip += offset // Only recorded here; Read consumes z.skip before copying data out.
+	return z.dpos + z.skip, nil // Uncompressed position once the pending skip is applied.
+}
+
 // Reset discards the Reader's state and makes it equivalent to the
 // result of its original state from NewReader, but reading from r instead.
 // This permits reusing a Reader rather than allocating a new one.
diff --git a/vendor/github.com/pierrec/lz4/writer.go b/vendor/github.com/pierrec/lz4/writer.go
index 0120438..2cc8d95 100644
--- a/vendor/github.com/pierrec/lz4/writer.go
+++ b/vendor/github.com/pierrec/lz4/writer.go
@@ -11,6 +11,9 @@
 // Writer implements the LZ4 frame encoder.
 type Writer struct {
 	Header
+	// Handler called when a block has been successfully written out.
+	// It provides the number of bytes written.
+	OnBlockDone func(size int)
 
 	buf       [19]byte      // magic number(4) + header(flags(2)+[Size(8)+DictID(4)]+checksum(1)) does not exceed 19 bytes
 	dst       io.Writer     // Destination.
@@ -43,11 +46,13 @@
 	}
 	// Allocate the compressed/uncompressed buffers.
 	// The compressed buffer cannot exceed the uncompressed one.
-	if n := 2 * bSize; cap(z.zdata) < n {
-		z.zdata = make([]byte, n, n)
+	if cap(z.zdata) < bSize {
+		// Only allocate if there is not enough capacity.
+		// Allocate both buffers at once.
+		z.zdata = make([]byte, 2*bSize)
 	}
-	z.zdata = z.zdata[:bSize]
-	z.data = z.zdata[:cap(z.zdata)][bSize:]
+	z.data = z.zdata[:bSize]                 // Uncompressed buffer is the first half.
+	z.zdata = z.zdata[:cap(z.zdata)][bSize:] // Compressed buffer is the second half.
 	z.idx = 0
 
 	// Size is optional.
@@ -182,24 +187,26 @@
 	if err := z.writeUint32(bLen); err != nil {
 		return err
 	}
-	if _, err := z.dst.Write(zdata); err != nil {
+	written, err := z.dst.Write(zdata)
+	if err != nil {
 		return err
 	}
+	if h := z.OnBlockDone; h != nil {
+		h(written)
+	}
 
-	if z.BlockChecksum {
-		checksum := xxh32.ChecksumZero(zdata)
+	if !z.BlockChecksum {
 		if debugFlag {
-			debug("block checksum %x", checksum)
+			debug("current frame checksum %x", z.checksum.Sum32())
 		}
-		if err := z.writeUint32(checksum); err != nil {
-			return err
-		}
+		return nil
 	}
+	checksum := xxh32.ChecksumZero(zdata)
 	if debugFlag {
-		debug("current frame checksum %x", z.checksum.Sum32())
+		debug("block checksum %x", checksum)
+		defer func() { debug("current frame checksum %x", z.checksum.Sum32()) }()
 	}
-
-	return nil
+	return z.writeUint32(checksum)
 }
 
 // Flush flushes any pending compressed data to the underlying writer.
@@ -213,7 +220,11 @@
 		return nil
 	}
 
-	return z.compressBlock(z.data[:z.idx])
+	if err := z.compressBlock(z.data[:z.idx]); err != nil {
+		return err
+	}
+	z.idx = 0
+	return nil
 }
 
 // Close closes the Writer, flushing any unwritten data to the underlying io.Writer, but does not close the underlying io.Writer.
@@ -223,7 +234,6 @@
 			return err
 		}
 	}
-
 	if err := z.Flush(); err != nil {
 		return err
 	}
@@ -234,16 +244,14 @@
 	if err := z.writeUint32(0); err != nil {
 		return err
 	}
-	if !z.NoChecksum {
-		checksum := z.checksum.Sum32()
-		if debugFlag {
-			debug("stream checksum %x", checksum)
-		}
-		if err := z.writeUint32(checksum); err != nil {
-			return err
-		}
+	if z.NoChecksum {
+		return nil
 	}
-	return nil
+	checksum := z.checksum.Sum32()
+	if debugFlag {
+		debug("stream checksum %x", checksum)
+	}
+	return z.writeUint32(checksum)
 }
 
 // Reset clears the state of the Writer z such that it is equivalent to its